In [2]:
%matplotlib inline

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import datetime
import re
import math

In [4]:
# Location of the competition CSV files. The directory is spelled out once so
# that pointing the notebook at another machine means editing a single line.
DATA_DIR = "C:\\Users\\AdityaS\\Desktop\\data_mining_competition"
train_file = DATA_DIR + "\\train_data.csv"
test_file = DATA_DIR + "\\test.csv"

In [5]:
def straight_dist(x_start, x_end, y_start, y_end):
    """Return the Euclidean (straight-line) distance from start to end.

    Only arithmetic operators and ``np.sqrt`` are used, so the inputs may be
    scalars or NumPy arrays of matching shape.
    """
    delta_x = x_end - x_start
    delta_y = y_end - y_start
    squared_length = delta_x ** 2 + delta_y ** 2
    return np.sqrt(squared_length)

def calc_azt(x_start, x_end, y_start, y_end):
    """Return the direction of the start-to-end vector, in degrees.

    The value is the ``atan2`` angle of ``(y_end - y_start, x_end - x_start)``:
    it is measured from the positive x axis, counter-clockwise positive, and
    lies in the range [-180, 180]. It is therefore a mathematical angle, not
    a compass bearing.

    NumPy's ``arctan2``/``degrees`` are used instead of the ``math`` versions
    so the function accepts scalars as well as arrays or pandas Series (the
    ``math`` functions raise ``TypeError`` on array input). Scalar calls
    return a NumPy float, which is a subclass of ``float``.
    """
    return np.degrees(np.arctan2(y_end - y_start, x_end - x_start))

def coordinates_bin(coor, bin_width=50, offset=21):
    """Map a coordinate to the integer index of the grid cell that contains it.

    The coordinate is floor-divided by ``bin_width`` and the result is shifted
    by ``offset``. Works on scalars and on NumPy arrays.

    Parameters
    ----------
    coor : number or array
        Coordinate value(s) to bin.
    bin_width : number, optional
        Width of one grid cell, in the same units as ``coor`` (default 50).
    offset : int, optional
        Constant added to every bin index (default 21).
        NOTE(review): presumably chosen so the indices are non-negative for
        this data set -- confirm against the coordinate range.
    """
    return coor // bin_width + offset

def convert_ts_to_datetime(ts):
    """Parse a ``'YYYY-MM-DD HH:MM:SS'`` string into a ``datetime.datetime``.

    ``strptime`` raises ``ValueError`` when ``ts`` does not match that format.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    parsed = datetime.datetime.strptime(ts, timestamp_format)
    return parsed

def get_weekday(ts):
    """Return the day of the week of timestamp string ``ts`` (Monday=0 ... Sunday=6)."""
    return convert_ts_to_datetime(ts).weekday()

def is_weekend(ts):
    """Return True when timestamp string ``ts`` falls on a Saturday or Sunday."""
    # datetime.weekday() numbers Monday as 0, so Saturday is 5 and Sunday is 6.
    return convert_ts_to_datetime(ts).weekday() in (5, 6)

def get_day(ts):
    """Return the day of the month (1-31) of timestamp string ``ts``."""
    return convert_ts_to_datetime(ts).day

def get_month(ts):
    """Return the month (1-12) of timestamp string ``ts``."""
    return convert_ts_to_datetime(ts).month

def get_year(ts):
    """Return the four-digit year of timestamp string ``ts``."""
    return convert_ts_to_datetime(ts).year

def get_hour(ts):
    """Return the hour of the day (0-23) of timestamp string ``ts``."""
    return convert_ts_to_datetime(ts).hour

def get_minute(ts):
    """Return the ten-minute slot (0-5) within the hour of timestamp string ``ts``.

    Despite the name, this is not the raw minute: the minute is floor-divided
    by 10, so 00-09 -> 0, 10-19 -> 1, ..., 50-59 -> 5.
    """
    minute_of_hour = convert_ts_to_datetime(ts).minute
    return minute_of_hour // 10

def time_classification(ts):
    """Return a coarse time-of-day label for timestamp string ``ts``.

    Hours map to labels as follows: 0-5 "Midnight", 6-8 "Morning",
    9-11 "Noon", 12-18 "Afternoon", 19-20 "Night", 21-23 "LateNight".
    """
    hour = get_hour(ts)
    # (last hour included in the band, label), in ascending order; the first
    # band whose upper bound covers the hour wins.
    bands = (
        (5, "Midnight"),
        (8, "Morning"),
        (11, "Noon"),
        (18, "Afternoon"),
        (20, "Night"),
    )
    for last_hour, label in bands:
        if hour <= last_hour:
            return label
    return "LateNight"

def taxi_id_bucket(taxi_id, bucket_size=10):
    """Group taxi ids into consecutive buckets of ``bucket_size`` ids each.

    Parameters
    ----------
    taxi_id : int or integer array
        Taxi identifier(s).
    bucket_size : int, optional
        Number of consecutive ids that share one bucket (default 10, i.e.
        ids 0-9 -> bucket 0, 10-19 -> bucket 1, ...).

    Returns
    -------
    The bucket index, ``taxi_id // bucket_size``.
    """
    return taxi_id // bucket_size

In [6]:
# Element-wise wrappers around the scalar feature helpers, so that each helper
# can be applied to a whole DataFrame column (or several aligned columns) in
# one call.
# Note: per the NumPy documentation, np.vectorize is provided for convenience,
# not performance -- it is essentially a Python-level loop over the elements.
vec_straight_dist = np.vectorize(straight_dist)
vec_calc_azt = np.vectorize(calc_azt)
vec_coordinates_bin = np.vectorize(coordinates_bin)
vec_get_weekday = np.vectorize(get_weekday)
vec_is_weekend = np.vectorize(is_weekend)
vec_get_day = np.vectorize(get_day)
vec_get_month = np.vectorize(get_month)
vec_get_year = np.vectorize(get_year)
vec_get_hour = np.vectorize(get_hour)
vec_get_minute = np.vectorize(get_minute)
vec_time_classification = np.vectorize(time_classification)
vec_taxi_id_bucket = np.vectorize(taxi_id_bucket)

In [7]:
# Load the raw training and test tables from the CSV paths configured earlier
# in the notebook (train_file / test_file).
df_train = pd.read_csv(train_file)
df_test = pd.read_csv(test_file)

In [12]:
def preprocessing_train(df_train):
    """Return a new training frame with the engineered feature columns added.

    The raw trajectory columns are dropped first (which also yields a new
    frame, so the caller's DataFrame is not modified). The following columns
    are then appended, in this order: log targets, log straight-line distance,
    travel angle, coordinate bins, calendar/time-of-day features derived from
    ``TIMESTAMP``, and the taxi-id bucket.
    """
    features = df_train.drop(['X_TRAJECTORY','Y_TRAJECTORY'], axis = 1)

    # Log-transformed regression targets.
    for log_column, raw_column in (('LOG_DURATION', 'DURATION'),
                                   ('LOG_TRAJ_LENGTH', 'TRAJ_LENGTH')):
        features[log_column] = np.log(features[raw_column])

    # Geometry of the trip: both helpers take (x_start, x_end, y_start, y_end).
    endpoints = (features['X_START'], features['X_END'],
                 features['Y_START'], features['Y_END'])
    features['LOG_STRAIGHT_DIST'] = np.log(vec_straight_dist(*endpoints))
    features['AZT'] = vec_calc_azt(*endpoints)

    # Grid-cell index of each start/end coordinate.
    for bin_column, raw_column in (('X_START_BIN', 'X_START'),
                                   ('X_END_BIN', 'X_END'),
                                   ('Y_START_BIN', 'Y_START'),
                                   ('Y_END_BIN', 'Y_END')):
        features[bin_column] = vec_coordinates_bin(features[raw_column])

    # Calendar and time-of-day features, all derived from the TIMESTAMP string.
    timestamp_extractors = (
        ('WEEKDAY', vec_get_weekday),
        ('ISWEEKEND', vec_is_weekend),
        ('DAY', vec_get_day),
        ('MONTH', vec_get_month),
        ('YEAR', vec_get_year),
        ('HOUR', vec_get_hour),
        ('MINUTE', vec_get_minute),
        ('TIME_CLASS', vec_time_classification),
    )
    for column, extractor in timestamp_extractors:
        features[column] = extractor(features['TIMESTAMP'])

    features['TAXI_ID_BIN'] = vec_taxi_id_bucket(features['TAXI_ID'])
    return features

def postprocessing_train(df_train):
    """Split a feature-engineered training frame into targets and model inputs.

    Returns a 3-tuple ``(log_duration, log_traj_length, features)`` where the
    first two items are the ``LOG_DURATION`` and ``LOG_TRAJ_LENGTH`` columns
    as arrays (``.values``) and ``features`` is a new frame without the
    identifier, raw-coordinate, timestamp and target columns. The input frame
    itself is left unchanged.
    """
    non_feature_columns = ['ID','TIMESTAMP','DURATION', 'TAXI_ID',
                           'X_START', 'Y_START', 'X_END', 'Y_END',
                           'TRAJ_LENGTH', 'LOG_DURATION', 'LOG_TRAJ_LENGTH']
    duration_target, length_target = (
        df_train[target].values for target in ('LOG_DURATION', 'LOG_TRAJ_LENGTH'))
    model_inputs = df_train.drop(non_feature_columns, axis = 1)
    return duration_target, length_target, model_inputs

def preprocessing_test(df_test):
    """Return the engineered model-input columns for the test frame.

    The same features as in ``preprocessing_train`` are computed, except the
    log targets (the columns they are derived from are not read here). The
    identifier, timestamp and raw-coordinate columns are not part of the
    result.

    The result is built on a new frame, so the caller's DataFrame is left
    untouched. (Previously the 15 feature columns were written into the
    caller's frame as a side effect, unlike ``preprocessing_train``.)
    """
    # Dropping first yields a new frame to write into; the raw columns are
    # still read from the untouched input below. The resulting column order
    # is the same as before: remaining input columns, then the features.
    features = df_test.drop(['ID','TIMESTAMP', 'TAXI_ID',
                             'X_START', 'Y_START', 'X_END', 'Y_END'], axis = 1)
    features['LOG_STRAIGHT_DIST'] = np.log(vec_straight_dist(df_test['X_START'], df_test['X_END'], df_test['Y_START'], df_test['Y_END']))
    features['AZT'] = vec_calc_azt(df_test['X_START'], df_test['X_END'], df_test['Y_START'], df_test['Y_END'])
    features['X_START_BIN'] = vec_coordinates_bin(df_test['X_START'])
    features['X_END_BIN'] = vec_coordinates_bin(df_test['X_END'])
    features['Y_START_BIN'] = vec_coordinates_bin(df_test['Y_START'])
    features['Y_END_BIN'] = vec_coordinates_bin(df_test['Y_END'])
    features['WEEKDAY'] = vec_get_weekday(df_test['TIMESTAMP'])
    features['ISWEEKEND'] = vec_is_weekend(df_test['TIMESTAMP'])
    features['DAY'] = vec_get_day(df_test['TIMESTAMP'])
    features['MONTH'] = vec_get_month(df_test['TIMESTAMP'])
    features['YEAR'] = vec_get_year(df_test['TIMESTAMP'])
    features['HOUR'] = vec_get_hour(df_test['TIMESTAMP'])
    features['MINUTE'] = vec_get_minute(df_test['TIMESTAMP'])
    features['TIME_CLASS'] = vec_time_classification(df_test['TIMESTAMP'])
    features['TAXI_ID_BIN'] = vec_taxi_id_bucket(df_test['TAXI_ID'])
    return features

In [13]:
def dummify(df):
    """One-hot encode every categorical feature column of ``df``.

    Each listed column is replaced by one indicator column per distinct value,
    named ``<COLUMN>_<value>``; all other columns are passed through as-is.
    """
    categorical_columns = [
        'X_START_BIN', 'X_END_BIN', 'Y_START_BIN', 'Y_END_BIN',
        'WEEKDAY', 'ISWEEKEND', 'DAY', 'MONTH', 'YEAR',
        'HOUR', 'MINUTE', 'TIME_CLASS', 'TAXI_ID_BIN',
    ]
    encoded = pd.get_dummies(df, columns=categorical_columns,
                             prefix=categorical_columns)
    return encoded

In [14]:
# Stage 1: add the engineered features to the raw training frame.
processed_train_df = preprocessing_train(df_train)
# Stage 2: pull out the two log targets and keep only the model inputs. Note
# that processed_train_df is rebound here from the stage-1 frame to the
# inputs-only frame.
log_duration, log_traj_length, processed_train_df = postprocessing_train(processed_train_df)
# The test frame goes through a single feature-engineering pass.
processed_test_df = preprocessing_test(df_test)

In [15]:
# Sanity check: train and test must expose the same feature columns before
# they are stacked for one-hot encoding.
# The fourth line previously printed the *train* shape a second time, so the
# test shape was never shown.
# The single-argument call form of print behaves the same on Python 2 and 3.
print(processed_train_df.columns)
print(processed_test_df.columns)
print(processed_train_df.shape)
print(processed_test_df.shape)

Index([u'LOG_STRAIGHT_DIST', u'AZT', u'X_START_BIN', u'X_END_BIN',
       u'Y_START_BIN', u'Y_END_BIN', u'WEEKDAY', u'ISWEEKEND', u'DAY',
       u'MONTH', u'YEAR', u'HOUR', u'MINUTE', u'TIME_CLASS', u'TAXI_ID_BIN'],
      dtype='object')
Index([u'LOG_STRAIGHT_DIST', u'AZT', u'X_START_BIN', u'X_END_BIN',
       u'Y_START_BIN', u'Y_END_BIN', u'WEEKDAY', u'ISWEEKEND', u'DAY',
       u'MONTH', u'YEAR', u'HOUR', u'MINUTE', u'TIME_CLASS', u'TAXI_ID_BIN'],
      dtype='object')
(465172, 15)
(465172, 15)


In [16]:
# Stack train on top of test so that both receive exactly the same set of
# one-hot columns. The original row labels are kept (so they repeat); the
# later split back into train/test is positional (iloc) and is unaffected.
concat_df = pd.concat((processed_train_df, processed_test_df))
concat_df = dummify(concat_df)
# Preview the first rows with the columns in alphabetical order. This is
# display-only: the result is not assigned, so concat_df keeps its own column
# order. reindex(columns=...) replaces the deprecated reindex_axis(..., axis=1),
# and head() keeps the preview (and the reindex work) small.
concat_df.head(10).reindex(columns=sorted(concat_df.columns))

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,AZT,DAY_1,DAY_10,DAY_11,DAY_12,DAY_13,DAY_14,DAY_15,DAY_16,DAY_17,...,Y_END_BIN_22,Y_START_BIN_12,Y_START_BIN_13,Y_START_BIN_14,Y_START_BIN_15,Y_START_BIN_16,Y_START_BIN_17,Y_START_BIN_18,Y_START_BIN_19,Y_START_BIN_20
0,72.342272,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,-107.487997,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,104.620874,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,122.471192,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,-54.223922,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,-7.298639,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6,7.666804,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7,121.108809,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,-87.397438,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
9,14.470294,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [17]:
# Alphabetical listing of every column after one-hot encoding.
column_names = list(concat_df.columns)
column_names.sort()
column_names

['AZT',
 'DAY_1',
 'DAY_10',
 'DAY_11',
 'DAY_12',
 'DAY_13',
 'DAY_14',
 'DAY_15',
 'DAY_16',
 'DAY_17',
 'DAY_18',
 'DAY_19',
 'DAY_2',
 'DAY_20',
 'DAY_21',
 'DAY_22',
 'DAY_23',
 'DAY_24',
 'DAY_25',
 'DAY_26',
 'DAY_27',
 'DAY_28',
 'DAY_29',
 'DAY_3',
 'DAY_30',
 'DAY_31',
 'DAY_4',
 'DAY_5',
 'DAY_6',
 'DAY_7',
 'DAY_8',
 'DAY_9',
 'HOUR_0',
 'HOUR_1',
 'HOUR_10',
 'HOUR_11',
 'HOUR_12',
 'HOUR_13',
 'HOUR_14',
 'HOUR_15',
 'HOUR_16',
 'HOUR_17',
 'HOUR_18',
 'HOUR_19',
 'HOUR_2',
 'HOUR_20',
 'HOUR_21',
 'HOUR_22',
 'HOUR_23',
 'HOUR_3',
 'HOUR_4',
 'HOUR_5',
 'HOUR_6',
 'HOUR_7',
 'HOUR_8',
 'HOUR_9',
 'ISWEEKEND_False',
 'ISWEEKEND_True',
 'LOG_STRAIGHT_DIST',
 'MINUTE_0',
 'MINUTE_1',
 'MINUTE_2',
 'MINUTE_3',
 'MINUTE_4',
 'MINUTE_5',
 'MONTH_1',
 'MONTH_10',
 'MONTH_11',
 'MONTH_12',
 'MONTH_2',
 'MONTH_3',
 'MONTH_4',
 'MONTH_5',
 'MONTH_6',
 'MONTH_7',
 'MONTH_8',
 'MONTH_9',
 'TAXI_ID_BIN_0',
 'TAXI_ID_BIN_1',
 'TAXI_ID_BIN_10',
 'TAXI_ID_BIN_11',
 'TAXI_ID_BIN_12',
 'T

In [18]:
# Undo the stacking: the first len(processed_train_df) rows are the training
# rows, everything after them is the test set. The split is positional.
dum_train_df = concat_df.iloc[:len(processed_train_df)]
dum_test_df = concat_df.iloc[len(processed_train_df):]

In [19]:
# Shapes of the one-hot encoded train/test matrices and of the two target
# vectors; the row counts of dum_train_df and the targets must agree.
print(dum_train_df.shape)
print(dum_test_df.shape)
print(log_duration.shape)
print(log_traj_length.shape)

(465172, 219)
(465172, 219)
(465172L,)
(465172L,)


In [27]:
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, ElasticNetCV, LassoCV
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

# Hold out 20% of the training rows for validation. The seed is fixed so the
# split -- and therefore the error reported further down -- is reproducible
# across kernel restarts. Outputs recorded before the seed was fixed came from
# a different (random) split.
X_train, X_test, y_train, y_test = train_test_split(
    dum_train_df, log_duration, test_size=0.20, random_state=0)

In [21]:
# Shapes of the fitting and hold-out partitions (features, then targets).
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(372137, 219)
(93035, 219)
(372137L,)
(93035L,)


In [22]:
# Elastic-net linear regression with scikit-learn's default hyper-parameters
# (only random_state is set explicitly).
# NOTE(review): the features are not scaled and alpha is left at its default;
# consider tuning (ElasticNetCV is already imported) -- confirm intent.
model_elastic = ElasticNet(random_state=0)

In [23]:
# Fit on the training partition; the target (y_train) is log-duration.
model_elastic.fit(X_train, y_train)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=0, selection='cyclic', tol=0.0001, warm_start=False)

In [24]:
# Predicted log-duration for the hold-out partition.
y_pred = model_elastic.predict(X_test)

In [25]:
# Mean squared error on the hold-out partition. Both arrays are log-durations,
# so the value is in squared log units, not in the original duration units.
mean_squared_error(y_test, y_pred)

0.1456805453232119

In [None]:
for x in 