In [7]:
%matplotlib inline

In [99]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import datetime
import re
import math

In [3]:
train_file = "/home/adityasidharta/Desktop/nus_course/data_mining/competition/dataset/train_data.csv"
test_file = "/home/adityasidharta/Desktop/nus_course/data_mining/competition/dataset/test.csv"

In [161]:
def straight_dist(x_start, x_end, y_start, y_end):
    return np.sqrt((x_end - x_start)**2 + (y_end - y_start)**2)

def calc_azt(x_start, x_end, y_start, y_end ):
    return math.degrees(math.atan2(y_end - y_start, x_end - x_start)) + 18

def coordinates_bin(coor):
    return coor // 25 + 21

def convert_ts_to_datetime(ts):
    return datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S')

def get_weekday(ts):
    dt = convert_ts_to_datetime(ts)
    return dt.weekday()

def is_weekend(ts):
    dt = convert_ts_to_datetime(ts)
    return dt.weekday() >= 5

def get_day(ts):
    dt = convert_ts_to_datetime(ts)
    return dt.day

def get_month(ts):
    dt = convert_ts_to_datetime(ts)
    return dt.month

def get_year(ts):
    dt = convert_ts_to_datetime(ts)
    return dt.year

def get_hour(ts):
    dt = convert_ts_to_datetime(ts)
    return dt.hour

def get_minute(ts):
    dt = convert_ts_to_datetime(ts)
    return dt.minute

def time_classification(ts):
    hour = get_hour(ts)
    if hour <= 5:
        return "Midnight"
    if hour <= 8
        return "Morning"
    if hour <= 11:
        return "Noon"
    if hour <= 18:
        return "Afternoon"
    if hour <= 20:
        return "Night"
    else:
        return "LateNight"

def taxi_id_bucket(taxi_id):
    return taxi_id // 10

SyntaxError: invalid syntax (<ipython-input-161-04d37c5a1977>, line 45)

In [162]:
vec_straight_dist = np.vectorize(straight_dist)
vec_calc_azt = np.vectorize(calc_azt)
vec_coordinates_bin = np.vectorize(coordinates_bin)
vec_get_weekday = np.vectorize(get_weekday)
vec_is_weekend = np.vectorize(is_weekend)
vec_get_day = np.vectorize(get_day)
vec_get_month = np.vectorize(get_month)
vec_get_year = np.vectorize(get_year)
vec_get_hour = np.vectorize(get_hour)
vec_get_minute = np.vectorize(get_minute)
vec_time_classification = np.vectorize(time_classification)
vec_taxi_id_bucket = np.vectorize(taxi_id_bucket)

In [163]:
train_data.columns

Index([u'ID', u'TAXI_ID', u'TIMESTAMP', u'DURATION', u'X_START', u'Y_START',
       u'X_END', u'Y_END', u'X_TRAJECTORY', u'Y_TRAJECTORY', u'TRAJ_LENGTH'],
      dtype='object')

In [164]:
df_train = pd.read_csv(train_file)
df_test = pd.read_csv(test_file)

In [165]:
def preprocessing_train(df_train):
    df_train = df_train.drop(['X_TRAJECTORY','Y_TRAJECTORY'], axis = 1)
    df_train['LOG_DURATION'] = np.log(df_train['DURATION'])
    df_train['LOG_TRAJ_LENGTH'] = np.log(df_train['TRAJ_LENGTH'])
    df_train['LOG_STRAIGHT_DIST'] = np.log(vec_straight_dist(df_train['X_START'], df_train['X_END'], df_train['Y_START'], df_train['Y_END']))
    df_train['AZT'] = vec_calc_azt(df_train['X_START'], df_train['X_END'], df_train['Y_START'], df_train['Y_END'])
    df_train['X_START_BIN'] = vec_coordinates_bin(df_train['X_START'])
    df_train['X_END_BIN'] = vec_coordinates_bin(df_train['X_END'])
    df_train['Y_START_BIN'] = vec_coordinates_bin(df_train['Y_START'])
    df_train['Y_END_BIN'] = vec_coordinates_bin(df_train['Y_END'])
    df_train['WEEKDAY'] = vec_get_weekday(df_train['TIMESTAMP'])
    df_train['ISWEEKEND'] = vec_is_weekend(df_train['TIMESTAMP'])
    df_train['DAY'] = vec_get_day(df_train['TIMESTAMP'])
    df_train['MONTH'] = vec_get_month(df_train['TIMESTAMP'])
    df_train['YEAR'] = vec_get_year(df_train['TIMESTAMP'])
    df_train['HOUR'] = vec_get_hour(df_train['TIMESTAMP'])
    df_train['MINUTE'] = vec_get_minute(df_train['TIMESTAMP'])
    df_train['TIME_CLASS'] = vec_time_classification(df_train['TIMESTAMP'])
    df_train['TAXI_ID_BIN'] = vec_taxi_id_bucket(df_train['TAXI_ID'])
    return df_train

def preprocessing_test(df_test):
    df_test['LOG_STRAIGHT_DIST'] = np.log(vec_straight_dist(df_test['X_START'], df_test['X_END'], df_test['Y_START'], df_test['Y_END']))
    df_test['AZT'] = vec_calc_azt(df_test['X_START'], df_test['X_END'], df_test['Y_START'], df_test['Y_END'])
    df_test['X_START_BIN'] = vec_coordinates_bin(df_test['X_START'])
    df_test['X_END_BIN'] = vec_coordinates_bin(df_test['X_END'])
    df_test['Y_START_BIN'] = vec_coordinates_bin(df_test['Y_START'])
    df_test['Y_END_BIN'] = vec_coordinates_bin(df_test['Y_END'])
    df_test['WEEKDAY'] = vec_get_weekday(df_test['TIMESTAMP'])
    df_test['ISWEEKEND'] = vec_is_weekend(df_test['TIMESTAMP'])
    df_test['DAY'] = vec_get_day(df_test['TIMESTAMP'])
    df_test['MONTH'] = vec_get_month(df_test['TIMESTAMP'])
    df_test['YEAR'] = vec_get_year(df_test['TIMESTAMP'])
    df_test['HOUR'] = vec_get_hour(df_test['TIMESTAMP'])
    df_test['MINUTE'] = vec_get_minute(df_test['TIMESTAMP'])
    df_test['TIME_CLASS'] = vec_time_classification(df_test['TIMESTAMP'])
    df_test['TAXI_ID_BIN'] = vec_taxi_id_bucket(df_test['TAXI_ID'])
    return df_test

In [166]:
def dummify(df):
    columns = ['TAXI_ID', 'X_START_BIN', 'X_END_BIN', 'Y_START_BIN', 'Y_END_BIN', 'WEEKDAY', 'ISWEEKEND', 'DAY', 'MONTH', 'YEAR',
              'HOUR', 'MINUTE', 'TIME_CLASS', 'TAXI_ID_BIN']
    pd.get_dummies(df, columns=columns, prefix=columns)

In [167]:
processed_train_df = preprocessing_train(df_train)
dummy_train_df = dummify(processed_train_df)

MemoryError: 