In [26]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LinearRegression # Our ML model
from sklearn.preprocessing import LabelEncoder # Preprocess to get float
import numpy as np # Numpy
from geopy.distance import vincenty # To calculate distance
from ast import literal_eval # This is to convert string representation of array to actual array

In [116]:
# Load the training set from the relative ../input directory into `df`.
df = pd.read_csv('../input/train.csv')

### Analyzing data

What columns/features do we have?

In [11]:
# Show the column labels of the loaded frame.
df.columns

Index(['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE'],
      dtype='object')

Which features/columns have missing values?

In [12]:
# Column labels for which at least one row holds a null value.
df.columns[df.isnull().any(axis=0)]

Index(['ORIGIN_CALL', 'ORIGIN_STAND'], dtype='object')

Non-null count per feature/column

In [13]:
# Number of non-null entries in each column.
df.count()

TRIP_ID         1710670
CALL_TYPE       1710670
ORIGIN_CALL      364770
ORIGIN_STAND     806579
TAXI_ID         1710670
TIMESTAMP       1710670
DAY_TYPE        1710670
MISSING_DATA    1710670
POLYLINE        1710670
dtype: int64

How many trips are flagged as having missing data?

In [15]:
# Count the rows flagged as MISSING_DATA. Series.sum() is vectorised, whereas
# the builtin sum() walks the 1.7M-row column one Python object at a time;
# int() keeps the displayed result a plain Python integer.
int(df['MISSING_DATA'].sum())

10

In [88]:
# Remove the trips whose MISSING_DATA flag is set, then re-check the
# per-column non-null counts.
missing_rows = df[df['MISSING_DATA'] == True].index
df.drop(missing_rows, inplace=True)
df.count()

TRIP_ID         1710660
CALL_TYPE       1710660
ORIGIN_CALL      364769
ORIGIN_STAND     806576
TAXI_ID         1710660
TIMESTAMP       1710660
DAY_TYPE        1710660
MISSING_DATA    1710660
POLYLINE        1710660
dtype: int64

In [117]:
# POLYLINE is read from the CSV as the string form of a list. Parse each
# entry into a real Python list so that the start/end points can be indexed
# and the number of points counted by the cells below.
df['POLYLINE'] = df['POLYLINE'].map(literal_eval)

In [126]:
# Start-to-end distance of a trip, used as a feature in place of the raw
# POLYLINE column.
# NOTE(review): `vincenty` was removed in geopy 2.0 in favour of `geodesic`;
# confirm the installed geopy version still provides it.
def distance(polyline):
    """Return the distance between the first and last point of a trip.

    Parameters
    ----------
    polyline : sequence of coordinate pairs
        The parsed POLYLINE of one trip. Each pair is assumed to be stored as
        ``[longitude, latitude]`` (as in the Porto taxi dataset description)
        -- TODO confirm against the data source.

    Returns
    -------
    float or str
        The distance in kilometres as a plain float, or the string ``"NaN"``
        when it cannot be computed (for example for an empty polyline). The
        string sentinel is kept because the filtering cell that follows
        compares the column against ``"NaN"``.
    """
    try:
        first, last = polyline[0], polyline[-1]
        # geopy expects (latitude, longitude), so swap the stored order.
        start = (first[1], first[0])
        end = (last[1], last[0])
        # `.kilometers` turns the geopy Distance object into a number that
        # can be used directly as a model feature.
        return vincenty(start, end).kilometers
    except Exception:
        # Best-effort: any failure marks the trip as having no usable distance.
        return "NaN"

In [127]:
# Start-to-end distance of every trip, stored in a new DISTANCE column.
df['DISTANCE'] = df['POLYLINE'].map(distance)

In [129]:
# Drop the trips whose distance could not be computed (distance() marks
# them with the "NaN" string sentinel).
df.drop(df[df.DISTANCE == "NaN"].index, inplace=True)
# Preview the remaining distances. This has to be the last expression in the
# cell: Jupyter only renders the value of the final expression, so a head()
# call placed before the drop is evaluated and silently discarded.
df.DISTANCE.head()

In [131]:
# Calculate label
def trip_time(polyline, sample_interval=15):
    """Return the duration of a trip derived from its number of GPS points.

    Parameters
    ----------
    polyline : sequence
        The parsed POLYLINE of one trip; only its length is used.
    sample_interval : int or float, optional
        Time between two consecutive points. Defaults to 15, the value that
        was previously hard-coded (presumably seconds -- confirm against the
        dataset description).

    Returns
    -------
    int or float
        ``(number of points - 1) * sample_interval``. A polyline with zero or
        one point yields 0 rather than a negative duration.
    """
    # N points span N - 1 intervals; clamp so an empty polyline gives 0, not -1.
    intervals = max(len(polyline) - 1, 0)
    return intervals * sample_interval
# Regression target: per-trip duration computed from the POLYLINE length.
df['TRIP_TIME'] = df['POLYLINE'].map(trip_time)

In [None]:
# TODO: derive extra features from the given TIMESTAMP, e.g.
#   - hour of day, classified as peak hours / ok hours / easy hours (night time)
#   - day of the week
#   - month of the year

In [132]:
# Training frame: everything except the raw POLYLINE lists.
# `axis` is passed by keyword because pandas 2.0 made it keyword-only; the
# positional form `df.drop('POLYLINE', 1)` raises TypeError there.
train = df.drop('POLYLINE', axis=1)

In [None]:
# Okay, time for training with LinearRegression