In [1]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LinearRegression # Our ML model
from sklearn.preprocessing import LabelEncoder # Preprocess to get float
import numpy as np # Numpy
from geopy.distance import vincenty # To calculate distance
from ast import literal_eval # This is to convert string representation of array to actual array

In [2]:
'''
/usr/local/lib/python3.6/site-packages/scipy/linalg/basic.py:1018: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  warnings.warn(mesg, RuntimeWarning)
'''
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

In [3]:
df = pd.read_csv('../input/train.csv')

### Analyzing data

What are all columns/features we have?

In [4]:
df.columns

Index(['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE'],
      dtype='object')

Missing feature/value?

In [5]:
df.columns[df.isnull().any()]

Index(['ORIGIN_CALL', 'ORIGIN_STAND'], dtype='object')

Count per feature/column

In [6]:
df.count()

TRIP_ID         1710670
CALL_TYPE       1710670
ORIGIN_CALL      364770
ORIGIN_STAND     806579
TAXI_ID         1710670
TIMESTAMP       1710670
DAY_TYPE        1710670
MISSING_DATA    1710670
POLYLINE        1710670
dtype: int64

Let's see few rows of data ..

In [7]:
sum(df.MISSING_DATA)

10

In [8]:
df.drop(df[df.MISSING_DATA == True].index, inplace=True)
df.count()

TRIP_ID         1710660
CALL_TYPE       1710660
ORIGIN_CALL      364769
ORIGIN_STAND     806576
TAXI_ID         1710660
TIMESTAMP       1710660
DAY_TYPE        1710660
MISSING_DATA    1710660
POLYLINE        1710660
dtype: int64

In [9]:
# I need to calculate distance between starting and ending location,
# so what I'm gonna do is take POLYLINE column, and split it into multiple
# chunks- so that I can load entire column into memory and split it

# Convert string list representation to list
df['POLYLINE'] = df['POLYLINE'].apply(literal_eval)

In [10]:
# We need to calculate distance and remove this POLYLINE 
# Vectorize is fun - https://docs.scipy.org/doc/numpy-1.10.1/reference/generated/numpy.vectorize.html
def distance(polyline):
    try:
        return vincenty(polyline[0], polyline[-1]).miles
    except Exception as e:
        return float('nan')
# Let's see how much time it takes for 1 chunk
#dist_1 = v_dist(polyline_chunks[0])

In [11]:
# Calculate distance using above method
df['DISTANCE'] = df['POLYLINE'].apply(distance)

In [12]:
print(df.DISTANCE.head())
# Drop distances with "NaN"
df.drop(df[df.DISTANCE == float('nan')].index, inplace=True)

0    1.225683
1    1.927831
2    0.206913
3    2.371665
4    2.841243
Name: DISTANCE, dtype: float64


In [13]:
# Calculate label
def trip_time(polyline):
    return (len(polyline) - 1) * 15
label = df['POLYLINE'].apply(trip_time)

In [50]:
# We'll (or we can) do fancy stuff like getting hour of day and classify them as 
# (peak hours, ok hours, easy hour/night time). 
# Get the day of the week
# Get the month of the year
# this all can be done with given timestamp
df['my_dates'] = pd.to_datetime(df['TIMESTAMP'])
df['day_of_week'] = df['my_dates'].dt.dayofweek
df['month_of_year'] = df['my_dates'].dt.month

In [51]:
# Drop polyline, missing data
train = df.drop(['POLYLINE', 'MISSING_DATA', 'ORIGIN_CALL','TAXI_ID', 'TIMESTAMP', 'TRIP_ID', 'my_dates'], 1)
train.columns

Index(['CALL_TYPE', 'ORIGIN_STAND', 'DAY_TYPE', 'DISTANCE', 'day_of_week',
       'month_of_year'],
      dtype='object')

In [52]:
#LabelEncoder
train = train.apply(LabelEncoder().fit_transform)

In [53]:
# Okay, time for training with LinearRegression
lr = LinearRegression()
lr = lr.fit(train, label)

In [54]:
# Test df
test_df = pd.read_csv('../input/test.csv')

In [55]:
test_df.drop(test_df[test_df.MISSING_DATA == True].index, inplace=True)
# I need to calculate distance between starting and ending location,
# so what I'm gonna do is take POLYLINE column, and split it into multiple
# chunks- so that I can load entire column into memory and split it

# Convert string list representation to list
test_df['POLYLINE'] = test_df['POLYLINE'].apply(literal_eval)
# Get distance
test_df['DISTANCE'] = test_df['POLYLINE'].apply(distance)

In [56]:
# Calculating time based on math

test_df.drop(test_df[test_df.DISTANCE == float('nan')].index, inplace=True)
result_math_df = test_df[['TRIP_ID']].copy()
test_label = test_df['POLYLINE'].apply(trip_time)
result_math_df['TRAVEL_TIME'] = test_label
result_math_df.to_csv('result_math_df.csv', index=False)

In [57]:
# New columns - day of the week and month of the year
test_df['my_dates'] = pd.to_datetime(test_df['TIMESTAMP'])
test_df['day_of_week'] = test_df['my_dates'].dt.dayofweek
test_df['month_of_year'] = test_df['my_dates'].dt.month

In [58]:
# Predicting time based on Regression technique 

test = test_df.drop(['POLYLINE', 'MISSING_DATA', 'ORIGIN_CALL','TAXI_ID', 'TIMESTAMP', 'TRIP_ID', 'my_dates'], 1)
test = test.apply(LabelEncoder().fit_transform)
predictions = lr.predict(test)
result_regression = result_math_df[['TRIP_ID']].copy()
result_regression['TRAVEL_TIME'] = pd.Series(predictions, index=result_regression.index)
result_regression.to_csv('result_regression.csv', index=False)