In [None]:
# standard dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split

In [None]:
# Load the first `chunksize` rows of the training file and take a first look.
chunksize = 10 ** 6
# `nrows` reads exactly the rows the first chunk would contain, without
# building a chunk iterator only to break out of it after one step.
train = pd.read_csv('../input/train.csv', nrows=chunksize, parse_dates=['pickup_datetime'])

test = pd.read_csv('../input/test.csv')
train.head()


In [None]:
# Histogram of the fares as loaded, before any cleaning.
fig, ax = plt.subplots()
ax.hist(train['fare_amount'].values, bins=100)
ax.set_xlabel("fare_amount")
ax.set_ylabel('n of records')

In [None]:
# Summary statistics of the loaded rows, as a sanity check of the value ranges.
train.describe()

In [None]:
# Non-positive fares and rides without passengers are treated as obvious
# mistakes; the dataset is large, so dropping those rows is acceptable.
train = train.loc[lambda df: (df['fare_amount'] > 0) & (df['passenger_count'] > 0)]
train.describe()

In [None]:
# Log-scale view of the fare paid.
# log1p(x) == log(1 + x). Writing `np.log(x) + 1` instead only shifts the
# logarithm by a constant and does nothing to tame small values.
# `assign` returns a new frame, so nothing is written into a filtered slice of
# an earlier DataFrame (which triggers SettingWithCopyWarning).
train = train.assign(log_fair_amount=np.log1p(train['fare_amount']))

plt.hist(train['log_fair_amount'].values, bins = 100)
plt.xlabel("log_fare_amount")
plt.ylabel('n of records')
plt.show()

In [None]:
# Fare histogram again, now that invalid fares and passenger counts are gone.
fig, ax = plt.subplots()
ax.hist(train['fare_amount'].values, bins=100)
ax.set_xlabel("fare_amount")
ax.set_ylabel('n of records')

In [None]:
# Look only at the rides that report more than six passengers.
oversized_parties = train.loc[train['passenger_count'] > 6, 'passenger_count']
fig, ax = plt.subplots()
ax.hist(oversized_parties.values)
ax.set_xlabel('passenger_count')
ax.set_ylabel('number of records')

In [None]:
# Drop only the rides with more than six passengers — the group inspected in
# the histogram above. A strict `< 6` would also discard every six-passenger
# ride, which was never identified as an outlier.
train = train[train['passenger_count'] <= 6]

In [None]:
# Re-check the summary statistics after the passenger-count filter.
train.describe()

In [None]:
# Keep rides whose pickup AND dropoff longitude both lie in [-75, -70].
# Each bound has to be applied to both endpoints: bounding pickup only from
# above and dropoff only from below lets e.g. a pickup at -1000 through.
# `between` is inclusive on both ends, matching the `<=` / `>=` comparisons.
train = train[
    train['pickup_longitude'].between(-75, -70)
    & train['dropoff_longitude'].between(-75, -70)
]
train.describe()

In [None]:
# Keep rides whose pickup AND dropoff latitude both lie in [-35, 45]; each
# bound is applied to both endpoints instead of one bound per endpoint.
# NOTE(review): the -35 lower bound is kept as found, but it looks like a typo
# for 35 (the longitude window suggests a north-east US area) — confirm.
train = train[
    train['pickup_latitude'].between(-35, 45)
    & train['dropoff_latitude'].between(-35, 45)
]
train.describe()

In [None]:
# Absolute pickup-to-dropoff coordinate differences, used later as features.
# `assign` builds a new frame, so nothing is written into a filtered slice of
# an earlier DataFrame (which triggers SettingWithCopyWarning).
train = train.assign(
    abs_longtitude_diff=(train['pickup_longitude'] - train['dropoff_longitude']).abs(),
    abs_latitude_diff=(train['pickup_latitude'] - train['dropoff_latitude']).abs(),
)
# Drop rides spanning more than one degree in either direction. The explicit
# copy makes the result an independent frame, so later cells can add columns
# to it without chained-assignment warnings.
train = train[(train['abs_longtitude_diff'] <= 1) & (train['abs_latitude_diff'] <= 1)].copy()
train.describe()

In [None]:
# Pairwise correlations between the fare and the candidate predictors.
candidate_columns = ['fare_amount', 'passenger_count', 'abs_longtitude_diff', 'abs_latitude_diff']
train[candidate_columns].corr()

In [None]:
# Target plus the two coordinate-delta predictors.
features = ['fare_amount', 'abs_longtitude_diff', 'abs_latitude_diff']
# Work on an explicit copy so that popping the target never touches `train`.
X = train[features].copy()
y = X.pop('fare_amount')

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: linear regression on the coordinate deltas, scored on the
# held-out split (`fit` returns the fitted estimator itself).
model = LinearRegression().fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# TODO: add date/time features (e.g. rush hours)
# https://www.quora.com/What-times-are-considered-rush-hour-in-New-York-City
# Start with a histogram of rides per hour.

In [None]:
# List the columns currently present in the training frame.
train.columns

In [None]:
# Bar chart of the number of rides per pickup hour of day.
pickup_hour = train["pickup_datetime"].dt.hour
rides_per_hour = train.groupby(pickup_hour)['pickup_datetime'].count()
rides_per_hour.plot.bar()

In [None]:
# Create a "rush-hours" coefficient from the pickup hour:
#   1 for hours 0-7, 2 for hours 8-17, 3 for everything later.
pickup_hour = train["pickup_datetime"].dt.hour
train['hour'] = pickup_hour
train['year'] = train["pickup_datetime"].dt.year
# np.select takes the first matching condition, like the nested np.where did.
train['rush'] = np.select([pickup_hour <= 7, pickup_hour <= 17], [1, 2], default=3)

In [None]:
# Correlation matrix over the numeric columns only. Since pandas 2.0,
# `DataFrame.corr()` no longer silently drops non-numeric columns and raises
# on string columns such as `key`, so restrict the frame explicitly.
train.select_dtypes(include='number').corr()

In [None]:
# Fare amount plotted against pickup time.
train.plot(x="pickup_datetime", y='fare_amount')

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the pickup year; the scaler wants a 2-D (n_rows, 1) array.
scaler = StandardScaler()
pickup_year_column = train["pickup_datetime"].dt.year.values.reshape(-1, 1)
train['year_standardized'] = scaler.fit_transform(pickup_year_column)

In [None]:
# Inspect the newly added standardised-year column.
train['year_standardized']

In [None]:
# Target plus the coordinate deltas and the standardised pickup year.
features = ['fare_amount', 'abs_longtitude_diff', 'abs_latitude_diff', 'year_standardized']
# Work on an explicit copy so that popping the target never touches `train`.
X = train[features].copy()
y = X.pop('fare_amount')

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression refitted on the extended feature set and scored on the
# held-out split (`fit` returns the fitted estimator itself).
linear_model = LinearRegression().fit(X_train, y_train)
linear_model.score(X_test, y_test)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Shallow regression tree (depth capped at 4), scored on the held-out split.
tree_model = DecisionTreeRegressor(max_depth=4).fit(X_train, y_train)
tree_model.score(X_test, y_test)

In [None]:
# SVR was too slow to fit here, so it is left disabled:
#from sklearn.svm import SVR
#model = SVR()
#model.fit(X_train, y_train)
#model.score(X_test, y_test)

In [None]:
from sklearn.ensemble import BaggingRegressor

# Bagged ensemble of the depth-4 tree. Bagging draws random bootstrap samples,
# so the seed is fixed (same value as the train/test split) to make the score
# and the resulting submission reproducible across runs.
ensemble_model = BaggingRegressor(tree_model, random_state=42)
ensemble_model.fit(X_train, y_train)
ensemble_model.score(X_test, y_test)

In [None]:
# Load the rows to predict, parsing the pickup timestamp so `.dt` accessors work.
to_predict = pd.read_csv('../input/test.csv', parse_dates=['pickup_datetime'])

In [None]:
# Peek at the first rows of the prediction set.
to_predict.head()

In [None]:
# Show the training features, as a reminder of the columns the models expect.
X_train.head()

In [None]:
# Same absolute coordinate differences as computed for the training rows.
longitude_delta = to_predict['pickup_longitude'].sub(to_predict['dropoff_longitude'])
latitude_delta = to_predict['pickup_latitude'].sub(to_predict['dropoff_latitude'])
to_predict['abs_longtitude_diff'] = longitude_delta.abs()
to_predict['abs_latitude_diff'] = latitude_delta.abs()

In [None]:

# Scale the pickup year with the scaler that was fitted on the training data.
# `transform` (not `fit_transform`) is essential: refitting on these rows
# would re-centre the feature on their own mean/variance, so the models would
# see values on a different scale from the one they were trained on.
to_predict['year_standardized'] = scaler.transform(
    to_predict["pickup_datetime"].dt.year.values.reshape(-1, 1)
).ravel()
# Drop the raw columns that are not model inputs; `key` stays for the submission.
to_predict = to_predict.drop(
    columns=["pickup_datetime", "pickup_longitude", "pickup_latitude",
             "dropoff_longitude", "dropoff_latitude", "passenger_count"]
)
to_predict.head()

In [None]:
# Row identifiers for the submission file. Reading the column instead of
# popping it keeps this cell re-runnable.
key = to_predict['key']
# Select exactly the columns the model was trained on, in training order,
# rather than relying on whatever columns happen to be left in the frame.
predictions = ensemble_model.predict(to_predict[list(X_train.columns)])

In [None]:
# Assemble the submission table: one predicted fare per key.
submission_columns = ['key', 'fare_amount']
submission_data = {'key': key, 'fare_amount': predictions}
to_submit = pd.DataFrame(submission_data, columns=submission_columns)

In [None]:
# Write the submission without the DataFrame index, then preview it.
to_submit.to_csv(path_or_buf="submission.csv", index=False)
to_submit.head()