## Project Title: Predict Taxi Trip duration
#### Project Completed by: Anubha Sharma
#### Under Capabl Data Science Summer Internship

## Step 1: Import the necessary libraries

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("whitegrid")
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

## Step2: Load the training and test data

In [24]:
# Read the training set; both timestamp columns are parsed into datetimes on load.
train_data = pd.read_csv(
    "train.csv",
    parse_dates=["pickup_datetime", "dropoff_datetime"],
)
train_data.head(5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [25]:
# Read the test set; only the pickup timestamp is parsed here.
test_data = pd.read_csv(
    "test.csv",
    parse_dates=["pickup_datetime"],
)
test_data.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


## Step3: Data Exploration

In [26]:
def _create_summary_table(train_data):
    summary_table = pd.DataFrame(index =train_data.columns)
    summary_table['types'] = train_data.dtypes
    summary_table['unique_values'] = train_data.apply(lambda col: len(col.unique()))
    summary_table['pct_unique_value']= summary_table['unique_values'] / train_data.shape[0]
    summary_table['nan_values'] = train_data.apply(lambda col: col.isna().sum())
    stats=train_data.describe(include='all').T
    required_columns= ['min','max','mean','std']
    summary_table[required_columns]=stats[required_columns]

    return summary_table

### Data Exploration of Train Data

In [None]:
# Per-column dtype, distinct-value count, missing-value count and min/max/mean/std for the training frame.
_create_summary_table(train_data)

### Data Exploration of Test Data

In [None]:
# Same per-column overview for the test frame, for comparison with the training frame.
_create_summary_table(test_data)

In [None]:
# `dropoff_datetime` is not loaded for the test set, so it cannot be used as a feature.
# errors="ignore" makes the cell safe to re-run after the column is already gone.
train_data = train_data.drop(columns=['dropoff_datetime'], errors='ignore')

## Step4: Exploratory Data Analysis

In [None]:
# Scatter of trip duration against dropoff longitude, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.scatterplot(
    x='dropoff_longitude',
    y='trip_duration',
    data=train_data,
    alpha=0.7,
    ax=ax,
)
ax.set_title('Dropoff Longitude vs Trip Duration')

In [None]:
# Trip duration per passenger count, with bars split by the store-and-forward flag.
sns.barplot(
    data=train_data,
    x='passenger_count',
    y='trip_duration',
    hue='store_and_fwd_flag',
)

In [None]:
# Number of trips per pickup hour, per pickup weekday and per pickup month, side by side.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(12, 8))

pickup_times = train_data['pickup_datetime']
hourly_count = train_data.groupby(pickup_times.dt.hour)['trip_duration'].count().reset_index()
weekly_count = train_data.groupby(pickup_times.dt.dayofweek)['trip_duration'].count().reset_index()
monthly_count = train_data.groupby(pickup_times.dt.to_period('M'))['trip_duration'].count().reset_index()

# (axes, bar positions, bar heights, colour, x-axis label) for each panel
panels = [
    (ax1, hourly_count.pickup_datetime, hourly_count.trip_duration, 'green', 'Hour of the Day'),
    (ax2, weekly_count.pickup_datetime, weekly_count.trip_duration, 'blue', 'Day of the week'),
    # the monthly grouping key is a Period; .dt.month gives the month number for the x-axis
    (ax3, monthly_count.pickup_datetime.dt.month, monthly_count.trip_duration, 'gold', 'Month of the year'),
]

# Only the left-most panel carries the y-axis label.
ax1.set_ylabel('count')
for axis, positions, heights, colour, x_label in panels:
    axis.bar(x=positions, height=heights, color=colour)
    axis.set_xlabel(x_label)

## Step5: Data Preprocessing

### Label Encoding and Min-Max Scaling

### Label Encoding of Train Data

In [None]:
# Replace each listed column with integer codes. The same encoder object is
# re-fitted for every column, so after the loop `lc` only remembers the
# mapping of the last column processed.
# NOTE(review): `id` and `pickup_datetime` are encoded as arbitrary/rank codes;
# consider dropping `id` and deriving numeric time features instead - confirm intent.
lc = LabelEncoder()
for column in ('id', 'store_and_fwd_flag', 'vendor_id', 'pickup_datetime'):
    train_data[column] = lc.fit_transform(train_data[column])

### Label Encoding of Test Data

In [None]:
# Integer-encode the same columns in the test frame.
# NOTE(review): the encoder is re-fitted on the test values here, so a given
# code is not guaranteed to mean the same thing as in the training frame.
for column in ('id', 'store_and_fwd_flag', 'vendor_id', 'pickup_datetime'):
    test_data[column] = lc.fit_transform(test_data[column])

### Heat Map

In [None]:
# Pairwise correlation of all (now numeric) training columns, annotated per cell.
correlations = train_data.corr()
fig, ax = plt.subplots(figsize=(16, 8))
sns.heatmap(correlations, ax=ax, annot=True, cmap='Blues', linewidth=.1)

In [None]:
# Scale the feature columns to [0, 1].
# The scaler is fitted on the TRAINING features only and then re-used for the
# test set, so both sets are mapped with the same per-column min/max.
# Fitting a second time on the test set would map the two sets differently.
feature_columns = [col for col in train_data.columns if col != 'trip_duration']

scaler = MinMaxScaler()
train_features_scaled = scaler.fit_transform(train_data[feature_columns])

# Selecting by `feature_columns` also guarantees the same column order as in training.
# Test values outside the training range will fall outside [0, 1]; that is expected.
test_data = scaler.transform(test_data[feature_columns])

# Keep the layout later cells rely on: a 2-D array with the features first and the
# target as the last column. The target is left in its original units so that
# predictions and error metrics are directly interpretable.
train_data = np.column_stack(
    [train_features_scaled, train_data['trip_duration'].to_numpy()]
)

In [None]:
# The scaler returns a plain array; wrap it in a DataFrame again.
# Column labels are now the integers 0..n-1 (the original names are not restored).
train_data = pd.DataFrame(data=train_data)
train_data.head()

In [None]:
# Same for the test array: back to a DataFrame with integer column labels.
test_data = pd.DataFrame(data=test_data)
test_data.head()

## Step6: Model Building

In [None]:
# 6.1 Splitting the data into training and hold-out sets
# Column 9 is the target (trip_duration); every other column is a feature.
X = train_data.drop([9], axis=1)
y = train_data[9]
print(X.head())
print(y.head())  # a bare `y.head()` in the middle of a cell displays nothing

# A fixed random_state makes the split, and therefore the reported metrics,
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# 6.2 Create an ordinary least-squares regression model
lr = LinearRegression()
# 6.3 Fit it on the training split
lr.fit(X=X_train, y=y_train)

In [None]:
# 6.4 Predict on the hold-out split
y_pred = lr.predict(X_test)
# Shown as a single column for display only; the reshaped array is not stored,
# so `y_pred` itself keeps its original shape.
y_pred.reshape((-1, 1))

In [None]:
# 6.5 Evaluating the model on the hold-out split
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the three metrics in a fixed order.
for label, value in (
    ('The Mean Squared Error is ', mse),
    ('The Mean Absolute Error is ', mae),
    ('The R2 Score is ', r2),
):
    print(label, value)

## Making Predictions on the Test Data

In [None]:
# Predict with the fitted model on the preprocessed test features.
test_prediction = lr.predict(X=test_data)

In [None]:
# Store the predictions next to the features they were computed from.
# NOTE(review): this adds a column to `test_data` in place, so calling
# lr.predict(test_data) again after this cell would pass one extra column.
test_data['Predictions']=test_prediction
test_data