# In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from math import radians, cos, sin, asin, sqrt

import folium
from folium import FeatureGroup, LayerControl, Map, Marker
from folium.plugins import HeatMap
from folium.plugins import TimestampedGeoJson
from folium.plugins import MarkerCluster
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm

# Switch the working directory to the project folder so that the relative
# file names used in this script ('train_cab.csv' here, 'train_cleaned.csv'
# when the cleaned data is saved) resolve against it.
# NOTE(review): the path is machine-specific; adjust it before running elsewhere.
os.chdir(r"C:\Users\Bhavesh\Desktop\Data scientist\Project\Cab Fare")
os.getcwd()
# Load the raw training data and preview the first five rows.
dataset = pd.read_csv('train_cab.csv', encoding = "ISO-8859-1",sep=',')        
dataset.head(5)

# Coerce the raw columns to the dtypes used by the rest of the analysis.
# Values that cannot be parsed as numbers become NaN so that they can be
# dropped below instead of aborting the conversion.
dataset['fare_amount'] = pd.to_numeric(dataset['fare_amount'], errors='coerce').astype(float)
dataset['pickup_datetime'] = pd.to_datetime(dataset['pickup_datetime'], format='%Y-%m-%d %H:%M:%S UTC')
dataset['passenger_count'] = pd.to_numeric(dataset['passenger_count'], errors='coerce')

# Inspect the rows with a missing passenger count / fare amount.
dataset[dataset['passenger_count'].isnull()]
dataset[dataset['fare_amount'].isnull()]

# Drop the incomplete rows first: casting a column that still contains NaN
# to int raises, so the integer cast must come after dropna.
dataset = dataset.dropna(subset=['passenger_count', 'fare_amount'])
dataset['passenger_count'] = dataset['passenger_count'].astype(int)

# Quick look at the summary statistics and the column dtypes.
dataset.describe()
dataset.dtypes

# Keep only rides whose pickup and dropoff coordinates fall strictly inside
# the longitude (-78, -70) / latitude (37, 45) window ...
in_bounds = (
    (dataset['pickup_longitude'] > -78) & (dataset['pickup_longitude'] < -70)
    & (dataset['dropoff_longitude'] > -78) & (dataset['dropoff_longitude'] < -70)
    & (dataset['pickup_latitude'] > 37) & (dataset['pickup_latitude'] < 45)
    & (dataset['dropoff_latitude'] > 37) & (dataset['dropoff_latitude'] < 45)
)
# ... and that carry at least one passenger and a fare of at least 2.5.
valid_ride = (dataset['passenger_count'] > 0) & (dataset['fare_amount'] >= 2.5)

# .copy() makes dataset1 an independent frame, so the feature columns that
# are added to it later are not written into a slice of `dataset`.
dataset1 = dataset[in_bounds & valid_ride].copy()

dataset1.shape

# plotting histogram for fare amount
# (all statements sit at top level; the stray leading space that several of
# these lines carried made the script fail with an IndentationError)
plt.figure(figsize=(14, 7))
n, bins, patches = plt.hist(dataset1['fare_amount'], 10000, facecolor='blue', alpha=0.75)
plt.xlabel('Fare amount')
plt.title('Histogram of fare amount')
plt.xlim(0, 200)  # only the 0-200 fare range is shown
plt.show()
 

 # plotting KDE plot for fare amount

# Kernel density estimate of the natural log of the fare amounts in dataset1,
# drawn on its own 8x5 figure.
plt.figure(figsize=(8,5))
sns.kdeplot(np.log(dataset1['fare_amount'].values)).set_title("Distribution of fare amount (log scale)")

# plotting the number of rides per passenger count
dataset1.groupby('passenger_count').size()
# keep rides with at most 6 passengers
dataset1 = dataset1.loc[dataset1['passenger_count'] <= 6]
# Series.plot draws onto the current axes when a figure is already open, so
# start a fresh figure to keep the bars off the previous plot.
plt.figure()
dataset1['passenger_count'].value_counts().plot.bar(color='b', edgecolor='k')
plt.title('Histogram of passenger counts')
plt.xlabel('Passenger counts')
plt.ylabel('Count')

# creating calendar features (variables) from pickup_datetime
# Work on an independent copy so the new columns are not assigned into a
# filtered slice of another frame.
dataset1 = dataset1.copy()
# Read the timestamps from dataset1 itself rather than from the unfiltered
# `dataset`, so the features never depend on index alignment between the two.
pickup_time = dataset1['pickup_datetime'].dt
dataset1['year'] = pickup_time.year
dataset1['month'] = pickup_time.month
dataset1['day'] = pickup_time.day
dataset1['weekday'] = pickup_time.weekday
dataset1['hour'] = pickup_time.hour
dataset1.head(5)

# great-circle distance helper used to build the `distance` feature
def haversine_np(lon1, lat1, lon2, lat2):
    """Return the haversine distance in kilometres between two points.

    All four arguments are given in degrees and are converted with
    ``np.radians``; they may be scalars or array-likes that NumPy can
    operate on element-wise. Note the argument order: longitude first,
    then latitude, for each point.
    """
    earth_radius_km = 6371  # use 3956 instead for miles

    lon_a, lat_a, lon_b, lat_b = (
        np.radians(angle) for angle in (lon1, lat1, lon2, lat2)
    )

    delta_lon = lon_b - lon_a
    delta_lat = lat_b - lat_a

    hav = (
        np.sin(delta_lat / 2.0) ** 2
        + np.cos(lat_a) * np.cos(lat_b) * np.sin(delta_lon / 2.0) ** 2
    )
    central_angle = 2 * np.arcsin(np.sqrt(hav))
    return earth_radius_km * central_angle


 
# haversine_np takes (lon1, lat1, lon2, lat2). The arguments are passed by
# keyword so that longitude and latitude cannot be swapped, as they were when
# the latitude columns were passed in the longitude positions.
dataset1['distance'] = haversine_np(
    lon1=dataset1['pickup_longitude'],
    lat1=dataset1['pickup_latitude'],
    lon2=dataset1['dropoff_longitude'],
    lat2=dataset1['dropoff_latitude'],
)

# plotting the scatter plot for fare amount VS trip distance
yaxix = (0, 500)  # fare range shown on the y axis
# plt.scatter draws onto the current axes; open a new figure so the points
# are not added on top of the previously drawn plot.
plt.figure()
plt.scatter(x=dataset1['distance'], y=dataset1['fare_amount'])
plt.xlabel("Trip Distance")
plt.ylabel("Fare Amount")
plt.ylim(yaxix)

# plotting the histogram for ride distance (1000 bins, on its own 14x4 figure)
plt.figure(figsize = (14, 4))
n, bins, patches = plt.hist(dataset1.distance, 1000, facecolor='blue', alpha=0.75)
plt.xlabel('distance')
plt.title('Histogram of ride distance')
plt.show();

# plotting the scatter plot for dropoff's longitude and latitude
# The two tuples below are the x/y axis limits applied to both maps.
city_long_border = (-74.03, -73.85)
city_lat_border = (40.63, 40.85)
dataset1.plot(kind='scatter', x='dropoff_longitude', y='dropoff_latitude',color='red', s=.02, alpha=.6)
plt.title("Dropoffs")
plt.ylim(city_lat_border)
plt.xlim(city_long_border)

# plotting the scatter plot for pickup's longitude and latitude
# (same axis limits as the dropoff map, so the two plots are comparable)
dataset1.plot(kind='scatter', x='pickup_longitude', y='pickup_latitude',color='blue', s=.02, alpha=.6)
plt.title("Pickups")
plt.ylim(city_lat_border)
plt.xlim(city_long_border)

# setting the LaGuardia bounding box and counting the pickups and dropoffs
# that fall inside it
LAGuardia = {
    'min_lng': -73.8895,
    'min_lat': 40.7664,
    'max_lng': -73.8550,
    'max_lat': 40.7931,
}
LA_center = [40.763599, -73.863029]

# Get all pickups from LaGuardia. Both coordinate conditions are combined
# into one mask built on dataset1 and applied to dataset1, instead of
# indexing an already filtered frame with a mask from the unfiltered one.
pickup_in_box = (
    (dataset1.pickup_latitude >= LAGuardia['min_lat'])
    & (dataset1.pickup_latitude <= LAGuardia['max_lat'])
    & (dataset1.pickup_longitude >= LAGuardia['min_lng'])
    & (dataset1.pickup_longitude <= LAGuardia['max_lng'])
)
LA_data = dataset1.loc[pickup_in_box]
print("Number of Trips with Pickups from LA Guardia",LA_data.shape[0])

# Get all dropoffs at LaGuardia the same way.
dropoff_in_box = (
    (dataset1.dropoff_latitude >= LAGuardia['min_lat'])
    & (dataset1.dropoff_latitude <= LAGuardia['max_lat'])
    & (dataset1.dropoff_longitude >= LAGuardia['min_lng'])
    & (dataset1.dropoff_longitude <= LAGuardia['max_lng'])
)
LA_dropoff = dataset1.loc[dropoff_in_box]
print("Number of Trips with Dropoffs to LA Guardia",LA_dropoff.shape[0])

# plotting the KDE plot of log fare amount for La Guardia pickups against all trips
plt.figure(figsize=(8,5))
sns.kdeplot(np.log(LA_data['fare_amount'].values),label='La Guardia Pickups')
#sns.kdeplot(np.log(JFK_dropoff['fare_amount'].values),label='JFK Dropoff')
sns.kdeplot(np.log(dataset1['fare_amount'].values),label='All Trips in Train data')
plt.title("Fare Amount Distribution")
# NOTE(review): the curves are labelled but plt.legend() is never called here;
# confirm the labels are actually shown with the seaborn version in use.

# plotting the KDE plot of log fare amount for La Guardia dropoffs against all trips
plt.figure(figsize=(8,5))
sns.kdeplot(np.log(LA_dropoff['fare_amount'].values),label='LA Guardia dropoffs')
sns.kdeplot(np.log(dataset1['fare_amount'].values),label='train')
plt.title("Dropoffs vs Fare Amount")

# creating a function that flags LaGuardia pickups and dropoffs; it is used
# to build the is_pickup_la_guardia and is_dropoff_la_guardia features
def isAirport(latitude,longitude,airport_name='LA Guardia'):
    """Return 1 if the point lies inside the airport's bounding box, else 0.

    Parameters:
        latitude, longitude: coordinates of the point to test.
        airport_name: only LaGuardia is known here; the name is matched
            ignoring case and spaces, so 'LA Guardia', 'la guardia' and
            'LaGuardia' all select it.

    Raises:
        ValueError: if ``airport_name`` is not a known airport.

    Note: a later definition with the same name in this file replaces this one.
    """
    # Normalise the name: the default 'LA Guardia' used to be compared
    # case-sensitively with 'la guardia', which never matched and left the
    # bounding box undefined.
    normalized = airport_name.replace(' ', '').lower()
    if normalized != 'laguardia':
        raise ValueError("Unknown airport: " + repr(airport_name))

    boundary = {
        'min_lng': -73.8895,
        'min_lat': 40.7664,
        'max_lng': -73.8550,
        'max_lat': 40.7931,
    }
    inside = (
        boundary['min_lat'] <= latitude <= boundary['max_lat']
        and boundary['min_lng'] <= longitude <= boundary['max_lng']
    )
    # Always return 0 or 1; a point with matching latitude but non-matching
    # longitude used to fall through and return None.
    return 1 if inside else 0
    
# Latitude/longitude bounding box for each known airport, keyed by name.
nyc_airports = {
    'LaGuardia': {
        'min_lng': -73.8895,
        'min_lat': 40.7664,
        'max_lng': -73.8550,
        'max_lat': 40.7931,
    },
}


def isAirport(latitude,longitude,airport_name='LA Guardia'):
    """Return 1 if the point lies inside the named airport's box, else 0.

    Parameters:
        latitude, longitude: coordinates of the point to test.
        airport_name: a key of ``nyc_airports``, matched ignoring case and
            spaces so that the default 'LA Guardia' resolves to the
            'LaGuardia' entry (an exact lookup of the default raised KeyError).

    Raises:
        KeyError: if no entry of ``nyc_airports`` matches ``airport_name``.
    """
    wanted = airport_name.replace(' ', '').lower()
    for name, box in nyc_airports.items():
        if name.replace(' ', '').lower() == wanted:
            break
    else:
        raise KeyError(airport_name)

    inside = (
        box['min_lat'] <= latitude <= box['max_lat']
        and box['min_lng'] <= longitude <= box['max_lng']
    )
    return 1 if inside else 0

# Flag every ride whose pickup / dropoff point falls inside the LaGuardia box.
dataset1['is_pickup_la_guardia'] = dataset1.apply(
    lambda trip: isAirport(trip['pickup_latitude'], trip['pickup_longitude'], 'LaGuardia'),
    axis=1,
)
dataset1['is_dropoff_la_guardia'] = dataset1.apply(
    lambda trip: isAirport(trip['dropoff_latitude'], trip['dropoff_longitude'], 'LaGuardia'),
    axis=1,
)

# checking the non-airport rides: neither end of the trip is at LaGuardia
non_airport = dataset1.loc[
    (dataset1['is_dropoff_la_guardia'] == 0) & (dataset1['is_pickup_la_guardia'] == 0)
]
non_airport.shape

# latitude/longitude bounding box for each New York borough, keyed by name
nyc_boroughs = {
    'manhattan': {
        'min_lng': -74.0479, 'min_lat': 40.6829,
        'max_lng': -73.9067, 'max_lat': 40.8820,
    },
    'queens': {
        'min_lng': -73.9630, 'min_lat': 40.5431,
        'max_lng': -73.7004, 'max_lat': 40.8007,
    },
    'brooklyn': {
        'min_lng': -74.0421, 'min_lat': 40.5707,
        'max_lng': -73.8334, 'max_lat': 40.7395,
    },
    'bronx': {
        'min_lng': -73.9339, 'min_lat': 40.7855,
        'max_lng': -73.7654, 'max_lat': 40.9176,
    },
    'staten_island': {
        'min_lng': -74.2558, 'min_lat': 40.4960,
        'max_lng': -74.0522, 'max_lat': 40.6490,
    },
}


def getBorough(lat,lng):
    """Return the name of the first borough whose box contains the point.

    The boxes are tried in the order they appear in ``nyc_boroughs`` and
    their edges are inclusive; where boxes overlap, the earlier entry wins.
    Returns 'others' when no box contains the point.
    """
    for borough, box in nyc_boroughs.items():
        lat_inside = box['min_lat'] <= lat <= box['max_lat']
        if lat_inside and box['min_lng'] <= lng <= box['max_lng']:
            return borough
    return 'others'

# Creating the pickup_borough and dropoff_borough features by looking up
# the borough of each end of the ride
dataset1['pickup_borough'] = dataset1.apply(
    lambda trip: getBorough(trip['pickup_latitude'], trip['pickup_longitude']),
    axis=1,
)
dataset1['dropoff_borough'] = dataset1.apply(
    lambda trip: getBorough(trip['dropoff_latitude'], trip['dropoff_longitude']),
    axis=1,
)

# Creating count plots for the pickup and dropoff boroughs
# (the stray trailing quote after each plt.title(...) call was a syntax error)
plt.figure(figsize=(8,5))
sns.countplot(x=dataset1['pickup_borough'])
plt.title("Distribution of Pickup Boroughs")

plt.figure(figsize=(8,5))
sns.countplot(x=dataset1['dropoff_borough'])
plt.title("Distribution of dropoff Boroughs")


# creating KDE plots of the log fare amount, one subplot per borough
plt.figure(figsize=(16,10))
# suptitle titles the whole figure; plt.title here would create and title a
# separate full-size axes before any subplot exists.
plt.suptitle("Distribution of Fare Amount Across Buroughs")
for i, key in enumerate(nyc_boroughs, start=1):
    plt.subplot(3,2,i)
    pickup_fares = dataset1.loc[dataset1['pickup_borough']==key,'fare_amount'].values
    dropoff_fares = dataset1.loc[dataset1['dropoff_borough']==key,'fare_amount'].values
    sns.kdeplot(np.log(pickup_fares),label='Pickup '+ key)
    sns.kdeplot(np.log(dropoff_fares),label='Dropoff'+ key).set_title("Fare Amount (log scale) for "+key)
  
 # creating KDE plot Distribution of trip distance Across Buroughs
    
plt.figure(figsize=(24,15))
# suptitle titles the whole figure; plt.title here would create and title a
# separate full-size axes before any subplot exists.
plt.suptitle("Distribution of Trip Distances Across Buroughs")
for i, key in enumerate(nyc_boroughs, start=1):
    plt.subplot(3,2,i)
    pickup_km = dataset1.loc[dataset1['pickup_borough']==key,'distance'].values
    dropoff_km = dataset1.loc[dataset1['dropoff_borough']==key,'distance'].values
    # log(0) is -inf, so rides with zero distance are left out of the
    # log-scale density instead of feeding infinities to the estimator.
    pickup_km = pickup_km[pickup_km > 0]
    dropoff_km = dropoff_km[dropoff_km > 0]
    sns.kdeplot(np.log(pickup_km),label='Pickup '+ key)
    sns.kdeplot(np.log(dropoff_km),label='Dropoff'+ key).set_title("Trip Distance (log scale) for "+key)
    

# bounding box and helper function for rides in lower Manhattan
lower_manhattan_boundary = {
    'min_lng': -74.0194,
    'min_lat': 40.6997,
    'max_lng': -73.9716,
    'max_lat': 40.7427,
}


def isLowerManhattan(lat,lng):
    """Return 1 if (lat, lng) is inside ``lower_manhattan_boundary``, else 0.

    The edges of the box count as inside.
    """
    box = lower_manhattan_boundary
    inside = (
        box['min_lat'] <= lat <= box['max_lat']
        and box['min_lng'] <= lng <= box['max_lng']
    )
    return 1 if inside else 0

# Creating the is_pickup_lower_manhattan and is_dropoff_lower_manhattan flags
dataset1['is_pickup_lower_manhattan'] = dataset1.apply(
    lambda trip: isLowerManhattan(trip['pickup_latitude'], trip['pickup_longitude']),
    axis=1,
)
dataset1['is_dropoff_lower_manhattan'] = dataset1.apply(
    lambda trip: isLowerManhattan(trip['dropoff_latitude'], trip['dropoff_longitude']),
    axis=1,
)

# checking the rides that start or end in Manhattan
starts_in_manhattan = dataset1['pickup_borough'] == 'manhattan'
ends_in_manhattan = dataset1['dropoff_borough'] == 'manhattan'
manhattan = dataset1.loc[starts_in_manhattan | ends_in_manhattan]
manhattan.shape

# plotting a KDE plot for the distribution of pickup fare amount - Manhattan vs Lower Manhattan
# This plot gets its own figure, like the dropoff plot below; without it the
# curves are drawn onto whatever axes happens to be current.
plt.figure(figsize=(8,5))
sns.kdeplot(np.log(manhattan.loc[manhattan['is_pickup_lower_manhattan']==1,'fare_amount'].values),label='Lower Manhattan Pickups')
sns.kdeplot(np.log(manhattan.loc[manhattan['is_pickup_lower_manhattan']==0,'fare_amount'].values),label='Rest of Manhattan Pickups')
plt.xlabel("fare amount (log)")
plt.title("Distribution of pickup Fare Amount - Manhattan vs Lower Manhattan")

# plotting a KDE plot for the distribution of dropoff fare amount - Manhattan vs Lower Manhattan
plt.figure(figsize=(8,5))
sns.kdeplot(np.log(manhattan.loc[manhattan['is_dropoff_lower_manhattan']==1,'fare_amount'].values),label='Lower Manhattan Dropoffs')
sns.kdeplot(np.log(manhattan.loc[manhattan['is_dropoff_lower_manhattan']==0,'fare_amount'].values),label='Rest of Manhattan Dropoffs')
plt.xlabel("fare amount (log)")
plt.title("Distribution of dropoff Fare Amount - Manhattan vs Lower Manhattan")

# Each of the four scatter plots below opens its own figure. plt.scatter
# draws onto the current axes, so without that all four point clouds would
# pile up on one plot and only the last title would remain.

# plotting the scatter for Trip Distance vs Fare Amount (Lower Manhattan pickups)
yaxix = (0, 150)
lower_pickups = manhattan.loc[manhattan['is_pickup_lower_manhattan']==1]
plt.figure()
plt.scatter(x=lower_pickups['distance'].values, y=lower_pickups['fare_amount'].values)
plt.xlabel("Trip Distance")
plt.ylabel("Fare Amount")
plt.title("Trip Distance vs Fare Amount (Lower Manhattan pickups)")
plt.ylim(yaxix)

# plotting the scatter for Trip Distance vs Fare Amount (Rest of Manhattan pickups)
yaxix = (0, 150)
other_pickups = manhattan.loc[manhattan['is_pickup_lower_manhattan']==0]
plt.figure()
plt.scatter(x=other_pickups['distance'].values, y=other_pickups['fare_amount'].values)
plt.xlabel("Trip Distance")
plt.ylabel("Fare Amount")
plt.title("Trip Distance vs Fare Amount (Rest of Manhattan pickups)")
plt.ylim(yaxix)

# plotting the scatter for Trip Distance vs Fare Amount (Lower Manhattan dropoffs)
lower_dropoffs = manhattan.loc[manhattan['is_dropoff_lower_manhattan']==1]
plt.figure()
plt.scatter(x=lower_dropoffs['distance'].values, y=lower_dropoffs['fare_amount'].values)
plt.xlabel("Trip Distance")
plt.ylabel("Fare Amount")
plt.title("Trip Distance vs Fare Amount (Lower Manhattan dropoffs)")

# plotting the scatter for Trip Distance vs Fare Amount (Rest of Manhattan dropoffs)
other_dropoffs = manhattan.loc[manhattan['is_dropoff_lower_manhattan']==0]
plt.figure()
plt.scatter(x=other_dropoffs['distance'].values, y=other_dropoffs['fare_amount'].values)
plt.xlabel("Trip Distance")
plt.ylabel("Fare Amount")
plt.title("Trip Distance vs Fare Amount (Rest of Manhattan dropoffs)")

# creating a bar plot for the number of trips per year
# 'key' is a helper column (a copy of fare_amount) that is counted per group.
dataset1['key'] = dataset1['fare_amount']
trips_year = dataset1.groupby(['year'])['key'].count().reset_index().rename(columns={'key':'Num_Trips'})
trips_year.head()
# sns.barplot draws onto the current axes; give each chart its own figure so
# the two bar plots do not overlay each other.
plt.figure()
sns.barplot(x='year',y='Num_Trips',data=trips_year)

# creating a bar plot for the average fare amount per year
trips_year_fareamount = dataset1.groupby(['year'])['fare_amount'].mean().reset_index().rename(columns={'fare_amount':'avg_fare_amount'})
plt.figure()
sns.barplot(x='year',y='avg_fare_amount',data=trips_year_fareamount).set_title("Avg Fare Amount over Years")

# from here on, keep only rides with a fare of at most 200
dataset1 = dataset1[dataset1['fare_amount']<=200]


# helper that draws, for one grouping column, a bar plot of the number of
# trips next to a bar plot of an aggregated value
def groupandplot(data,groupby_key,value,aggregate='mean'):
    """Plot trip counts and an aggregate of ``value`` per ``groupby_key``.

    Parameters:
        data: DataFrame containing the ``groupby_key`` and ``value`` columns.
        groupby_key: name of the column to group by.
        value: name of the column to aggregate.
        aggregate: aggregation name passed to ``GroupBy.agg`` (default 'mean').

    Draws a new 16x10 figure with two side-by-side bar plots; returns None.
    """
    agg_column = aggregate + '_' + value
    agg_data = (
        data.groupby([groupby_key])[value]
        .agg(aggregate)
        .reset_index()
        .rename(columns={value: agg_column})
    )
    # Count the rows of `data` itself. The counts used to be taken from the
    # global `dataset1` (and its helper 'key' column), which ignored the frame
    # that was actually passed in.
    count_data = data.groupby([groupby_key]).size().reset_index(name='Num_Trips')

    plt.figure(figsize=(16,10))
    plt.subplot(1,2,1)
    sns.barplot(x=groupby_key,y='Num_Trips',data=count_data).set_title("Number of Trips vs "+groupby_key)

    plt.subplot(1,2,2)
    sns.barplot(x=groupby_key,y=agg_column,data=agg_data).set_title(agg_column+" vs "+groupby_key)
  

 # creating a bar plot for num of trips VS weekday   and  fare amount VS weekday
# number of trips and mean fare amount per weekday
groupandplot(dataset1,'weekday','fare_amount')

# number of trips and mean fare amount per hour of the day
groupandplot(dataset1,'hour','fare_amount')

# number of trips and mean fare amount per passenger count
# (rides with more than 6 passengers are filtered out first)
dataset1=dataset1[dataset1['passenger_count']<=6] 
groupandplot(dataset1,'passenger_count','fare_amount')
# the helper 'key' column is only needed for the count plots; drop it only
# after the last groupandplot call so it does not end up in the saved data
dataset1= dataset1.drop(['key'], axis=1)

# saving the cleaned data into train_cleaned.csv (without the index column)
dataset1.to_csv("train_cleaned.csv",index=False)

# Linear Regression Model

# Target is the fare; the raw timestamp is dropped because its calendar
# parts are already present as separate columns.
y = dataset1['fare_amount']
X = dataset1.drop(columns=['fare_amount','pickup_datetime'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# The split returns slices of X; take independent copies so the encoded
# borough columns can be assigned without writing into a view of X.
X_train = X_train.copy()
X_test = X_test.copy()

lr = LinearRegression()

# Encode the two borough columns as integers. Each column gets its own
# encoder, fitted on the training split only and kept in borough_encoders;
# `le` ends up as the dropoff_borough encoder.
borough_encoders = {}
for column in ('pickup_borough', 'dropoff_borough'):
    le = LabelEncoder()
    le.fit(X_train[column].astype(str))
    X_train[column] = le.transform(X_train[column].astype(str))
    X_test[column] = le.transform(X_test[column].astype(str))
    borough_encoders[column] = le

lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
# mean_squared_error takes (y_true, y_pred)
lm_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE for Linear Regression is ",lm_rmse)

def MAPE(y_true,y_pred):
    """Return the mean absolute percentage error as a fraction (not x100).

    Computed as mean(|(y_true - y_pred) / y_true|); both arguments must
    support element-wise arithmetic (NumPy arrays, pandas Series, scalars).
    """
    relative_error = (y_true - y_pred) / y_true
    return np.mean(np.abs(relative_error))
            
def MAE(y_true,y_pred):
    """Return the mean absolute error, mean(|y_true - y_pred|).

    Both arguments must support element-wise arithmetic (NumPy arrays,
    pandas Series, scalars).
    """
    absolute_error = np.abs(y_true - y_pred)
    return np.mean(absolute_error)
    
# Error metrics of the linear regression predictions on the test split.
MAPE(y_test,y_pred)
MAE(y_test,y_pred)
# Fit an ordinary least squares model on the training split with statsmodels
# to get its summary table.
# NOTE(review): no constant column is added to X_train here - confirm whether
# a model without an intercept is intended.
model = sm.OLS(y_train,X_train.astype(float)).fit()
model.summary()

# Decision Tree Model

# a shallow regression tree (depth 2) fitted on the training split
dt = DecisionTreeRegressor(max_depth=2)
dt.fit(X_train,y_train)

dt_pred = dt.predict(X_test)
# mean_squared_error takes (y_true, y_pred)
dt_rmse = np.sqrt(mean_squared_error(y_test, dt_pred))
# the message names the model that was actually evaluated (it used to say
# "Random Forest")
print("RMSE for Decision Tree is ",dt_rmse)

MAPE(y_test,dt_pred)
MAE(y_test,dt_pred)

# Random Forest Model

# 1000 trees with a fixed random_state; n_jobs=-1 is passed through to
# scikit-learn (see its documentation for the parallelism it selects).
rf = RandomForestRegressor(n_estimators = 1000, random_state = 883,n_jobs=-1)
rf.fit(X_train,y_train)

# Predict on the test split and report the root mean squared error.
rf_pred= rf.predict(X_test)
rf_rmse=np.sqrt(mean_squared_error(rf_pred, y_test))
print("RMSE for Random Forest is ",rf_rmse)

# Remaining error metrics for the random forest predictions.
MAPE(y_test,rf_pred)
MAE(y_test,rf_pred)

    