In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

## **<span style = 'color:green'>3. Load the dataset</span>**<a id ="Data"></a>
[<div style="text-align: right"> Back to Table of contents</div>](#Table)

In [3]:
# Column names used throughout the notebook; they are applied by position, so the
# CSV is expected to contain exactly these eight columns in this order.
df_cols = [
    'key',
    'fare_amount',
    'pickup_datetime',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
]
df_raw = pd.read_csv('./TaxiFare.csv')
df_raw.columns = df_cols
# Preview the renamed frame.
df_raw.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,26:21.0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,52:16.0,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,35:00.0,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,30:42.0,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,51:00.0,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [4]:
# Hold out 20% of the raw rows as df_test (fixed seed) and give both parts a fresh 0..n-1 index.
df, df_test = (
    part.reset_index(drop=True)
    for part in train_test_split(df_raw, test_size=0.2, random_state=42)
)

## **<span style = 'color:green'>4. Data Wrangling</span>**<a name ='Wrangling'></a>
[<div style="text-align: right"> Back to Table of contents</div>](#Table)
### **<span style = 'color:brown'>File structure and content</span>**

In [5]:
df.shape

(40000, 8)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   key                40000 non-null  object 
 1   fare_amount        40000 non-null  float64
 2   pickup_datetime    40000 non-null  object 
 3   pickup_longitude   40000 non-null  float64
 4   pickup_latitude    40000 non-null  float64
 5   dropoff_longitude  40000 non-null  float64
 6   dropoff_latitude   40000 non-null  float64
 7   passenger_count    40000 non-null  int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 2.4+ MB


In [7]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0
mean,11.376807,-72.506292,39.933618,-72.496197,39.920941,1.6697
std,9.69634,10.40465,6.28011,10.435716,6.028609,1.291044
min,-5.0,-75.423848,-74.006893,-84.654241,-74.006377,0.0
25%,6.0,-73.99206,40.734843,-73.991172,40.734309,1.0
50%,8.5,-73.981848,40.752707,-73.980055,40.753459,1.0
75%,12.5,-73.967092,40.767421,-73.963501,40.76825,2.0
max,200.0,40.783472,401.083332,40.802437,43.41519,6.0


### **<span style = 'color:brown'>Checking for Missing Values</span>**

In [8]:
df.isnull().sum()

key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [9]:
df_test.isnull().sum()

key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [10]:
df.nunique()

key                   3589
fare_amount            649
pickup_datetime      39704
pickup_longitude     28717
pickup_latitude      30737
dropoff_longitude    29608
dropoff_latitude     31410
passenger_count          7
dtype: int64

### **<span style="font-family: Segoe UI; font-size:1.0em;color:brown;">Checking for duplicates</span>**

In [11]:
df.duplicated().sum()

0

In [12]:
# Drop any row that contains a missing value, then show the per-column null count.
df = df.dropna(axis=0)
# DataFrame.sum() is used instead of np.sum(df): the NumPy call forwards axis=None,
# which newer pandas versions reduce over both axes (a scalar) instead of per column.
df.isnull().sum()

key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [13]:
# Negative fares are invalid: replace them with a nominal fare of 0.1.
# A single .loc assignment is used because chained indexing (df[col][mask] = value)
# can write to a temporary copy and leave df unchanged.
# NOTE(review): only df is corrected in this cell; df_test is not touched here.
negative_fare = df['fare_amount'] < 0
df.loc[negative_fare, 'fare_amount'] = 0.1
# Sanity check: no negative fares should remain (expect an empty frame).
df[df['fare_amount'] < 0]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count


## **<span style = 'color:green'>5. Feature Engineering</span>**<a id ='Engineering'></a>
[<div style="text-align: right"> Back to Table of contents</div>](#Table)

There are a variety of features within the dataset, and it is important to convert them into the right format so that we can analyse them easily. This includes converting datetime features and string features. As we can only feed numeric features as input to our models, our next task is to convert the features into numeric form. Feature engineering is the process of extracting information from the existing data in order to improve the performance of the model. Feature engineering is subdivided into two parts: feature preprocessing and feature generation.

### **<span style="font-family: Segoe UI; font-size:1.0em;color:brown;">5.2 Feature Preprocessing</span>**
Feature preprocessing implies updating or transforming existing data.
#### **<span style="font-family: Segoe UI; font-size:1.0em;color:magenta;">Feature Transformation of pickup_datetime data type  to datetime</span>**
By Default all datetime based columns are considered as strings in pandas. Convert string date to datetime features.

In [14]:
# Parse the pickup timestamp strings into datetime values in both the train and hold-out frames.
for frame in (df, df_test):
    frame['pickup_datetime'] = pd.to_datetime(frame['pickup_datetime'])

### **<span style="font-family: Segoe UI; font-size:1.0em;color:brown;">5.1 Feature Generation</span>**<a id ='Feature Generation'></a>
[<div style="text-align: right"> Back to Table of contents</div>](#Table)
Feature generation involves creating new features from the existing data. New variables can be created as follows:

#### **<span style="font-family: Segoe UI; font-size:1.0em;color:magenta;">Datetime features</span>**
1. Pickup hour - from pickup_datetime
2. Pickup week day name
3. Pickup date
4. Pickup month
5. Pickup day of week in numbers.

In [15]:
def _add_pickup_datetime_features(frame):
    """Add calendar columns derived from ``frame['pickup_datetime']``, in place.

    Columns added: pickup_hour (hour of day), pickup_weekday (day name),
    pickup_date (day of month), pickup_month (month number) and
    pickup_day (day of week as a number, Monday = 0).
    """
    pickup = frame['pickup_datetime'].dt
    frame['pickup_hour'] = pickup.hour
    frame['pickup_weekday'] = pickup.day_name()
    frame['pickup_date'] = pickup.day
    frame['pickup_month'] = pickup.month
    frame['pickup_day'] = pickup.dayofweek


# The same features are needed on both the training frame and the hold-out frame.
for _frame in (df, df_test):
    _add_pickup_datetime_features(_frame)

#### **<span style="font-family: Segoe UI; font-size:1.0em;color:magenta;">Base Fare </span>**
[Wikipedia](https://en.wikipedia.org/wiki/Taxis_of_the_United_States#New_York_City) states that, as of June 2006, fares begin at \$2.50 (\$3.00 after 8:00 p.m., and \$3.50 during the peak weekday hours of 4:00 - 8:00 p.m.). The base fare is estimated from these time slabs.  

In [16]:
def baseFare(x):
    """Return the base fare for a pickup hour ``x`` (0-23).

    Hours 16-19 return 3.50, hours 20-23 return 3, and every other value returns 2.50.

    NOTE(review): the accompanying text describes the 3.50 slab as applying to
    peak *weekday* hours only; this function looks at the hour alone.
    """
    if x in range(16, 20):
        return 3.50
    if x in range(20, 24):
        return 3
    return 2.50

# Derive the base fare from the pickup hour for both frames.
for frame in (df, df_test):
    frame['base_fare'] = frame['pickup_hour'].map(baseFare)
# Show the new column next to the hour it was derived from.
df['base_fare'], df['pickup_hour']

(0        3.5
 1        2.5
 2        3.5
 3        2.5
 4        3.5
         ... 
 39995    2.5
 39996    3.0
 39997    3.0
 39998    2.5
 39999    2.5
 Name: base_fare, Length: 40000, dtype: float64,
 0        17
 1         9
 2        16
 3         9
 4        17
          ..
 39995    11
 39996    22
 39997    22
 39998     1
 39999     7
 Name: pickup_hour, Length: 40000, dtype: int64)

In [17]:
# Fare net of the time-of-day base fare (training frame only).
df['fare'] = df['fare_amount'].sub(df['base_fare'])

#### **<span style="font-family: Segoe UI; font-size:1.0em;color:magenta;">Haversine Distance</span>**
Haversine distance: To calculate the distance (km) between pickup and dropoff points. Difference between pickup and dropoff points will give an idea about the distances covered which should be the most predictive feature for taxi fare.  The haversine formula determines the great-circle distance between two points on a sphere given their longitudes and latitudes.

The haversine formula approximates the great-circle distance between two points on a sphere given their longitudes, latitudes and the sphere’s radius. The sphere we are interested in here is the Earth – which is not a perfect sphere, but close enough for the approximations that we are interested in. Important in navigation, it is a special case of a more general formula in spherical trigonometry, the law of haversines, that relates the sides and angles of spherical triangles.
Source: [Haversine formula](https://en.wikipedia.org/wiki/Haversine_formula)

Haversine distance can be found using the geopy library, the scikit-learn library, or by implementing the Haversine formula in a custom function. All three methods are described below.  

**Calculating the Haversine distance using geopy**

In [18]:
from geopy.distance import great_circle
# Great-circle distance for the trip at index label 0, as (latitude, longitude) pairs.
first_trip = df.loc[0]
coordA = (first_trip['pickup_latitude'], first_trip['pickup_longitude'])
coordB = (first_trip['dropoff_latitude'], first_trip['dropoff_longitude'])
print (int(great_circle(coordA, coordB).kilometers))

2


**Calculating the Haversine distance by defining a custom made function**

Another method is by defining a function to implement the Haversine formula in Python. Latitude and longitude need to be in radians for calculation.
In the function “haversineDistanceInKM”, first the decimal degrees are converted to radians. The return statement is a somewhat compressed version of the haversine formula implemented in python. “12734” is an approximate diameter of the earth in kilometers.

Average Earth radius = 6371 km

In [19]:
#Method 1: haversineDistanceInKM
from math import radians, cos, sin, asin, sqrt
def haversineDistanceInKM(latA, lonA, latB, lonB):
    """Return the haversine distance between two points, truncated to whole kilometres.

    Args:
        latA, lonA: latitude and longitude of the first point, in decimal degrees.
        latB, lonB: latitude and longitude of the second point, in decimal degrees.

    Returns:
        int: the great-circle distance in km, truncated (not rounded) by ``int()``.
    """
    # Diameter derived from the mean Earth radius (2 * 6371 = 12742 km) so that this
    # helper agrees with haversine_distance, which uses the same 6371 km radius.
    EARTH_DIAMETER_KM = 2 * 6371
    lonA, latA, lonB, latB = map(radians, [lonA, latA, lonB, latB])
    hav = sin((latB - latA) / 2) ** 2 + cos(latA) * cos(latB) * sin((lonB - lonA) / 2) ** 2
    # Floating-point rounding can push hav marginally outside [0, 1]; clamp it so
    # sqrt() and asin() stay inside their domains.
    hav = min(1.0, max(0.0, hav))
    return int(EARTH_DIAMETER_KM * asin(sqrt(hav)))


# Pickup and drop-off coordinates of the trip at index label 0.
latA, lonA = df['pickup_latitude'][0], df['pickup_longitude'][0]
latB, lonB = df['dropoff_latitude'][0], df['dropoff_longitude'][0]
print(haversineDistanceInKM(latA, lonA, latB, lonB))

2


In [20]:
#Method 2: haversine_distance
def haversine_distance(lat1, lng1, lat2, lng2):
    """Haversine (great-circle) distance in km between two sets of coordinates.

    Inputs are latitudes/longitudes in decimal degrees and may be scalars or
    NumPy arrays; every operation is a NumPy ufunc, so arrays are processed
    element-wise. A mean Earth radius of 6371 km is used.
    """
    AVG_EARTH_RADIUS = 6371  # in km
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lng1, lat2, lng2))
    half_dphi = (phi2 - phi1) * 0.5
    half_dlam = (lam2 - lam1) * 0.5
    hav = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    return 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(hav))

# Add the pickup-to-dropoff haversine distance (km) to both frames.
for frame in (df, df_test):
    frame['haversine_distance'] = haversine_distance(
        frame['pickup_latitude'].to_numpy(),
        frame['pickup_longitude'].to_numpy(),
        frame['dropoff_latitude'].to_numpy(),
        frame['dropoff_longitude'].to_numpy(),
    )

In [21]:
df['haversine_distance'].median(), df['haversine_distance'].mean(),

(2.1173775621133846, 18.44755651118968)

In [22]:
df.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_hour,pickup_weekday,pickup_date,pickup_month,pickup_day,base_fare,fare,haversine_distance
0,04:00.0,10.0,2013-07-27 17:04:00+00:00,-73.974332,40.791427,-73.979032,40.766365,5,17,Saturday,27,7,5,3.5,6.5,2.814726
1,26:00.0,4.0,2013-01-08 09:26:00+00:00,-73.973657,40.751632,-73.969945,40.756702,5,9,Tuesday,8,1,1,2.5,1.5,0.644659
2,45:00.0,6.9,2012-03-17 16:45:00+00:00,-73.975263,40.75228,-73.995098,40.7375,4,16,Saturday,17,3,5,3.5,3.4,2.343742
3,01:17.0,7.7,2012-06-08 09:01:17+00:00,-73.983034,40.766784,-73.971944,40.789289,1,9,Friday,8,6,4,2.5,5.2,2.670991
4,30:49.0,4.5,2015-06-22 17:30:49+00:00,-73.986717,40.771648,-73.98214,40.770699,1,17,Monday,22,6,0,3.5,1.0,0.399692


**Calculating the Haversine Distance using sklearn.neighbors**

In [None]:
import sklearn.neighbors
# DistanceMetric is exposed from sklearn.metrics in newer scikit-learn releases and
# the sklearn.neighbors alias is no longer available there; fall back for old versions.
try:
    from sklearn.metrics import DistanceMetric
except ImportError:
    from sklearn.neighbors import DistanceMetric

dist = DistanceMetric.get_metric('haversine')
# The haversine metric takes (latitude, longitude) pairs in radians.
pickup_rad = np.radians(df[['pickup_latitude', 'pickup_longitude']])
dropoff_rad = np.radians(df[['dropoff_latitude', 'dropoff_longitude']])
# NOTE(review): pairwise() builds a len(df) x len(df) matrix although only the
# pickup/drop-off pair of each row is of interest - very memory hungry on the full frame.
# Computed once on the unit sphere and scaled per unit instead of running pairwise() twice.
unit_sphere = dist.pairwise(pickup_rad, dropoff_rad)
# Note that 3959 is the radius of the earth in miles
dist_miles = unit_sphere * 3959
dist_km = unit_sphere * 6371
del unit_sphere
df_dist_km = pd.DataFrame(dist_km)
df_dist_km.head()

**Calculating the Haversine distance using sklearn.metrics**

In [None]:
from sklearn.metrics.pairwise import haversine_distances
# haversine_distances works on (latitude, longitude) pairs expressed in radians.
pickup_in_radians = np.radians(df[['pickup_latitude', 'pickup_longitude']])
dropoff_in_radians = np.radians(df[['dropoff_latitude', 'dropoff_longitude']])
# NOTE(review): this is an all-pairs matrix (len(df) x len(df)); scaled by 6371 to get km.
pairwise_km = haversine_distances(pickup_in_radians, dropoff_in_radians) * 6371
result = pd.DataFrame(pairwise_km)
result.head()

Extract the values in the cells on the diagonal from top-left to bottom-right of matrix using numpy.matrix

In [None]:
# The diagonal of the pairwise matrix holds each row's own pickup-to-dropoff distance.
mydiagonal = np.diagonal(result.to_numpy())
distance = pd.DataFrame({'distance': mydiagonal}, index=df.index)
distance.head()

## **<span style = 'color:green'>6. Exploratory Data Analysis (EDA)</span>**<a id ='EDA'></a>
[<div style="text-align: right"> Back to Table of contents</div>](#Table)

In [None]:
# Number of pickups by hour, weekday, day of month and month (2x2 grid).
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# (subplot position, column, x-axis label, title, category order)
panels = [
    (221, 'pickup_hour', 'Hour of Day', 'Hourly Variation of Total number of pickups', None),
    (222, 'pickup_weekday', 'Week Day', 'Weekly Variation of Total number of pickups', weekday_order),
    (223, 'pickup_date', 'Date', 'Daily Variation of Total number of pickups', None),
    (224, 'pickup_month', 'Month', 'Monthly Variation of Total number of pickups', None),
]

plt.figure(figsize=(22, 6))
for position, column, x_label, title, order in panels:
    plt.subplot(position)
    # The column is passed as x=: newer seaborn releases treat the first positional
    # argument as `data`, so a positional Series is no longer read as the x variable.
    sb.countplot(x=df[column], order=order)
    plt.xlabel(x_label)
    plt.ylabel('Total number of pickups')
    plt.title(title)
# Stop the titles of the bottom row from overlapping the x labels of the top row.
plt.tight_layout();

**Observations:**
* Even though, contrary to expectation, the number of pickups is much lower during the morning peak hours, it is highest in the late evening, as expected.
* The number of pickups on Sundays and Mondays is much lower than on other days, with a peak on Saturday.
* The daily (day-of-month) plot shows significant variation towards the end of the month.
* Annual variation is visible, with the first half of the year showing more trips than the second half.


In [None]:
# Passenger-count distribution: frequency bars (left) and a box plot with the mean marked (right).
plt.figure(figsize=(22, 6))

# Passenger Count
plt.subplot(121)
# Passed as x=: newer seaborn releases treat the first positional argument as `data`.
sb.countplot(x=df['passenger_count'])
plt.xlabel('Passenger Count')
plt.ylabel('Frequency')
plt.title('Frequency Distribution of Passenger Count')

plt.subplot(122)
sb.boxplot(x=df['passenger_count'], color = 'cyan', showmeans=True,
           meanprops={"marker":"o", "markerfacecolor":"Red",
                      "markeredgecolor":"black","markersize":"10"}
)
plt.xlabel('Passenger Count')
plt.title('Box plot of Passenger count');

**Findings:**

Most of the trips involve only 1 passenger. There are trips with zero passengers but they are very low in number.

In [None]:
# Fare distribution for each passenger count; the red dot marks the mean of each box.
mean_marker = {"marker":"o", "markerfacecolor":"Red", "markeredgecolor":"black","markersize":"10"}
plt.figure(figsize=(22, 6))
sb.boxplot(data=df, x='passenger_count', y='fare_amount', color='cyan',
           showmeans=True, meanprops=mean_marker)
plt.xlabel('Passenger Count')
plt.title ("Fare amount vs No. of passengers");

The box plot shows no significant variation in the mean fare between passenger counts.

In [None]:
# Passenger-count distribution for each day of the week; the red dot marks the mean.
plt.figure(figsize=(15, 6))
sb.boxplot(x = df['pickup_weekday'], y = df['passenger_count'],
           order = ['Monday', 'Tuesday', 'Wednesday',
                    'Thursday', 'Friday', 'Saturday', 'Sunday'],
           color = 'cyan', showmeans=True,
           meanprops={"marker":"o", "markerfacecolor":"Red", "markeredgecolor":"black","markersize":"10"}
)
# The x axis carries the weekday and the y axis the passenger count.
plt.xlabel('Days of week')
plt.ylabel('Passenger Count')
plt.title ("No. of passengers vs Days of week");

In [None]:
# Mean passenger count by hour, month, day of month and weekday (2x2 grid).
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# (subplot position, x column, x-axis label, category order)
panels = [
    (221, 'pickup_hour', 'Hour of Day', None),
    (222, 'pickup_month', 'Month', None),
    (223, 'pickup_date', 'Date', None),
    (224, 'pickup_weekday', 'Days of week', weekday_order),
]

plt.figure(figsize=(22, 8))
for position, column, x_label, order in panels:
    plt.subplot(position)
    # x and y are passed by keyword: newer seaborn releases treat the first
    # positional argument as `data`.
    sb.barplot(x=df[column], y=df['passenger_count'], order=order, palette='hsv')
    plt.xlabel(x_label)
    plt.ylabel('Passenger count')
    plt.title("Passenger count vs " + x_label)
plt.tight_layout();

* Passenger count is more at night from 22:00 to 03:00
* Passenger count is more on weekdays, compared to weekends

In [None]:
# Fare distribution for each day of the week; the red dot marks the mean.
plt.figure(figsize=(15, 6))
sb.boxplot(x = df['pickup_weekday'], y = df['fare_amount'],
           order = ['Monday', 'Tuesday', 'Wednesday',
                    'Thursday', 'Friday', 'Saturday', 'Sunday'],
           palette = 'rainbow', showmeans=True,
           meanprops={"marker":"o", "markerfacecolor":"Red", "markeredgecolor":"black",
                      "markersize":"10"}
)
# The x axis carries the weekday and the y axis the fare.
plt.xlabel('Days of week')
plt.ylabel('Fare amount')
plt.title ("Fare amount vs Days of week");

No significant variation of means between days of week also.

#### **<span style="font-family: Segoe UI; font-size:1.0em;color:magenta;">Dealing with Distance Outliers</span>**

Outliers can be dealt in three different ways as follows:
1. Replace using central tendency (median for continuous values and mode for categorical values)
2. Replace using Whisker values
3. Selecting only those within whisker values.

In [None]:
sb.distplot(df['haversine_distance'], bins = 20);

In [None]:
# Print the median, then let the cell display the summary statistics. Keeping print()
# out of the displayed tuple avoids a stray `None` in the cell output.
print("Median       ", df['haversine_distance'].median())
df['haversine_distance'].describe()

In [None]:
df['haversine_distance'].quantile(0.25), df['haversine_distance'].quantile(0.75)

In [None]:
# Interquartile range of the trip distance (75th minus 25th percentile).
q25, q75 = df['haversine_distance'].quantile([0.25, 0.75])
IQR = q75 - q25
IQR

In [None]:
# Box-plot whiskers: 1.5 * IQR below the first quartile and above the third quartile.
Q1, Q3 = df['haversine_distance'].quantile([0.25, 0.75])
whisker_1 = Q1 - 1.5 * IQR
whisker_2 = Q3 + 1.5 * IQR

whisker_1, whisker_2

Replacing outliers with whisker values is one of the methods of treating outliers. Values at the lower end should be imputed by the lower whisker (i.e., Q1-1.5* IQR) and the values at the upper end should be imputed by the upper whisker (ie., Q3+1.5* IQR). Here selecting only those rows containing  distance values within upper whisker.

In [None]:
# Keep trips that actually moved (distance != 0) and are shorter than 8 km.
# NOTE(review): the cut-off is the literal 8 rather than the computed whisker_2 -
# confirm this is intentional. Only df is filtered; df_test is left as is.
moved = df['haversine_distance'] != 0
below_cutoff = df['haversine_distance'] < 8
df = df.loc[moved & below_cutoff]
df.shape

In [None]:
sb.distplot(df['haversine_distance'], bins = 20)
plt.show()

**Target Exploration with distance**

In [None]:
from scipy import stats
# Fit fare ~ distance and draw the regression line with its equation in the legend.
x = df['haversine_distance']
y = df['fare_amount']
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
# x and y are passed by keyword: newer seaborn releases accept only `data` positionally.
ax = sb.regplot(x=x, y=y, line_kws={'label':"y={0:.1f}x+{1:.1f}".format(slope,intercept),
                                    "color": "red"},scatter_kws={"color": "cyan"})
ax.legend();

In [None]:
#sb.lmplot(x="haversine_distance", y="fare_amount", data=df );
sb.relplot(x="haversine_distance", y="fare_amount", data=df, kind="scatter");

In [None]:
# Mean trip distance by hour, month, day of month and weekday (2x2 grid).
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# (subplot position, x column, x-axis label, category order, palette)
panels = [
    (221, 'pickup_hour', 'Hour of Day', None, 'tab20'),
    (222, 'pickup_month', 'Month', None, 'tab20'),
    (223, 'pickup_date', 'Date', None, 'tab20'),
    (224, 'pickup_weekday', 'Days of week', weekday_order, 'tab10'),
]

plt.figure(figsize=(22, 8))
for position, column, x_label, order, palette in panels:
    plt.subplot(position)
    # x and y are passed by keyword: newer seaborn releases treat the first
    # positional argument as `data`. The bar height is the default estimator (the mean).
    sb.barplot(x=df[column], y=df['haversine_distance'], order=order, palette=palette)
    plt.xlabel(x_label)
    plt.ylabel('Distance in Km')
    plt.title("Distance in Km vs " + x_label)
plt.tight_layout();

Distance travelled is comparatively lower during the daytime than at night.

In [None]:
# Mean fare by hour, month, day of month and weekday (2x2 grid).
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# (subplot position, x column, x-axis label, category order, palette)
panels = [
    (221, 'pickup_hour', 'Hour of Day', None, 'tab20'),
    (222, 'pickup_month', 'Month', None, 'tab20'),
    (223, 'pickup_date', 'Date', None, 'tab20'),
    (224, 'pickup_weekday', 'Days of week', weekday_order, 'tab10'),
]

plt.figure(figsize=(22, 8))
for position, column, x_label, order, palette in panels:
    plt.subplot(position)
    # x and y are passed by keyword: newer seaborn releases treat the first
    # positional argument as `data`.
    sb.barplot(x=df[column], y=df['fare_amount'], order=order, palette=palette)
    plt.xlabel(x_label)
    plt.ylabel('Fare amount')
    plt.title("Fare amount vs " + x_label)
plt.tight_layout();

#### **<span style="font-family: Segoe UI; font-size:1.0em;color:magenta;">Latitude & Longitude</span>**
Lets look at the geospatial or location features to check consistency. They should not vary much as we are only considering trips within New York city.

In [None]:
# Coordinate distributions before outlier removal. Pickups are on the top row and
# drop-offs on the bottom row; latitude (blue) on the left, longitude (red) on the right -
# the same layout as the post-filter figure further down.
coordinate_panels = [
    ('pickup_latitude', 'b', (0, 0)),
    ('pickup_longitude', 'r', (0, 1)),
    ('dropoff_latitude', 'b', (1, 0)),
    ('dropoff_longitude', 'r', (1, 1)),
]
f, axes = plt.subplots(2,2,figsize=(10, 10), sharex=False, sharey = False)
sb.despine(left=True)
for column, colour, (row, col) in coordinate_panels:
    sb.distplot(df[column].values, label = column, color=colour, bins = 100, ax=axes[row, col])
    # `label` only feeds a legend, and no legend is drawn, so title each panel explicitly.
    axes[row, col].set_title(column)
plt.setp(axes, yticks=[])
plt.tight_layout()
plt.show()

**Findings** - (Here, red represents pickup and dropoff longitudes & blue represents pickup & dropoff latitudes)

1. From the plot above it is clear that pickup and dropoff latitudes are centered around 40 to 41, and longitudes around -74 to -73.
2. Some extreme coordinates have squeezed the plot so that we see only a spike here.
3. A good idea is to remove these outliers and look at the distribution more closely.

In [None]:
# Keep only trips whose pickup and drop-off both lie inside the latitude/longitude box
# 40.6 < lat < 40.9 and -74.05 < lon < -73.7 (exclusive bounds).
# NOTE(review): only df is filtered here; df_test keeps its out-of-range coordinates.
inside_box = (
    (df.pickup_latitude > 40.6) & (df.pickup_latitude < 40.9)
    & (df.dropoff_latitude > 40.6) & (df.dropoff_latitude < 40.9)
    & (df.dropoff_longitude > -74.05) & (df.dropoff_longitude < -73.7)
    & (df.pickup_longitude > -74.05) & (df.pickup_longitude < -73.7)
)
df = df.loc[inside_box]
df_data_new = df.copy()

# Re-plot the four coordinate distributions: pickups on the top row, drop-offs on the
# bottom row; latitude (blue) on the left, longitude (red) on the right.
coordinate_panels = [
    ('pickup_latitude', 'b', (0, 0)),
    ('pickup_longitude', 'r', (0, 1)),
    ('dropoff_latitude', 'b', (1, 0)),
    ('dropoff_longitude', 'r', (1, 1)),
]
sb.set(style="white", palette="muted", color_codes=True)
f, axes = plt.subplots(2,2,figsize=(10, 10), sharex=False, sharey = False)
sb.despine(left=True)
for column, colour, (row, col) in coordinate_panels:
    sb.distplot(df_data_new[column].values, label = column, color=colour, bins = 100, ax=axes[row, col])
    # `label` only feeds a legend, and no legend is drawn, so title each panel explicitly.
    axes[row, col].set_title(column)
plt.setp(axes, yticks=[])
plt.tight_layout()

plt.show()

* We now have a much better view of the distribution of coordinates instead of spikes, and we see that most trips are concentrated within this latitude/longitude range, with a few significant clusters.
* These clusters are represented by the numerous peaks in the latitude and longitude histograms.

### **<span style = 'color:brown'>Distribution Plot of Target Variable</span>**

In [None]:
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
# Density estimate of the target variable, rendered with the bokeh backend.
fare_density = hv.Distribution(df['fare_amount']).opts(
    title="Fare Amount Distribution", color="red", xlabel="Fare Amount", ylabel="Density")
fare_density.opts(opts.Distribution(width=700, height=300, tools=['hover'], show_grid=True))

[Multicollinearity](https://www.statology.org/how-to-calculate-vif-in-python/) in regression analysis occurs when two or more explanatory variables are highly correlated with each other, such that they do not provide unique or independent information in the regression model. If the degree of correlation is high enough between variables, it can cause problems when fitting and interpreting the regression model.

One way to detect multicollinearity is by using a metric known as the variance inflation factor (VIF), which measures the correlation and strength of correlation between the explanatory variables in a regression model. The value for VIF starts at 1 with no upper limit.

* A VIF value of 1 indicates no multicollinearity
* VIF values between 1 and 5 indicate moderate multicollinearity, usually not severe enough to require attention
* A VIF value greater than 5 indicates potentially severe multicollinearity


In [None]:
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
# the independent variables set
X =df.drop(['key', 'pickup_datetime','pickup_weekday', 'fare_amount', 'base_fare', 'fare'], axis = 1)

# variance_inflation_factor regresses each column on the remaining columns exactly as
# given, so the design matrix needs its own intercept column. Without it, columns whose
# mean is far from zero (e.g. latitude/longitude) get very large VIFs regardless of
# how correlated they really are.
# NOTE(review): re-check the narrative that follows this cell against the new output.
design = X.assign(const=1.0).to_numpy(dtype=float)

# VIF dataframe: one row per feature; the appended constant column (last) is not reported.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [variance_inflation_factor(design, i) for i in range(X.shape[1])],
})

print(vif_data)

As we can see, pickup_latitude, pickup_longitude, dropoff_longitude and dropoff_latitude have very high values of VIF. A value greater than 5 indicates potentially severe correlation between a given explanatory variable and other explanatory variables in the model. In this case, the coefficient estimates and p-values in the regression output are likely unreliable. Hence, it will be better to drop two of these four variables from the model.

### **<span style = 'color:brown'>Correlation matrix</span>**
Correlation heatmap to check the correlations amongst all features.

In [None]:
# Pairwise correlations of every column except the identifier, timestamp and weekday name.
numeric_df = df.drop(columns=['key', 'pickup_datetime', 'pickup_weekday'])
plt.figure(figsize = (12,6))
sb.heatmap(numeric_df.corr(), cmap ='BuGn', annot = True);

From the correlation heatmap it is clear that the distance, latitude and longitude features have a higher correlation with the target than the other features.

In [None]:
df[['pickup_latitude', 'pickup_longitude', 'dropoff_longitude', 'dropoff_latitude']].corr()

As seen from the above correlation table, dropoff latitude is highly correlated with pickup latitude; the same is true for longitude. So dropoff latitude and longitude are dropped from the analysis, as only one each of latitude and longitude is required.

### **<span style = 'color:brown'>Data scaling & Train Test split</span>**

In [None]:
df.info()

We have all numerical data types in our dataset now. Time to delve into Standardization followed by model building.

In [None]:
# Target: the full fare. Features: everything except identifiers, text/datetime columns,
# fare-derived columns and the drop-off coordinates.
non_feature_cols = ['key', 'pickup_datetime', 'pickup_weekday', 'fare_amount', 'fare',
                    'base_fare', 'dropoff_latitude', 'dropoff_longitude']
y = df['fare_amount']
X = df.drop(columns=non_feature_cols)

Data standardization gives the data zero mean and unit variance.

In [None]:
from sklearn import preprocessing
# Standardise every feature column.
# NOTE(review): the scaler is fitted on all of X before the train/test split in the
# next cell, so test rows contribute to the scaling statistics.
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)
X[0:5]

Keep 30% of the data in the test set and the remaining 70% in the train set.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. A fixed seed (the same one used for the first
# split of the raw data) makes the split, and every score computed from it, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.ndim)
print(y_train.ndim)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

### **<span style = 'color:brown'>Mean Prediction</span>**
Before we go on to try any machine learning model, let us look at the performance of a basic model that just says the mean of fare amount in the train set is the prediction for all the trips in the test set.

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
# Baseline: predict the training-set mean fare for every test trip and report the RMSE.
mean_pred = np.full(len(y_test), y_train.mean())
sqrt(mean_squared_error(y_test, mean_pred))

## **<span style = 'color:green'>7. Model Development</span>**<a name ='Model'></a>
[<div style="text-align: right"> Back to Table of contents</div>](#Table)


The following algorithms are going to be used to build regression models:<br>
* Linear Regressor
* SGDRegressor
* Ridge Regressor
* Lasso Regressor
* ElasticNet Regressor
* KNeighbors Regressor
* Support Vector Regressor
* Decision Tree Regressor
* Extra Trees Regressor
* Isolation Forest Regressor
* Random Forest Regressor
* Bagging Regressor
* AdaBoost Regressor
* Gradient Boosting Regressor
* XGB Regressor
* CatBoost Regressor
* MLPRegressor


In [None]:
pip install -q --upgrade linear-tree

### **<span style = 'color:brown'>Cross Validation(CV)</span>**
Cross Validation(CV) or K-Fold Cross Validation (K-Fold CV) is used to generate multiple (k) train-test sets instead of 1. In a k fold cross-validation, the training data will be divided into k equal parts. In the first step, one part out of the k is set as validation data and the remaining k-1 as train data.  This is repeated k times using a different part out of the k, each time to test the model upon.  K-fold cross validation can essentially help to combat overfitting too.
Different regression models can be evaluated based on the CV scores. The cross_val_score for regression calculates the R squared metric for the applied model. R squared error close to 1 implies a better fit and less error.

#### **<span style="font-family: Segoe UI; font-size:1.0em;color:magenta;">Linear Regression</span>**
[<div style="text-align: right"> Back to Table of contents</div>](#Table)
A linear relationship is one in which increment / decrement in one variable leads to the increment/decrement of the other. A Linear regression model expresses relationship between dependant and independant variables which can be demonstrated by a straight line.   

* Y = mX + C is the equation of a straight line, where, Y is the dependent variable, X is the independent variable,
* C refers to the intercept of the regression line, in other words: the value of Y when X is 0
* m refers to the slope of the regression line, in other words: the value with which Y changes when X increases by 1 unit.

* Cost function of Linear Regression is Mean Squared Error (MSE) or Root Mean Squared Error (RMSE) between the predicted value and the actual value. Lower the cost function better the model.
* Gradient descent is the optimization technique  used by various supervised machine learning algorithms including  Linear regression to minimize the cost function in order to get the best values of m and C. Gradient descent works iteratively calculating error at each term, moving in the direction of lower error by optimizing model parameters until model converges to minimal cost.
* We want gradient descent to reach the global minima of a convex cost function in order to get the optimal model. Random initialization and adjusting learning rate can help us in reaching the global minima of the function to find the best model.

Assumptions of Linear Regression:
* Linear Relationship between dependent and independent variables.
* No correlation of error terms
* Constant variance of error terms
* No correlation among independent variables,
* Errors normally distributed


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Baseline: plain linear regression scored with 5-fold cross-validation
# (cross_val_score falls back to the estimator's own default scorer).
lr = LinearRegression()
lr_fold_scores = cross_val_score(lr, X_train, y_train, cv=5)
lr_fold_scores.mean()

Use model_selection.cross_validate (with return_estimator=True) instead of cross_val_score. It's a lot more flexible so the estimators used for each fold can be accessed

In [None]:
from sklearn.model_selection import cross_validate

# Keep the fitted estimator of every fold so its coefficients can be inspected.
cv_results = cross_validate(lr, X_train, y_train, cv=5, return_estimator=True)
Coefficient = [fold_model.coef_ for fold_model in cv_results['estimator']]

# Columns of df left after these drops are used to label the coefficient columns.
non_feature_cols = ['key', 'pickup_datetime', 'pickup_weekday', 'fare_amount', 'fare', 'base_fare',
                    'dropoff_latitude', 'dropoff_longitude']
df_train = df.drop(non_feature_cols, axis=1)
coefficient = pd.DataFrame(Coefficient, columns=df_train.columns)

# Mean coefficient across folds, ranked by absolute magnitude.
coefficient.mean(axis=0).abs().sort_values(ascending=False)

#### **<span style="font-family: Segoe UI; font-size:1.0em;color:magenta;">Ridge Regression</span>**
[<div style="text-align: right"> Back to Table of contents</div>](#Table)

A regression model that uses L2 regularization technique is called  Ridge Regression. When more features are added to a linear regression model, it can lead to overfitting, where the model performs well on train but not on test dataset. Values of coefficients also get larger. In ridge regression, a regularization term, which is a sum of "squared magnitude" of all coefficients, is added as penalty to the cost function of linear regression algorithm. This decreases the coefficients values significantly and hence the effect of least significant features gets reduced.

Here, if the regularization term lambda is zero, the model reduces to OLS. However, if lambda is very large, it adds too much penalty and leads to under-fitting. It is therefore important how lambda is chosen. With a well-chosen lambda, this technique works very well to avoid over-fitting.

In [None]:
from sklearn.linear_model import RidgeCV

# RidgeCV is configured with cv=5; the score below is computed on the same
# training data the model was fitted on.
ridge = RidgeCV(cv=5)
ridge.fit(X_train, y_train)
ridge.score(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_validate

cv_results = cross_validate(ridge, X_train, y_train, cv=5, return_estimator=True)

# One row of coefficients per fold, one column per feature.
Coefficient = [fold_model.coef_ for fold_model in cv_results['estimator']]
coefficient = pd.DataFrame(Coefficient, columns=df_train.columns)

# Average over folds and rank features by absolute coefficient size.
coefficient.mean(axis=0).abs().sort_values(ascending=False)

#### **<span style="font-family: Segoe UI; font-size:1.0em;color:magenta;">Lasso Regression</span>**
[<div style="text-align: right"> Back to Table of contents</div>](#Table)

Lasso (Least Absolute Shrinkage and Selection Operator) regression is a type of linear regression similar to Ridge regression. In Lasso, L1 regularization, which is the sum of the absolute values of all coefficients, is applied as a penalty to the cost function. In Ridge regression, coefficients shrink, but none of them go all the way down to zero, so interpretation remains difficult because we still have the same number of parameters, only with smaller magnitudes. Compared to that, Lasso can give us fewer parameters, i.e. fewer features, because some of the coefficients are actually dropped from the model. Thus the key difference between these techniques is that Lasso shrinks the coefficients of the less important features to zero, removing some features altogether. This works well for feature selection when we have a huge number of features.

Again, if lambda is zero the model reduces to OLS, whereas a very large value will drive the coefficients to zero and the model will under-fit.

In [None]:
from sklearn.linear_model import LassoCV

# LassoCV configured with cv=5; score is reported on the training data.
lasso = LassoCV(cv=5)
lasso.fit(X_train, y_train)
lasso.score(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_validate

cv_results = cross_validate(lasso, X_train, y_train, cv=5, return_estimator=True)

# Collect the coefficient vector of each fold's fitted Lasso model.
Coefficient = [fold_model.coef_ for fold_model in cv_results['estimator']]
coefficient = pd.DataFrame(Coefficient, columns=df_train.columns)

# Fold-averaged coefficients, largest magnitude first.
coefficient.mean(axis=0).abs().sort_values(ascending=False)

#### **<span style="font-family: Segoe UI; font-size:1.0em;color:magenta;">Elasticnet Regression</span>**
[<div style="text-align: right"> Back to Table of contents</div>](#Table)

This is also a type of linear regression in which both Lasso (L1) and Ridge (L2) regularization parameters combined together are added to the cost function.

In [None]:
from sklearn.linear_model import ElasticNetCV

# ElasticNetCV configured with cv=5; score is reported on the training data.
elastic = ElasticNetCV(cv=5)
elastic.fit(X_train, y_train)
elastic.score(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_validate

cv_results = cross_validate(elastic, X_train, y_train, cv=5, return_estimator=True)

# Collect the coefficient vector of each fold's fitted ElasticNet model.
Coefficient = [fold_model.coef_ for fold_model in cv_results['estimator']]
coefficient = pd.DataFrame(Coefficient, columns=df_train.columns)

# Fold-averaged coefficients, largest magnitude first.
coefficient.mean(axis=0).abs().sort_values(ascending=False)

#### **<span style="font-family: Segoe UI; font-size:1.0em;color:magenta;">Polynomial Regression</span>**
[<div style="text-align: right"> Back to Table of contents</div>](#Table)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Mean 5-fold CV score of a linear model fitted on polynomial expansions of
# degree 1, 2 and 3.
cv_score = []
for degree in (1, 2, 3):
    poly_features = PolynomialFeatures(degree=degree)
    X_poly = poly_features.fit_transform(X_train)
    poly_reg = LinearRegression()
    fold_scores = cross_val_score(poly_reg, X_poly, y_train, cv=5)
    cv_score.append(np.mean(fold_scores))

x = range(1, 4)
plt.scatter(x, cv_score)
plt.xticks(ticks=[1, 2, 3], labels=['Degree_1', 'Degree_2', 'Degree_3']);

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Cross-validate an actual polynomial model: the polynomial expansion and the
# linear fit are wrapped in one pipeline so every fold sees the expanded features.
# cv_score holds the mean CV score for degrees 1..3 (index 0 -> degree 1), so the
# best-scoring degree is argmax + 1.
best_degree = int(np.argmax(cv_score)) + 1
poly_pipeline = make_pipeline(PolynomialFeatures(degree=best_degree), LinearRegression())
cv_results = cross_validate(poly_pipeline, X_train, y_train, cv=5, return_estimator=True)

# The last pipeline step is the fitted LinearRegression of each fold.
Coefficient = [fold_pipeline[-1].coef_ for fold_pipeline in cv_results['estimator']]

# Label the columns with the expanded term names (e.g. "a b", "a^2") rather than
# the raw feature names, since there is one coefficient per polynomial term.
first_expansion = cv_results['estimator'][0][0]
poly_feature_names = first_expansion.get_feature_names_out(df_train.columns)
coefficient = pd.DataFrame(Coefficient, columns=poly_feature_names)
coefficient.mean(axis=0).abs().sort_values(ascending=False)

#### **<span style="font-family: Segoe UI; font-size:1.0em;color:magenta;">K-Nearest Neighbor (KNN)</span>**
[<div style="text-align: right"> Back to Table of contents</div>](#Table)

K-Nearest Neighbor (KNN), also known as a lazy learning algorithm, makes predictions by looking at the nearest neighbors of a point. KNN is a supervised ML algorithm. In the KNN algorithm, "K" is the number of nearest neighbors the model will consider. If it is a classification problem, we assign the mode of the neighbors' classes to the new instance; if it is a regression problem, we assign the mean or median of the target values of the K nearest neighbors to the new, unknown instance.

Determining value of K:
* Choose a range of values for k
* For each value, implement a KNN model
* Calculate error (Or, R_squared value) corresponding to each K value and plot it.  

The elbow curve is used to find the value of k to be used in the model. The value of k where the error is minimum gets selected. The R_squared value increases as the error decreases, so the higher the R2 score, the better the model.

How to calculate the distance?
* Manhattan Distance: Sum of Absolute difference between the two points, across all dimensions.
* Euclidean Distance is the shortest (straight-line) distance between two points. It is the square root of the sum of squared differences between the two points, across all dimensions.
* Minkowski distance is a generalized version of the above distance calculations. It is the p-th root of the sum of the p-th powers of the absolute differences between the two points, across all dimensions.
* Hamming distance is the total number of differences between two strings of identical length.

Hamming distance is used for categorical data whereas Manhattan distance and Euclidean distance are for continuous data.

Manhattan distances can be thought of as the sum of the two shorter sides of a right-angled triangle while Euclidean distances represent the hypotenuse of the triangle. Hence, Manhattan distances are never smaller than Euclidean distances. In the Minkowski equation, when p=2, we get our familiar Euclidean distance (also referred to as the L2-norm or L2 distance). When p equals 1 we get the Manhattan distance. Manhattan distance is also called the L1 norm or L1 distance (the same L1 norm that LASSO uses as its penalty).

As p tends to infinity we get another famous distance, the Chebyshev distance. Manhattan, Euclidean, Chebyshev, and Minkowski distances are part of the scikit-learn DistanceMetric class and can be used to tune regressors / classifiers such as KNN or clustering algorithms such as DBSCAN.

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Mean 5-fold CV score for each neighbour count k = 1..9.
x = range(1, 10)
cv_score = []
for k in x:
    knn = KNeighborsRegressor(n_neighbors=k)
    fold_scores = cross_val_score(knn, X_train, y_train, cv=5)
    cv_score.append(np.mean(fold_scores))

plt.scatter(x, cv_score);

#### **<span style="font-family: Segoe UI; font-size:1.0em;color:magenta;">Decision Tree</span>**
[<div style="text-align: right"> Back to Table of contents</div>](#Table)

A decision tree is a type of machine learning model that is used to predict the value of a target variable. Decision trees are created by splitting the data set into smaller and smaller subsets, in such a way that the resultant nodes are as homogeneous as possible, until (in the extreme case) each subset contains only one data point. So the objective of the decision tree is to have pure nodes which contain 100% of one class and 0% of the other classes. When a sub-node splits into further sub-nodes, it is known as a decision node. The terminal nodes (or leaves) lie at the bottom of the decision tree and have no further nodes coming off them. The distance between the root node and the farthest terminal leaf node is called the depth of the tree.

A decision tree evaluates splits on all available variables and selects the split which results in the most homogeneous sub-nodes. The best split is decided based on Gini impurity, Chi-Square, Entropy / Information Gain or reduction in variance. The first three criteria are used when the target variable is categorical, and reduction in variance is used for a continuous target variable.

Decision tree regression using reduction in variance selects the split with the lowest variance. It calculates the variance of each split as the weighted average variance of its child nodes. A node having high variance is more impure. Since we seek pure nodes after splitting, the split that yields the lowest variance should be selected.

Optimizing parameters:

1. Minimum samples for a node split
    > a. Higher values control overfitting
    
    > b. Too high values can lead to underfitting
2. Minimum samples for a terminal node
    > a. Higher values control overfitting
    
    > b. Too high values can lead to underfitting
3. Maximum depth of tree
    > a. Higher depth can lead to overfitting
    
    > b.Lower depth can lead to underfitting
4. Maximum number of terminal nodes

In [None]:
from sklearn.tree import DecisionTreeRegressor

DT = DecisionTreeRegressor()

# Run cross-validation once and derive both statistics from the same fold
# scores: a second run would double the cost and, because the tree is not
# seeded, could report a mean and a standard deviation from different fits.
dt_fold_scores = cross_val_score(DT, X_train, y_train, cv=5)
R_Squared = np.mean(dt_fold_scores)
Standard_deviation = np.std(dt_fold_scores)
print('R2 of Decision Tree Regression model is:',R_Squared)
print('Standard deviation of R2 of Decision Tree Regression model is:',Standard_deviation)

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()

# Run the (expensive) cross-validation once and derive both statistics from
# the same fold scores; the forest is not seeded, so two separate runs would
# yield a mean and a standard deviation from different fits.
rf_fold_scores = cross_val_score(rf, X_train, y_train, cv=5)
R_Squared = np.mean(rf_fold_scores)
Standard_deviation = np.std(rf_fold_scores)
print('R2 of Random Forest Regression model is:',R_Squared)
print('Standard deviation of R2 of Random Forest Regression model is:',Standard_deviation)

#### **<span style="font-family: Segoe UI; font-size:1.0em;color:magenta;">Gradient boosting Regression</span>**
[<div style="text-align: right"> Back to Table of contents</div>](#Table)
Most machine learning models focus on minimizing the prediction error, thereby improving the predictive power of a single model. Boosting algorithms, however, improve the quality of prediction by training a series of weak models, each compensating for the weaknesses of its predecessors, and then combining the predictions of these models to create a final prediction that is more accurate than the predictions of any individual model.

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Mean 5-fold CV score of a default gradient boosting regressor.
GB = GradientBoostingRegressor()
gb_fold_scores = cross_val_score(GB, X_train, y_train, cv=5)
gb_fold_scores.mean()

Different regression models were evaluated based on their CV scores, and it is observed that Linear, Ridge, CatBoost, LGBM, SGD, Random Forest and Gradient Boosting regression fit the data better than the other methods. From cross-validation it can be concluded that Haversine distance is the most prominent feature in describing the target variable. It is possible to try other functions of this feature to examine whether adding more features by transforming this variable brings about any improvement to the model.

**Feature transformation of Haversine distance**

Feature transformation does change the distribution of the variable. When we have skewed distribution we use transformation to remove the skewness of the variable and hence changing the distribution. Functions like Log, square / cube, Square root / cube root, reciprocals can be used for feature transformation to reduce skewness of the variable.

Log transformation, square root and other nth roots are used for removing right skewness from the data. On the other hand, left skewed data distribution can be transformed using nth power or exponents functions

If we are taking the log of the variable that is negative or zero, it might show an error, as log of zero is undefined. Instead take log(x+c) where c is a constant with the objective that the log of the input must be greater than zero.


In [None]:
# Candidate transformations of the trip distance. log1p(x) == log(x + 1) but is
# numerically more accurate for small x and is defined at x == 0.
df['haversine_distance_log'] = np.log1p(df['haversine_distance'].values)
df['haversine_distance_sqrt'] = np.sqrt(df['haversine_distance'].values)
df['haversine_distance_sq'] = df['haversine_distance'].values**2
df_test['haversine_distance_log'] = np.log1p(df_test['haversine_distance'].values)
df_test['haversine_distance_sqrt'] = np.sqrt(df_test['haversine_distance'].values)

# (column, colour, title) for each panel, in row-major order of the 2x2 grid.
panels = [
    ('haversine_distance', 'b', 'Histogram of distance'),
    ('haversine_distance_log', 'yellow', 'Histogram of log of distance'),
    ('haversine_distance_sqrt', 'magenta', 'Histogram of sqrt of distance'),
    ('haversine_distance_sq', 'green', 'Histogram of square of distance'),
]

f, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=False, sharey=False)
sb.despine(left=True)
for ax, (column, colour, title) in zip(axes.flat, panels):
    # histplot(kde=True, stat='density') is the supported replacement for the
    # deprecated distplot (density-normalised histogram plus KDE curve).
    sb.histplot(df[column], label=column, color=colour, bins=100,
                kde=True, stat='density', ax=ax)
    ax.set_title(title)
plt.setp(axes, yticks=[])
plt.tight_layout()

plt.show()

It is clearly observed how the log and square root transformations reduced the skewness of the distribution, whereas the square transformation increased the right skewness further.

Including 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', back after  a lot of trial and error to get an improved score.

In [None]:
# Columns excluded from the model inputs for the training frame.
train_drop_cols = ['key', 'pickup_datetime', 'pickup_weekday', 'fare', 'fare_amount', 'base_fare',
                   'haversine_distance', 'haversine_distance_sq', 'haversine_distance_sqrt']
# Matching exclusions for the test frame (each label listed once).
test_drop_cols = ['key', 'base_fare', 'pickup_datetime', 'pickup_weekday', 'haversine_distance',
                  'haversine_distance_sqrt']

df_train = df.drop(train_drop_cols, axis=1)
# Select the test features in exactly the training column order so that a
# missing or re-ordered column fails loudly here instead of silently feeding
# misaligned features to the scaler and models downstream.
df_test_copy = df_test.drop(test_drop_cols, axis=1)[df_train.columns]

X = df_train.copy()
y = df['fare_amount']
df_train.columns, df_test_copy.columns

In [None]:
from sklearn.model_selection import cross_validate

# Split first, then fit the scaler on the training rows only. Fitting it on all
# of X would leak the hold-out rows' mean/variance into training and make the
# hold-out metrics optimistic. A fixed random_state keeps the split (and every
# score derived from it) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
test_X = scaler.transform(df_test_copy)

# Per-fold ridge coefficients on the re-engineered feature set.
cv_results = cross_validate(ridge, X_train, y_train, cv=5, return_estimator=True)
Coefficient = [fold_model.coef_ for fold_model in cv_results['estimator']]

coefficient = pd.DataFrame(Coefficient, columns=df_train.columns)
coefficient.mean(axis=0).abs().sort_values(ascending=False)

In [None]:
from sklearn.linear_model import RidgeCV

# Refit RidgeCV (cv=5) on the current training split and report its
# training-set score.
ridge = RidgeCV(cv=5)
ridge.fit(X_train, y_train)
ridge.score(X_train, y_train)

In [None]:
# Preview the first five rows of the test frame, including the added columns.
df_test.head(n=5)

## **<span style = 'color:green'>8. Model Evaluation & Kaggle Submission</span>**<a name ='Report'></a>
[<div style="text-align: right"> Back to Table of contents</div>](#Table)
Define a function to evaluate the model.

In [None]:
def model_train_evaluation(y, ypred, model_name):
    """Print regression metrics for a model and plot actual vs. predicted values.

    Parameters
    ----------
    y : array-like
        Actual target values.
    ypred : array-like
        Predicted values, in the same order as ``y``.
    model_name : str
        Label inserted into every printed report line.

    Returns
    -------
    None
        The report is printed and the scatter plot is shown with ``plt.show()``.
    """
    # Model Evaluation metrics
    from sklearn.metrics import (mean_squared_error, mean_absolute_error,
                                 explained_variance_score, r2_score,
                                 mean_absolute_percentage_error)

    # Compute MSE once; RMSE is its square root. This avoids the `squared=False`
    # keyword, which newer scikit-learn releases deprecate and then remove.
    mse = mean_squared_error(y, ypred)

    print("\n \n Model Evaluation Report: ")
    print('Mean Absolute Error(MAE) of', model_name,':', mean_absolute_error(y, ypred))
    print('Mean Squared Error(MSE) of', model_name,':', mse)
    print('Root Mean Squared Error (RMSE) of', model_name,':', np.sqrt(mse))
    print('Mean absolute percentage error (MAPE) of', model_name,':', mean_absolute_percentage_error(y, ypred))
    print('Explained Variance Score (EVS) of', model_name,':', explained_variance_score(y, ypred))
    # Built-in round() works whether r2_score returns a NumPy scalar or a plain
    # Python float (the latter has no .round() method).
    print('R2 of', model_name,':', round(r2_score(y, ypred), 2))
    print('\n \n')

    # Actual vs Predicted Plot
    f, ax = plt.subplots(figsize=(12,6),dpi=100)
    ax.scatter(y, ypred, label="Actual vs Predicted")
    # x holds the actual values and y the predictions, so label them distinctly.
    ax.set_xlabel('Actual fare amount')
    ax.set_ylabel('Predicted fare amount')
    ax.set_title('Expection vs Prediction')
    # Perfect predictions lie on the line y == x.
    ax.plot(y, y, 'r', label="Perfect Expected Prediction")
    ax.legend()
    f.text(0.95, 0.06, 'AUTHOR: RINI CHRISTY',
         fontsize=12, color='green',
         ha='left', va='bottom', alpha=0.5)
    plt.show()

### **<span style = 'color:brown'>Linear regression evaluation</span>**

In [None]:
from sklearn.linear_model import LinearRegression

# Fit on the training split, then report metrics on the hold-out split.
lr = LinearRegression()
lr.fit(X_train, y_train)

Yhat_lr = lr.predict(X_test)
model_train_evaluation(y_test, Yhat_lr, 'Linear regression Model')

In [None]:
test_pred = lr.predict(test_X)

# Pair every prediction with its key by position. Assigning the df_test['key']
# Series directly would align on index labels and produce NaN keys whenever
# df_test does not carry a default 0..n-1 index.
Submission = pd.DataFrame({'key': df_test['key'].values, 'fare_amount': test_pred})
Submission.head()

### **<span style = 'color:brown'>Ridge regression evaluation</span>**

In [None]:
from sklearn.linear_model import Ridge

# Ridge with default settings: fit on train, evaluate on the hold-out split.
ridge = Ridge()
ridge.fit(X_train, y_train)

Yhat_ridge = ridge.predict(X_test)
model_train_evaluation(y_test, Yhat_ridge, 'Ridge regression Model')

In [None]:
test_pred = ridge.predict(test_X)

# Pair every prediction with its key by position. Assigning the df_test['key']
# Series directly would align on index labels and produce NaN keys whenever
# df_test does not carry a default 0..n-1 index.
Submission = pd.DataFrame({'key': df_test['key'].values, 'fare_amount': test_pred})
Submission.head()

### **<span style = 'color:brown'>Random Forest regression evaluation</span>**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default settings: fit on train, evaluate on the hold-out split.
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

Yhat_rf = rf.predict(X_test)
model_train_evaluation(y_test, Yhat_rf, 'Random Forest Regression Model')

In [None]:
test_pred = rf.predict(test_X)

# Pair every prediction with its key by position. Assigning the df_test['key']
# Series directly would align on index labels and produce NaN keys whenever
# df_test does not carry a default 0..n-1 index.
Submission = pd.DataFrame({'key': df_test['key'].values, 'fare_amount': test_pred})
Submission.head()

### **<span style = 'color:brown'>CatBoost regression evaluation</span>**

In [None]:
from catboost import CatBoostRegressor

# CatBoost regressor with RMSE loss; silent=True suppresses training logs.
Cat = CatBoostRegressor(loss_function='RMSE', learning_rate = 0.1,
                        max_depth = 5,  n_estimators = 100, silent = True)
# Fit once: a second identical fit only repeats the training work.
Cat.fit(X_train, y_train)
Yhat_Cat = Cat.predict(X_test)
# Label the report with the model that actually produced the predictions.
model_train_evaluation(y_test, Yhat_Cat, 'CatBoost Regression Model')

In [None]:
test_pred = Cat.predict(test_X)

# Pair every prediction with its key by position. Assigning the df_test['key']
# Series directly would align on index labels and produce NaN keys whenever
# df_test does not carry a default 0..n-1 index.
Submission = pd.DataFrame({'key': df_test['key'].values, 'fare_amount': test_pred})
Submission.head()

### **<span style = 'color:brown'>SGD regression evaluation</span>**

In [None]:
from sklearn.linear_model import SGDRegressor

# SGD regressor with default settings: fit on train, evaluate on the hold-out split.
SGD = SGDRegressor()
SGD.fit(X_train, y_train)

Yhat_SGD = SGD.predict(X_test)
model_train_evaluation(y_test, Yhat_SGD, 'SGD Regression Model')

In [None]:
test_pred = SGD.predict(test_X)

# Pair every prediction with its key by position. Assigning the df_test['key']
# Series directly would align on index labels and produce NaN keys whenever
# df_test does not carry a default 0..n-1 index.
Submission = pd.DataFrame({'key': df_test['key'].values, 'fare_amount': test_pred})
Submission.head()

### **<span style = 'color:brown'>LGBM regression evaluation</span>**

In [None]:
from lightgbm import LGBMRegressor

# verbose=-1 silences LightGBM's training output. It replaces the `silent`
# constructor argument, which LightGBM deprecated and later removed.
LGBM = LGBMRegressor(boosting_type = 'gbdt', num_leaves = 31,  learning_rate = 0.1,
                     max_depth = 5, n_estimators = 100, verbose = -1)
LGBM.fit(X_train, y_train)
Yhat_LGBM = LGBM.predict(X_test)
model_train_evaluation(y_test, Yhat_LGBM, 'LGBM Regression Model')

In [None]:
test_pred = LGBM.predict(test_X)

# Pair every prediction with its key by position. Assigning the df_test['key']
# Series directly would align on index labels and produce NaN keys whenever
# df_test does not carry a default 0..n-1 index.
Submission = pd.DataFrame({'key': df_test['key'].values, 'fare_amount': test_pred})
Submission.head()

### **<span style = 'color:brown'>GradientBoosting regression evaluation</span>**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default settings: fit on train, evaluate on the hold-out split.
GB = GradientBoostingRegressor()
GB.fit(X_train, y_train)

Yhat_GB = GB.predict(X_test)
model_train_evaluation(y_test, Yhat_GB, 'Gradient Boosting Regression Model')

In [None]:
test_pred = GB.predict(test_X)

# Pair every prediction with its key by position. Assigning the df_test['key']
# Series directly would align on index labels and produce NaN keys whenever
# df_test does not carry a default 0..n-1 index.
Submission = pd.DataFrame({'key': df_test['key'].values, 'fare_amount': test_pred})
Submission.head()

In [None]:
# Write the current Submission frame to disk without the index column.
Submission.to_csv(path_or_buf='Submission.csv', index=False)

**Using fare column instead of fare_amount column**

In [None]:
# Recompute the transformed distance columns on the test frame.
df_test['haversine_distance_log'] = np.log(df_test['haversine_distance'].values + 1)
df_test['haversine_distance_sqrt'] = np.sqrt(df_test['haversine_distance'].values)

train_drop_cols = ['key', 'pickup_datetime', 'pickup_weekday', 'fare', 'fare_amount', 'base_fare',
                   'haversine_distance_sq', 'pickup_hour']
test_drop_cols = ['key', 'base_fare', 'pickup_datetime', 'pickup_weekday', 'pickup_hour']

df_train = df.drop(train_drop_cols, axis=1)
# Select the test features in exactly the training column order so that a
# missing or re-ordered column fails loudly here instead of silently feeding
# misaligned features to the scaler and model.
df_test_copy = df_test.drop(test_drop_cols, axis=1)[df_train.columns]

X = df_train.copy()
# Target is now the 'fare' column instead of 'fare_amount'.
y = df['fare']

# Split first, then fit the scaler on the training rows only, so the hold-out
# rows' statistics do not leak into training. A fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
test_X = scaler.transform(df_test_copy)

In [None]:
from lightgbm import LGBMRegressor

# verbose=-1 silences LightGBM's training output. It replaces the `silent`
# constructor argument, which LightGBM deprecated and later removed.
LGBM = LGBMRegressor(boosting_type = 'gbdt', num_leaves = 31,  learning_rate = 0.1,
                     max_depth = 5, n_estimators = 100, verbose = -1)
LGBM.fit(X_train, y_train)
Yhat_LGBM = LGBM.predict(X_test)
model_train_evaluation(y_test, Yhat_LGBM, 'LGBM Regression Model')

In [None]:
test_pred = LGBM.predict(test_X)

# The model predicts 'fare'; the submitted fare_amount is base_fare + fare.
# Both the addition and the key column are combined by position (.values) so a
# non-default df_test index cannot misalign rows or introduce NaN values.
Submission = pd.DataFrame({
    'key': df_test['key'].values,
    'fare_amount': df_test['base_fare'].values + test_pred,
})
Submission.head()