# Project 
### Predict the amount of the Taxi

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import warnings
warnings.filterwarnings("ignore")

In [2]:
sns.set_theme(palette='Set2',
              font='Lucida Calligraphy',
              font_scale=1.0,
             )

import matplotlib
matplotlib.rcParams.update({'font.size': 10})
plt.style.use('dark_background')

In [3]:
df=pd.read_csv("TaxiFare.csv")

In [4]:
df.sample(5)

Unnamed: 0,unique_id,amount,date_time_of_pickup,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
28091,06:00.0,31.33,2013-09-06 15:06:00 UTC,-73.87491,40.774027,-73.974885,40.756812,1
17975,08:00.0,7.7,2010-07-12 08:08:00 UTC,-73.996232,40.738348,-73.971528,40.750785,1
29717,22:21.0,13.5,2012-12-13 01:22:21 UTC,-73.977902,40.762968,-73.993989,40.723203,1
24774,43:00.0,4.5,2009-12-10 12:43:00 UTC,-73.996792,40.742432,-73.996442,40.735553,1
31278,04:37.0,24.04,2015-06-28 13:04:37 UTC,-73.954079,40.770542,-73.917381,40.770882,1


In [5]:
print(df.head())

  unique_id  amount      date_time_of_pickup  longitude_of_pickup  \
0   26:21.0     4.5  2009-06-15 17:26:21 UTC           -73.844311   
1   52:16.0    16.9  2010-01-05 16:52:16 UTC           -74.016048   
2   35:00.0     5.7  2011-08-18 00:35:00 UTC           -73.982738   
3   30:42.0     7.7  2012-04-21 04:30:42 UTC           -73.987130   
4   51:00.0     5.3  2010-03-09 07:51:00 UTC           -73.968095   

   latitude_of_pickup  longitude_of_dropoff  latitude_of_dropoff  \
0           40.721319            -73.841610            40.712278   
1           40.711303            -73.979268            40.782004   
2           40.761270            -73.991242            40.750562   
3           40.733143            -73.991567            40.758092   
4           40.768008            -73.956655            40.783762   

   no_of_passenger  
0                1  
1                1  
2                2  
3                1  
4                1  


In [6]:
print("The number of rows:",df.shape[0])
print("The number of columns:",df.shape[1])

The number of rows: 50000
The number of columns: 8


In [7]:
df.rename(columns={'TaxiFare':'CLV'},inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   unique_id             50000 non-null  object 
 1   amount                50000 non-null  float64
 2   date_time_of_pickup   50000 non-null  object 
 3   longitude_of_pickup   50000 non-null  float64
 4   latitude_of_pickup    50000 non-null  float64
 5   longitude_of_dropoff  50000 non-null  float64
 6   latitude_of_dropoff   50000 non-null  float64
 7   no_of_passenger       50000 non-null  int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 3.1+ MB


In [9]:
numerical_cols = df.select_dtypes(include=["int64","float64"])

In [10]:
numerical_cols.columns

Index(['amount', 'longitude_of_pickup', 'latitude_of_pickup',
       'longitude_of_dropoff', 'latitude_of_dropoff', 'no_of_passenger'],
      dtype='object')

In [None]:
numerical_cols = numerical_cols.drop(["amount"],axis=1)

In [17]:
numerical_cols.describe()

Unnamed: 0,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
count,50000.0,50000.0,50000.0,50000.0
mean,39.933759,-72.504616,39.926251,1.66784
std,6.224857,10.40757,6.014737,1.289195
min,-74.006893,-84.654241,-74.006377,0.0
25%,40.73488,-73.991152,40.734372,1.0
50%,40.752678,-73.980082,40.753372,1.0
75%,40.76736,-73.963584,40.768167,2.0
max,401.083332,40.851027,43.41519,6.0


In [19]:
df.isnull().sum()

unique_id               0
amount                  0
date_time_of_pickup     0
longitude_of_pickup     0
latitude_of_pickup      0
longitude_of_dropoff    0
latitude_of_dropoff     0
no_of_passenger         0
dtype: int64

In [22]:
cat_cols = df.select_dtypes(include="object")

In [24]:
no_col = df[["amount","no_of_passenger"]]

In [25]:
cat_cols = pd.concat([cat_cols,no_col],axis=1)

In [26]:
cat_cols.head()

Unnamed: 0,unique_id,date_time_of_pickup,amount,no_of_passenger
0,26:21.0,2009-06-15 17:26:21 UTC,4.5,1
1,52:16.0,2010-01-05 16:52:16 UTC,16.9,1
2,35:00.0,2011-08-18 00:35:00 UTC,5.7,2
3,30:42.0,2012-04-21 04:30:42 UTC,7.7,1
4,51:00.0,2010-03-09 07:51:00 UTC,5.3,1


In [27]:
cat_cols.drop("amount",axis=1,inplace=True)

In [28]:
cat_cols.columns

Index(['unique_id', 'date_time_of_pickup', 'no_of_passenger'], dtype='object')

In [29]:
for i in cat_cols:
    print("Unique values in",str(i),"is",df[i].nunique())
    print(df[i].value_counts())
    print("-----------------------------------------")

Unique values in unique_id is 3597
unique_id
26:00.0    457
33:00.0    439
11:00.0    436
32:00.0    429
20:00.0    424
          ... 
56:16.0      1
40:52.0      1
53:10.0      1
15:21.0      1
53:22.0      1
Name: count, Length: 3597, dtype: int64
-----------------------------------------
Unique values in date_time_of_pickup is 49555
date_time_of_pickup
2011-09-03 01:30:00 UTC    3
2014-05-30 23:38:00 UTC    3
2012-06-28 20:54:00 UTC    3
2011-02-11 13:19:00 UTC    3
2014-05-30 10:50:00 UTC    2
                          ..
2011-10-07 00:24:25 UTC    1
2015-04-02 06:47:05 UTC    1
2011-01-09 10:09:58 UTC    1
2012-10-27 18:11:00 UTC    1
2010-01-13 08:13:14 UTC    1
Name: count, Length: 49555, dtype: int64
-----------------------------------------
Unique values in no_of_passenger is 7
no_of_passenger
1    34808
2     7386
5     3453
3     2183
4     1016
6      989
0      165
Name: count, dtype: int64
-----------------------------------------


In [32]:
df["no_of_passenger"].value_counts(normalize=True)*100

no_of_passenger
1    69.616
2    14.772
5     6.906
3     4.366
4     2.032
6     1.978
0     0.330
Name: proportion, dtype: float64

In [33]:
df["amount"].value_counts(normalize=True)*100

amount
6.50     4.936
4.50     4.208
8.50     3.858
6.10     2.882
5.30     2.880
         ...  
26.60    0.002
25.39    0.002
55.83    0.002
30.04    0.002
27.05    0.002
Name: proportion, Length: 703, dtype: float64

In [35]:
df["longitude_of_pickup"].value_counts(normalize=True)*100

longitude_of_pickup
 0.000000     1.910
-73.137393    0.036
-73.981405    0.018
-73.980947    0.016
-73.982268    0.016
              ...  
-73.969862    0.002
-73.988610    0.002
-74.000758    0.002
-73.969658    0.002
-73.932603    0.002
Name: proportion, Length: 33716, dtype: float64

In [36]:
cat_cols.columns

Index(['unique_id', 'date_time_of_pickup', 'no_of_passenger'], dtype='object')

In [37]:
df.columns

Index(['unique_id', 'amount', 'date_time_of_pickup', 'longitude_of_pickup',
       'latitude_of_pickup', 'longitude_of_dropoff', 'latitude_of_dropoff',
       'no_of_passenger'],
      dtype='object')

In [None]:
cat_cols.drop("amount",axis=1,inplace=True)

In [41]:
cat_cols

Unnamed: 0,unique_id,date_time_of_pickup,no_of_passenger
0,26:21.0,2009-06-15 17:26:21 UTC,1
1,52:16.0,2010-01-05 16:52:16 UTC,1
2,35:00.0,2011-08-18 00:35:00 UTC,2
3,30:42.0,2012-04-21 04:30:42 UTC,1
4,51:00.0,2010-03-09 07:51:00 UTC,1
...,...,...,...
49995,25:15.0,2013-06-12 23:25:15 UTC,1
49996,19:18.0,2015-06-22 17:19:18 UTC,1
49997,53:00.0,2011-01-30 04:53:00 UTC,1
49998,09:00.0,2012-11-06 07:09:00 UTC,2


In [42]:
numerical_cols.head()

Unnamed: 0,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
0,40.721319,-73.84161,40.712278,1
1,40.711303,-73.979268,40.782004,1
2,40.76127,-73.991242,40.750562,2
3,40.733143,-73.991567,40.758092,1
4,40.768008,-73.956655,40.783762,1


In [43]:
cat_cols.head()

Unnamed: 0,unique_id,date_time_of_pickup,no_of_passenger
0,26:21.0,2009-06-15 17:26:21 UTC,1
1,52:16.0,2010-01-05 16:52:16 UTC,1
2,35:00.0,2011-08-18 00:35:00 UTC,2
3,30:42.0,2012-04-21 04:30:42 UTC,1
4,51:00.0,2010-03-09 07:51:00 UTC,1


### Data Processing

In [44]:
cat_cols.head()

Unnamed: 0,unique_id,date_time_of_pickup,no_of_passenger
0,26:21.0,2009-06-15 17:26:21 UTC,1
1,52:16.0,2010-01-05 16:52:16 UTC,1
2,35:00.0,2011-08-18 00:35:00 UTC,2
3,30:42.0,2012-04-21 04:30:42 UTC,1
4,51:00.0,2010-03-09 07:51:00 UTC,1


In [45]:
catg=pd.get_dummies(cat_cols,drop_first=True)


In [46]:
numerical_cols.head()

Unnamed: 0,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
0,40.721319,-73.84161,40.712278,1
1,40.711303,-73.979268,40.782004,1
2,40.76127,-73.991242,40.750562,2
3,40.733143,-73.991567,40.758092,1
4,40.768008,-73.956655,40.783762,1


In [47]:
dfn = pd.concat([numerical_cols,catg],axis=1)
dfn.head()

Unnamed: 0,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger,no_of_passenger.1,unique_id_00:01.0,unique_id_00:02.0,unique_id_00:03.0,unique_id_00:04.0,unique_id_00:05.0,...,date_time_of_pickup_2015-06-30 06:45:23 UTC,date_time_of_pickup_2015-06-30 08:29:06 UTC,date_time_of_pickup_2015-06-30 10:58:55 UTC,date_time_of_pickup_2015-06-30 15:45:14 UTC,date_time_of_pickup_2015-06-30 16:05:50 UTC,date_time_of_pickup_2015-06-30 17:59:31 UTC,date_time_of_pickup_2015-06-30 19:14:39 UTC,date_time_of_pickup_2015-06-30 19:42:23 UTC,date_time_of_pickup_2015-06-30 20:50:04 UTC,date_time_of_pickup_2015-06-30 22:42:39 UTC
0,40.721319,-73.84161,40.712278,1,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,40.711303,-73.979268,40.782004,1,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,40.76127,-73.991242,40.750562,2,2,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,40.733143,-73.991567,40.758092,1,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,40.768008,-73.956655,40.783762,1,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [48]:
dfn.rename(columns={'CLV':'CLV'},inplace=True)

In [None]:
X=dfn.drop(['CLV'],axis=1)
y=dfn['CLV']

In [51]:
numerical_cols.rename(columns={'CLV':'CLV'},inplace=True)

In [55]:
from sklearn.linear_model import LinearRegression

In [None]:
lr = LinearRegression()
model = lr.fit(X_train,y_train)
print(f'R^2 score for train: {lr.score(X_train, y_train)}')
print(f'R^2 score for test: {lr.score(X_test, y_test)}')

In [None]:
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

In [None]:
print('RMSE:',np.sqrt(mean_squared_error(y_test,y_pred)))
print("MAE",mean_absolute_error(y_test,y_pred))
print('R-squared:',r2_score(y_test,y_pred)) 

In [None]:
q1 = df['fare_amount'].quantile(0.25)
q3 = df['fare_amount'].quantile(0.75)
iqr = q3 - q1
lower_limit = q1 - 1.5 * iqr
upper_limit = q3 + 1.5 * iqr
outliers = df[(df['fare_amount'] < lower_limit) | (df['fare_amount'] > upper_limit)]
print(f'Number of outliers: {len(outliers)}')

### HeatMap

In [None]:
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(df['fare_amount'], kde=True)
plt.title('Distribution of amount')
plt.xlabel('amount')
plt.ylabel('Frequency')
plt.show()


In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='no_of_passenger', y='fare_amount', data=df)
plt.title('Relationship between Fare Amount and Number of Passengers')
plt.xlabel('number ')
plt.ylabel('amount')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='no_of_passenger', y='fare_amount', data=df)
plt.title('Relationship between amount and Number ')
plt.xlabel('Number of Passengers')
plt.ylabel('amount')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='lattitude_of_pickup', y='fare_amount', data=df)
plt.title('Relationship between Fare Amount and Latitude of Pickup')
plt.xlabel('Latitude of Pickup')
plt.ylabel('amount')
plt.show()


In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='longitude_of_dropoff', y='fare_amount', data=df)
plt.title('Relationship between Fare Amount and Longitude of Dropoff')
plt.xlabel('Longitude of Dropoff')
plt.ylabel('Fare Amount')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='lattitude_of_dropoff', y='fare_amount', data=df)
plt.title('Relationship between Fare Amount and Latitude of Dropoff')
plt.xlabel('Latitude of Dropoff')
plt.ylabel('amount')
plt.show()

### Regression Model

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the regression models
models = [LinearRegression(), SVR(), DecisionTreeRegressor(), BaggingRegressor(), AdaBoostRegressor(), RandomForestRegressor()]
model_names = ['Linear Regression', 'Support Vector Regression', 'Decision Tree Regression', 'Bagging Regression', 'AdaBoost Regression', 'Random Forest Regression']


In [None]:
# Train the regression models and evaluate their performance
for model, model_name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'{model_name} Mean Squared Error: {mse}')
    print(f'{model_name} R2 Score: {r2}')
    print('---------------------------------------------')

### Random Forest Regression

In [60]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
rf = RandomForestRegressor(random_state=1)

rf.fit(X_train,y_train)

y_pred=rf.predict(X_test)
print('RMSE:',np.sqrt(mean_squared_error(y_test,y_pred)))
print("MAE",mean_absolute_error(y_test,y_pred))
print('R-squared:',r2_score(y_test,y_pred)) 

### Adaboost Regression

In [65]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
adaboost = AdaBoostRegressor(
    RandomForestRegressor(max_depth=5),
    n_estimators=150
)
adaboost.fit(X_train, y_train)

In [None]:
_pred = adaboost.predict(X_test)
print('RMSE:',np.sqrt(mean_squared_error(y_test,y_pred)))
print("MAE",mean_absolute_error(y_test,y_pred))
print('R-squared:',r2_score(y_test,y_pred))