In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Build the full path of every file below /kaggle/input, then print them one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Dummy regressor used for the first submission.
# DISABLED: everything below is wrapped in a triple-quoted string, so none of it
# runs. The quoted code fits DummyRegressor(strategy='mean') on train.csv and
# predicts on test.csv, storing the predictions in `ypdc`.
'''
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

df = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/train.csv')

X = df.drop(columns=['total_amount'])

y = df['total_amount']
dummy_regressor = DummyRegressor(strategy='mean')  # base line model for taxi fare prediction, we can also use median

dummy_regressor.fit(X,y)

xtest=pd.read_csv("/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/test.csv")
ypdc = dummy_regressor.predict(xtest)
print(ypdc)'''

In [None]:
# First submission file.
# DISABLED: the code is wrapped in a triple-quoted string and does not run.
# The quoted code numbers the rows 1..len(ypdc) as "ID", pairs them with the
# `ypdc` predictions and writes submission.csv; `ypdc` is only defined inside
# the (also disabled) dummy regressor cell.
'''
submission = pd.DataFrame(columns = ["ID","total_amount"])
submission["ID"] = [i for i in range(1,len(ypdc)+1)]
submission["total_amount"] = ypdc
submission.to_csv('submission.csv',index=False)
'''

> **EXPLORATORY DATA ANALYSIS**

In [None]:
# Load both competition files; df_train is reused by the EDA cells that follow.
df_test = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/test.csv')
df_train = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/train.csv')

# Information and related statistics about the test dataset.
# ANSI escape codes make the heading bold.
stats = "\033[1mStats:\033[0m"
print(stats)

# Only the last expression of a cell is rendered automatically, so a bare
# df_test.head() in the middle of the cell would be computed and thrown away.
# display() (provided by the IPython kernel) shows it explicitly.
display(df_test.head())
df_test.info()          # prints its report itself
df_test.describe()      # last expression -> rendered as the cell output
# to be predicted: total_amount

In [None]:
# Information and statistics about the training dataset.
# ANSI escape codes make the heading bold.
stats = "\033[1mStats:\033[0m"
print(stats)

# Only the last expression of a cell is rendered automatically, so a bare
# df_train.head() in the middle of the cell would be computed and thrown away.
# display() (provided by the IPython kernel) shows it explicitly.
display(df_train.head())
df_train.info()          # prints its report itself
df_train.describe()      # last expression -> rendered as the cell output

# target variable: total_amount

In [None]:
#scatter plot for trip distance vs total amount  (we can see certain outliers)
#and payment type vs total amount
import matplotlib.pyplot as plot
import seaborn as sea

# One scatter plot of total_amount against each of the two selected features.
for feature in ('trip_distance', 'payment_type'):
    sea.scatterplot(data=df_train, x=feature, y='total_amount')
    plot.title(f'Scatter Plot ({feature} vs total_amount)')
    plot.show()

In [None]:
# Count plot followed by the raw value counts for each categorical column.

categorical_columns = ['VendorID', 'payment_type', 'store_and_fwd_flag']
separator = '\n' + '=============================================' + '\n'

for column_name in categorical_columns:
    # bar chart of how often each category occurs
    sea.countplot(data=df_train, x=column_name)
    plot.title(f'Count Plot ({column_name})')
    plot.show()

    # the same frequencies as numbers, followed by a separator line
    category_frequencies = df_train[column_name].value_counts()
    print(f'Value counts for {column_name}:')
    print(category_frequencies)
    print(separator)


In [None]:
# Distribution of total_amount: a histogram with a KDE overlay, then the KDE alone.

plot.figure(figsize=(12, 6))                                    # canvas size
hist_axes = sea.histplot(df_train['total_amount'], kde=True)    # kde=True overlays a density estimate line
hist_axes.set_title('Training Data (total_amount)')
hist_axes.set_xlabel('total_amount')
hist_axes.set_ylabel('frequency')
plot.show()

print('v ========== With just KDE ============= v')

kde_axes = sea.kdeplot(df_train['total_amount'])
kde_axes.set_title('Kernel Density Estimate (total_amount)')
kde_axes.set_xlabel('total_amount')
kde_axes.set_ylabel('density')
plot.show()

In [None]:
# Box plot of total_amount per passenger_count: shows the spread within each
# group and makes outliers visible.

box_axes = sea.boxplot(data=df_train, x='passenger_count', y='total_amount')
box_axes.set_title('Box Plot (total_amount vs passenger_count)')
plot.show()

In [None]:
# Pair plot for passenger_count, trip_distance, extra, tip_amount, total_amount.
pairplot_columns = ['passenger_count', 'trip_distance', 'extra', 'tip_amount', 'total_amount']

# Random sample for quicker plotting. DataFrame.sample raises a ValueError when
# n exceeds the number of rows (replace=False), so the size is capped at len(df_train).
sample_df_train = df_train.sample(n=min(1000, len(df_train)), random_state=42)

sea.pairplot(sample_df_train[pairplot_columns])
# y is the vertical title position and must be a number; it was previously
# passed as the string '1.02'. 1.02 places the title just above the grid.
plot.suptitle('Pair Plot of Selected Variables', y=1.02)
plot.show()

In [None]:
# Correlation matrix over the numerical columns only; the coefficients indicate
# the strength and direction of the linear relation between two variables.
numeric_train = df_train.select_dtypes(include=['number'])
correlation_matrix = numeric_train.corr()

plot.figure(figsize=(12, 10))
# annot=True writes the coefficient into each cell, fmt limits it to two
# decimals, and 'coolwarm' sets the color range
heatmap_axes = sea.heatmap(correlation_matrix, cmap='coolwarm', annot=True, fmt=".2f")
heatmap_axes.set_title('Correlation Heatmap')
plot.show()

> **MODELS**

In this competition, I have chosen three models to work with (if these three do not fit satisfactorily, I may add more):
1. Linear Regression Model
2. Gradient Boosting Regression Model
3. K-Nearest Neighbours (KNN) Regression Model

First, I tune the hyperparameters of both the linear regression and gradient boosting models so that they can work optimally. (I chose to tune the KNN hyperparameters after I had run both of the initial models.)
As to why I chose these models:

1) Linear Regression Model

I wanted to start with a simple baseline model and pick up the pace from there. Linear regression works well as my first model, since the relationship between the input variables and the output is expressed as a linear equation, which is simple to understand. It performs particularly well when the relationship between the independent and dependent variables is approximately linear. I can compare this simple benchmark with the more complex models I plan to use later on. This model is also efficient and can be trained quickly, even on large datasets.

2) Gradient Boosting Regression Model

This model is relatively complex, and therefore it can model complex, non-linear data. It is capable of capturing intricate patterns. It is also robust to outliers in the data. This model also allows for fine-grained hyperparameter tuning, which can lead to more accurate predictions: parameters such as the learning rate and the number of estimators can be adjusted to achieve the desired balance. Despite its advantages, it may require careful hyperparameter tuning and can be computationally more expensive than simpler models like linear regression.

3) KNN Regression Model

I wanted to try one more model that is relatively simple and easy to implement but, unlike linear regression, does not assume a linear relationship between the input features and the target variable. The localized approach of the KNN model can help identify local patterns and variations in the data. It is suitable for applications where the underlying pattern may evolve, since it makes no assumptions about the distribution of the data.

*Hyperparameters calculation for Linear Regression and Gradient Boosting.*

In [11]:
# DISABLED: the whole cell body is a triple-quoted string, so none of it runs.
# The quoted code grid-searches LinearRegression (fit_intercept) and
# GradientBoostingRegressor (n_estimators, learning_rate) with 3-fold
# GridSearchCV on an 80/20 train/validation split, then prints the validation
# RMSE and a 5-fold cross-validated RMSE of the best estimator.
'''
#hyperparameter for the dataset (I have used linear regression, gradient boosting and KNN regressors, but the
#hyperparameter traning for KNN is done after linear and gradient boosting regressions)

import pandas as pd           #data manipulation and analysis
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor     #used gradient boosting
from sklearn.linear_model import LinearRegression     #used linear regression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error


#loading the training data
df_train = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/train.csv')

#data Preprocessing
num_cols = df_train.select_dtypes(include=['number']).columns        #find all the numerical columns and store them
df_train[num_cols] = df_train[num_cols].fillna(df_train[num_cols].mean())   #fill the missing values with their mean
#convert 'tpep_pickup_datetime' to datetime format:
df_train['tpep_pickup_datetime'] = pd.to_datetime(df_train['tpep_pickup_datetime'])  
#creates new features: 'hour_of_day' and 'day_of_week' based on the pickup time.
df_train['hour_of_day'] = df_train['tpep_pickup_datetime'].dt.hour
df_train['day_of_week'] = df_train['tpep_pickup_datetime'].dt.dayofweek
df_train = pd.get_dummies(df_train, columns=['store_and_fwd_flag', 'payment_type'], drop_first=True)

#here  we split the data into feature x and target y
X = df_train.drop(columns=['total_amount', 'tpep_pickup_datetime', 'tpep_dropoff_datetime'])
y = df_train['total_amount']

#data gets split into training and validation sets using 80-20 split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

#models used (liner and gradient regression)
models = {
    'linear regression': (LinearRegression(), {'fit_intercept': [True, False]}),
    'gradient boosting': (GradientBoostingRegressor(), {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]})
}

# model iteration, perform hyperparameter tuning using gridsearchcv
for model_name, (model, param_grid) in models.items():
    print(f"\nhyperparameter tuning {model_name}...")
    
    grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=3)
    grid_search.fit(X_train, y_train)
    
    # best hyper parameters
    best_params = grid_search.best_params_
    print(f"best hyperparameters: {best_params}")
    
    # validation set evaluation (common metric for regression tasks)
    val_predictions = grid_search.best_estimator_.predict(X_val)
    mse = mean_squared_error(y_val, val_predictions)
    rmse = mse**0.5
    print(f"Root mean squared error (RMSE) on Validation Set: {rmse}")

    # cross validation (5-fold) prints the cross validated RMSE for each model
    c_val_results = cross_val_score(grid_search.best_estimator_, X, y, scoring='neg_mean_squared_error', cv=5)
    c_val_rmse = (-c_val_results.mean())**0.5
    print(f"Cross-validation RMSE: {c_val_rmse}")
    '''


Hyperparameter tuning for linear regression...
Best Hyperparameters: {'fit_intercept': True}
Root Mean Squared Error (RMSE) on Validation Set: 13.444026265491303
Cross-Validation RMSE: 13.358983536025578

Hyperparameter tuning for gradient boosting...
Best Hyperparameters: {'learning_rate': 0.2, 'n_estimators': 200}
Root Mean Squared Error (RMSE) on Validation Set: 5.535505400161676
Cross-Validation RMSE: 6.25698315049157


hyperparameter tuning linear regression...
best hyperparameters: {'fit_intercept': True}
Root mean squared error (RMSE) on Validation Set: 13.444026265491303
Cross-validation RMSE: 13.358983536025578

hyperparameter tuning gradient boosting...
best hyperparameters: {'learning_rate': 0.2, 'n_estimators': 200}
Root mean squared error (RMSE) on Validation Set: 5.535505400161676
Cross-validation RMSE: 6.25698315049157

***Linear Regression :***

In [18]:
#linear regression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Read the competition training file into a fresh DataFrame.
train_csv_path = '/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/train.csv'
df_train = pd.read_csv(train_csv_path)

In [19]:
# Preprocessing: replace missing values in every numeric column with that column's mean.
num_cols = df_train.select_dtypes(include=['number']).columns
numeric_part = df_train[num_cols]
df_train[num_cols] = numeric_part.fillna(numeric_part.mean())

# Feature engineering (same steps as in the hyperparameter tuning cell):
# parse the pickup timestamp once and derive hour-of-day and day-of-week from it.
pickup_time = pd.to_datetime(df_train['tpep_pickup_datetime'])
df_train['tpep_pickup_datetime'] = pickup_time
df_train['hour_of_day'] = pickup_time.dt.hour
df_train['day_of_week'] = pickup_time.dt.dayofweek

# One-hot encode the categorical columns, dropping the first level of each.
# NOTE: df_train is rebound here, so re-running this cell without reloading the
# CSV fails because the two source columns no longer exist.
categorical_features = ['store_and_fwd_flag', 'payment_type']
df_train = pd.get_dummies(df_train, columns=categorical_features, drop_first=True)

In [20]:
# Target y is total_amount; features X are everything except the target and the
# two raw timestamp columns.
y_train = df_train['total_amount']
non_feature_columns = ['total_amount', 'tpep_pickup_datetime', 'tpep_dropoff_datetime']
X_train = df_train.drop(columns=non_feature_columns)

# Create the linear model and fit it on the full training frame.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the same rows the model was fitted on, so the r2 below is an
# in-sample score, not a validation score.
predictions_train = model.predict(X_train)
r2_train = r2_score(y_train, predictions_train)
print(f"r2 score on training set: {r2_train}")

r2 score on training set: 0.724924576235809


In [21]:
# Score the test file with the fitted model and build the submission frame.

# loading test data
df_test = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/test.csv')

# fill missing numeric values with the column means of the test file itself
# NOTE(review): training used the training-set means; consider reusing those here.
num_cols_test = df_test.select_dtypes(include=['number']).columns
df_test[num_cols_test] = df_test[num_cols_test].fillna(df_test[num_cols_test].mean())

# feature engineering, same steps as for the training data:
# parse the pickup timestamp, derive hour/day features, one-hot encode categoricals
df_test['tpep_pickup_datetime'] = pd.to_datetime(df_test['tpep_pickup_datetime'])
df_test['hour_of_day'] = df_test['tpep_pickup_datetime'].dt.hour
df_test['day_of_week'] = df_test['tpep_pickup_datetime'].dt.dayofweek
df_test = pd.get_dummies(df_test, columns=['store_and_fwd_flag', 'payment_type'], drop_first=True)

# Align the test features with the training columns. Plain
# df_test[X_train.columns] raises a KeyError when a one-hot column created from
# the training data does not exist in the test data (a category that never
# occurs there). reindex selects the same columns in the same order and adds any
# missing one as all zeros, which is the correct encoding for "category absent".
X_test = df_test.reindex(columns=X_train.columns, fill_value=0)

# making predictions on the test set
predictions_test = model.predict(X_test)

# dataframe for the predictions; IDs are the 1-based row numbers
ids_test = pd.Series(range(1, len(predictions_test) + 1))
df_predictions = pd.DataFrame({'ID': ids_test, 'total_amount': predictions_test})

In [22]:
# DISABLED: wrapped in a triple-quoted string so the linear regression
# predictions are not written to submission.csv.
'''
# SAVING TO SUBMISSION.CSV
df_predictions.to_csv('submission.csv', index=False)'''

r2 score on training set: 0.724924576235809 (LINEAR REGRESSION)

Score in the competition: 0.7067

***GRADIENT BOOSTING REGRESSION***

In [1]:
#gradient boosting regression

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

# Read the competition training file into a fresh DataFrame.
train_csv_path = '/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/train.csv'
df_train = pd.read_csv(train_csv_path)



In [2]:
# Preprocessing: replace missing values in every numeric column with that column's mean.
num_cols = df_train.select_dtypes(include=['number']).columns
numeric_part = df_train[num_cols]
df_train[num_cols] = numeric_part.fillna(numeric_part.mean())

# Feature engineering (same steps as in the hyperparameter tuning cell):
# parse the pickup timestamp once and derive hour-of-day and day-of-week from it.
pickup_time = pd.to_datetime(df_train['tpep_pickup_datetime'])
df_train['tpep_pickup_datetime'] = pickup_time
df_train['hour_of_day'] = pickup_time.dt.hour
df_train['day_of_week'] = pickup_time.dt.dayofweek

# One-hot encode the categorical columns, dropping the first level of each.
# NOTE: df_train is rebound here, so re-running this cell without reloading the
# CSV fails because the two source columns no longer exist.
categorical_features = ['store_and_fwd_flag', 'payment_type']
df_train = pd.get_dummies(df_train, columns=categorical_features, drop_first=True)



In [3]:
# Target y is total_amount; features X are everything except the target and the
# two raw timestamp columns.
y_train = df_train['total_amount']
non_feature_columns = ['total_amount', 'tpep_pickup_datetime', 'tpep_dropoff_datetime']
X_train = df_train.drop(columns=non_feature_columns)

# Gradient boosting with the hyperparameters reported by the grid search
# (learning_rate=0.2, n_estimators=200); random_state fixes the seed.
model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.2, random_state=42)
model.fit(X_train, y_train)

# Predict on the same rows the model was fitted on, so the r2 below is an
# in-sample score, not a validation score.
predictions_train = model.predict(X_train)
r2_train = r2_score(y_train, predictions_train)
print(f"r2 score on training set is : {r2_train}")

r2 score on training set is : 0.9577643163570861


In [4]:
# Score the test file with the fitted gradient boosting model and build the
# submission frame.

# loading the test data
df_test = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/test.csv')

# fill missing numeric values with the column means of the test file itself
# NOTE(review): training used the training-set means; consider reusing those here.
num_cols_test = df_test.select_dtypes(include=['number']).columns
df_test[num_cols_test] = df_test[num_cols_test].fillna(df_test[num_cols_test].mean())

# feature engineering, same steps as for the training data:
# parse the pickup timestamp, derive hour/day features, one-hot encode categoricals
df_test['tpep_pickup_datetime'] = pd.to_datetime(df_test['tpep_pickup_datetime'])
df_test['hour_of_day'] = df_test['tpep_pickup_datetime'].dt.hour
df_test['day_of_week'] = df_test['tpep_pickup_datetime'].dt.dayofweek
df_test = pd.get_dummies(df_test, columns=['store_and_fwd_flag', 'payment_type'], drop_first=True)

# Align the test features with the training columns. Plain
# df_test[X_train.columns] raises a KeyError when a one-hot column created from
# the training data does not exist in the test data (a category that never
# occurs there). reindex selects the same columns in the same order and adds any
# missing one as all zeros, which is the correct encoding for "category absent".
X_test = df_test.reindex(columns=X_train.columns, fill_value=0)

# predictions on the test set
predictions_test = model.predict(X_test)

# data frame for predictions; IDs are the 1-based row numbers
test_ids = pd.Series(range(1, len(predictions_test) + 1))
df_predictions_final = pd.DataFrame({'ID': test_ids, 'total_amount': predictions_test})

In [9]:
# Write the gradient boosting predictions as the competition submission file,
# then preview the first rows. The preview was previously written as
# `df_predictions_final.head` without call parentheses, which only looks up the
# method and shows nothing; as the cell's last expression head() is now rendered.
df_predictions_final.to_csv("submission.csv", index=False)
df_predictions_final.head()

r2 score on training set is : 0.9577643163570861 (GRADIENT BOOSTING)

Score in the competition: 0.94461

***HYPERPARAMETER TUNING FOR KNN REGRESSOR***

In [5]:
# DISABLED: the whole cell body is a triple-quoted string, so none of it runs.
# The quoted code grid-searches n_neighbors for KNeighborsRegressor with 5-fold
# GridSearchCV scored by r2, using the same preprocessing as the other models.
# NOTE(review): no feature scaling step appears before the KNN fit; confirm
# that distances on the raw feature scales are intended.
'''#hyperparameter tuning for knn regressor
# importing necessary libraries, including Pandas for data manipulation, 
# scikit-learn for machine learning, and specific modules for K-Nearest Neighbors regression.
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import make_scorer, r2_score

# loading the traning dataset from csv file into pandas dataframe
df_train = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/train.csv')

# data preprocessing
# identifying numeric columns and fill missing values in those columns with their mean.
num_cols = df_train.select_dtypes(include=['number']).columns
df_train[num_cols] = df_train[num_cols].fillna(df_train[num_cols].mean())

# feature engineering
# converting the 'tpep_pickup_datetime' column to datetime format and extract the hour and day of the week as new features.
df_train['tpep_pickup_datetime'] = pd.to_datetime(df_train['tpep_pickup_datetime'])
df_train['hour_of_day'] = df_train['tpep_pickup_datetime'].dt.hour
df_train['day_of_week'] = df_train['tpep_pickup_datetime'].dt.dayofweek

# performing one-hot encoding on categorical variables ('store_and_fwd_flag', 'payment_type').
df_train = pd.get_dummies(df_train, columns=['store_and_fwd_flag', 'payment_type'], drop_first=True)

# splitting the data into features (X_train) and the target variable (y_train).
X_train = df_train.drop(columns=['total_amount', 'tpep_pickup_datetime', 'tpep_dropoff_datetime'])
y_train = df_train['total_amount']

# create a K-Nearest Neighbors regression model.
knn = KNeighborsRegressor()

# define a grid of hyperparameter values to search over. in this case, it's the number of neighbors (n_neighbors) for the KNN model.
param_grid = {'n_neighbors': [1, 3, 5, 7, 10, 15, 20]}  # Adjust the values as needed

# create a GridSearchCV object, specifying the KNN model, the parameter grid, 
# the scoring metric (r-squared in this case), and the number of cross-validation folds.
grid_search = GridSearchCV(knn, param_grid, scoring=make_scorer(r2_score), cv=5)

# fit the GridSearchCV object to the training data. 
# this will perform an exhaustive search over the specified hyperparameter values and evaluate the model performance using cross-validation.
grid_search.fit(X_train, y_train)

# print the best parameters found by grid search
print("best params:", grid_search.best_params_)'''

best params: {'n_neighbors': 3}


best params: {'n_neighbors': 3} (KNN REGRESSOR)

***KNN REGRESSION***

In [10]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Read the competition training file into a fresh DataFrame.
train_csv_path = '/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/train.csv'
df_train = pd.read_csv(train_csv_path)

In [11]:
# Preprocessing: replace missing values in every numeric column with that column's mean.
num_cols = df_train.select_dtypes(include=['number']).columns
numeric_part = df_train[num_cols]
df_train[num_cols] = numeric_part.fillna(numeric_part.mean())

# Feature engineering (same steps as in the hyperparameter tuning cell):
# parse the pickup timestamp once and derive hour-of-day and day-of-week from it.
pickup_time = pd.to_datetime(df_train['tpep_pickup_datetime'])
df_train['tpep_pickup_datetime'] = pickup_time
df_train['hour_of_day'] = pickup_time.dt.hour
df_train['day_of_week'] = pickup_time.dt.dayofweek

# One-hot encode the categorical columns, dropping the first level of each.
# NOTE: df_train is rebound here, so re-running this cell without reloading the
# CSV fails because the two source columns no longer exist.
categorical_features = ['store_and_fwd_flag', 'payment_type']
df_train = pd.get_dummies(df_train, columns=categorical_features, drop_first=True)

In [12]:
# Target y is total_amount; features X are everything except the target and the
# two raw timestamp columns.
y_train = df_train['total_amount']
non_feature_columns = ['total_amount', 'tpep_pickup_datetime', 'tpep_dropoff_datetime']
X_train = df_train.drop(columns=non_feature_columns)

# KNN regressor with n_neighbors=3, the value reported by the grid search.
model = KNeighborsRegressor(n_neighbors=3)
model.fit(X_train, y_train)

# Predict on the same rows the model was fitted on, so the r2 below is an
# in-sample score, not a validation score.
predictions_train = model.predict(X_train)
r2_train = r2_score(y_train, predictions_train)
print(f"r2 score on training set: {r2_train}")

r2 score on training set: 0.9047264234032898


In [13]:
# Score the test file with the fitted KNN model and build the submission frame.

# loading the test data
df_test = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/test.csv')

# fill missing numeric values with the column means of the test file itself
# NOTE(review): training used the training-set means; consider reusing those here.
num_cols_test = df_test.select_dtypes(include=['number']).columns
df_test[num_cols_test] = df_test[num_cols_test].fillna(df_test[num_cols_test].mean())

# feature engineering, same steps as for the training data:
# parse the pickup timestamp, derive hour/day features, one-hot encode categoricals
df_test['tpep_pickup_datetime'] = pd.to_datetime(df_test['tpep_pickup_datetime'])
df_test['hour_of_day'] = df_test['tpep_pickup_datetime'].dt.hour
df_test['day_of_week'] = df_test['tpep_pickup_datetime'].dt.dayofweek
df_test = pd.get_dummies(df_test, columns=['store_and_fwd_flag', 'payment_type'], drop_first=True)

# Align the test features with the training columns. Plain
# df_test[X_train.columns] raises a KeyError when a one-hot column created from
# the training data does not exist in the test data (a category that never
# occurs there). reindex selects the same columns in the same order and adds any
# missing one as all zeros, which is the correct encoding for "category absent".
X_test = df_test.reindex(columns=X_train.columns, fill_value=0)

# predictions on the test set
predictions_test = model.predict(X_test)

# dataframe for predictions; IDs are the 1-based row numbers
test_ids = pd.Series(range(1, len(predictions_test) + 1))
df_predictions = pd.DataFrame({'ID': test_ids, 'total_amount': predictions_test})

In [None]:
# DISABLED: wrapped in a triple-quoted string so the KNN predictions are not
# written to submission.csv.
'''
# SAVING THE FILE TO SUBMISSION.CSV
df_predictions.to_csv('submission.csv', index=False)'''

r2 score on training set: 0.9047264234032898 (KNN REGRESSOR)

Score in competition: 0.77772

**SUMMARY**

Gradient boosting is the best-performing model so far, with moderate performance from KNN regression (but not enough to pass the cutoff) and average performance from linear regression.

Linear regression gets lower r2 scores because it assumes a linear relationship between the features and the target variable, and the true relationship appears to be non-linear. In contrast, both KNN and gradient boosting are flexible and can model non-linear relationships effectively.

Another factor is that linear regression is sensitive to the quality of feature engineering, while the other two models are much better able to adapt.

The KNN r2 score is lower than that of gradient boosting because of the parameter tuning in KNN. KNN is highly dependent on the choice of hyperparameters (the number of neighbours); if that value is not well optimized, it can hurt the model's performance. So there is scope for improvement in generalization.

In gradient boosting, adjusting the hyperparameters helps reduce overfitting. It is also more capable than KNN in terms of robustness to noise and handling complex relationships. It is an ensemble method, i.e. it combines multiple weak learners (usually decision trees) to improve predictive performance compared to individual models, and this can lead to a higher r2 score.