In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Read the raw records into a DataFrame.
df = pd.read_csv(r"/content/final_internship_data.csv")

# The regression target, plus the columns that must not be used as predictors
# (presumably identifiers/names and the raw timestamp -- confirm against the
# dataset schema).
target = 'fare_amount'
non_feature_columns = [target, 'User ID', 'User Name', 'Driver Name', 'pickup_datetime']

# Index.difference is a set difference: labels in the list that are absent
# from the frame are simply ignored rather than raising.
features = df.columns.difference(non_feature_columns)

# View of the candidate predictor columns; split by dtype below.
feature_frame = df[features]

# Numeric branch (int64 / float64 columns): fill missing values with the
# column median, then standardise.
numeric_features = feature_frame.select_dtypes(include=['int64', 'float64']).columns
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch (object columns): replace missing values with the literal
# 'missing', then one-hot encode. handle_unknown='ignore' keeps transform from
# failing on categories that were not present when the encoder was fitted.
categorical_features = feature_frame.select_dtypes(include=['object']).columns
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own branch. Columns matched by neither
# selection are not passed to the model (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Define models to evaluate
# Candidate regressors keyed by the display name used when reporting results.
models = {
    'Linear Regression': LinearRegression(),
    # Seeded so that repeated runs produce the same fitted ensemble; without a
    # fixed random_state the tie-breaking between equally good splits can vary
    # from run to run, making the model comparison non-reproducible.
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'Support Vector Regressor': SVR(),
}

# Feature matrix and target vector taken from the loaded frame.
X, y = df[features], df[target]

# Hold out 20% of the rows for final evaluation; the fixed seed keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train and evaluate models
#
# For every candidate model: wrap it in a pipeline with the shared
# preprocessor, tune it with a 3-fold grid search on the training split, and
# score the refitted best estimator on the held-out test split.
#
# Outputs:
#   results    -- per-model dict with best parameters, cross-validated R2,
#                 test MSE and test R2.
#   best_model -- name of the model with the highest cross-validated R2.
#   best_score -- test-set R2 of that model.
#
# The winner is chosen on the cross-validated score rather than on the test
# score: picking the model by test-set R2 would let the test set both select
# and evaluate the winner, which biases the reported score upwards.
best_model = None
best_score = float('-inf')
best_cv_score = float('-inf')
results = {}

# Hyper-parameter grids keyed by model name. A model without an entry gets an
# empty grid, i.e. it is cross-validated once with its default settings.
param_grids = {
    'Gradient Boosting': {
        'model__n_estimators': [50, 100],
        'model__learning_rate': [0.1, 0.01],
    },
    'Support Vector Regressor': {
        'model__C': [0.1, 1],
        'model__epsilon': [0.1, 0.01],
    },
}

for model_name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)])

    # scoring='r2' matches the regressors' default score and makes explicit
    # that best_score_ below is an R2 value.
    grid_search = GridSearchCV(
        pipeline,
        param_grids.get(model_name, {}),
        cv=3,
        scoring='r2',
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # predict() delegates to the best estimator refitted on the full training
    # split (GridSearchCV's default refit behaviour).
    y_pred = grid_search.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    cv_r2 = grid_search.best_score_

    results[model_name] = {
        'Best Parameters': grid_search.best_params_,
        'CV R2 Score': cv_r2,
        'MSE': mse,
        'R2 Score': r2
    }

    # Select on the cross-validated score; report the test score of the winner.
    if cv_r2 > best_cv_score:
        best_cv_score = cv_r2
        best_score = r2
        best_model = model_name

# Report the tuned parameters and test metrics of every model, one blank line
# after each, followed by the overall winner.
for model_name, result in results.items():
    print(
        f"Model: {model_name}",
        f"Best Parameters: {result['Best Parameters']}",
        f"MSE: {result['MSE']}",
        f"R2 Score: {result['R2 Score']}",
        "",
        sep="\n",
    )

print(f"Best Model: {best_model} with R2 Score: {best_score}")


Model: Linear Regression
Best Parameters: {}
MSE: 84.48742740235423
R2 Score: 0.06921968657983035

Model: Gradient Boosting
Best Parameters: {'model__learning_rate': 0.1, 'model__n_estimators': 100}
MSE: 20.052269199996466
R2 Score: 0.7790883450401018

Model: Support Vector Regressor
Best Parameters: {'model__C': 1, 'model__epsilon': 0.1}
MSE: 96.85202529351916
R2 Score: -0.06699850178617228

Best Model: Gradient Boosting with R2 Score: 0.7790883450401018
