In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.experimental import enable_halving_search_cv, enable_iterative_imputer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV, HalvingGridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer, mean_squared_log_error,mean_absolute_percentage_error,recall_score,precision_score,f1_score,roc_auc_score,confusion_matrix,classification_report
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer,KNNImputer,IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler, Normalizer, Binarizer, LabelEncoder
df=sns.load_dataset('tips')
df


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [18]:
#Using Label Encoder
category_columns = ['sex', 'smoker', 'day', 'time']
label_encoder = LabelEncoder()
for col in category_columns:
    df[col] = label_encoder.fit_transform(df[col])

X=df.drop('tip',axis=1)
y=df['tip']
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,0,2,0,2
1,10.34,1.66,1,0,2,0,3
2,21.01,3.50,1,0,2,0,3
3,23.68,3.31,1,0,2,0,2
4,24.59,3.61,0,0,2,0,4
...,...,...,...,...,...,...,...
239,29.03,5.92,1,0,1,0,3
240,27.18,2.00,0,1,1,0,2
241,22.67,2.00,1,1,1,0,2
242,17.82,1.75,1,0,1,0,2


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
models={
    'Linear Regression':LinearRegression(),
    'Decision Tree':DecisionTreeRegressor(),
    'Random Forest':RandomForestRegressor(),
    'Support Vector Regression':SVR(),
    'Gradient Boosting':GradientBoostingRegressor(),
    'XGBoost':XGBRegressor()
}
results={}
for name,model in models.items():
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    #printing all the metrics for regression
    results[name] = {
        'MSE':mean_squared_error(y_test,y_pred),
        'MAE':mean_absolute_error(y_test,y_pred),
        'R2':r2_score(y_test,y_pred),
        'MAPE':mean_absolute_percentage_error(y_test,y_pred),
        
        
    }

# Sorting the results based on R2 score
sorted_results = dict(sorted(results.items(), key=lambda x: x[1]['R2'], reverse=True))

# Displaying the sorted results
for name, metrics in sorted_results.items():
    print(f"{name}:")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")
    print()

Support Vector Regression:
  MSE: 0.5383
  MAE: 0.5707
  R2: 0.5693
  MAPE: 0.2440

Linear Regression:
  MSE: 0.6948
  MAE: 0.6704
  R2: 0.4441
  MAPE: 0.2786

XGBoost:
  MSE: 0.7389
  MAE: 0.6722
  R2: 0.4088
  MAPE: 0.2783

Gradient Boosting:
  MSE: 0.8094
  MAE: 0.7310
  R2: 0.3524
  MAPE: 0.3047

Random Forest:
  MSE: 0.9627
  MAE: 0.7699
  R2: 0.2298
  MAPE: 0.3193

Decision Tree:
  MSE: 1.2375
  MAE: 0.8606
  R2: 0.0100
  MAPE: 0.3583



In [20]:
df=sns.load_dataset('diamonds')
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [21]:
#Chaning categoric to numeric
category_columns = ['cut', 'color', 'clarity']
label_encoder = LabelEncoder()
for col in category_columns:
    df[col] = label_encoder.fit_transform(df[col])
X=df.drop('price',axis=1)
y=df['price']
df   

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,2,1,3,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,1,4,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,5,5,62.4,58.0,334,4.20,4.23,2.63
4,0.31,1,6,3,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,2,0,2,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,1,0,2,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,4,0,2,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,3,4,3,61.0,58.0,2757,6.15,6.12,3.74


In [27]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



#Hyperparameter tuning and best model selection

models = {
    'Linear Regression': (
        LinearRegression(n_jobs=-3), 
        {}
    ),
    'Decision Tree': (
        DecisionTreeRegressor(), 
        {
            'max_depth': range(1, 10),
            'min_samples_split': range(2, 10),
        }
    ),
    'Random Forest': (
        RandomForestRegressor(n_jobs=-3), 
        {
            'n_estimators': range(10, 100, 10),
            'max_depth': range(1, 10),
            'min_samples_split': range(2, 10)
        }
    ),
    'KNeighbors': (
        KNeighborsRegressor(), 
        {
            'n_neighbors': range(1, 20),
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
        }
    ),
    'Gradient Boosting': (
        GradientBoostingRegressor(), 
        {
            'n_estimators': range(10, 100, 10),
            'learning_rate': [0.1, 0.01, 0.001]
        }
    ),
    'XGBoost': (
        XGBRegressor(n_jobs=-3),
        {
            'n_estimators': range(10, 100, 10),
            'learning_rate': [0.1, 0.01, 0.001],
            'max_depth': range(1, 10)
        }
    )
}

In [25]:
for name, (model, param_grid) in models.items():
    model.fit(X_train, y_train)  # ✅ this works now
    y_pred = model.predict(X_test)

    results[name] = {
        'MSE': mean_squared_error(y_test, y_pred),
        'MAE': mean_absolute_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred),
        'MAPE': mean_absolute_percentage_error(y_test, y_pred),
    }

# Sorting the results based on R2 score
sorted_results = dict(sorted(results.items(), key=lambda x: x[1]['MSE'], reverse=True))
# Displaying the sorted results
for name, metrics in sorted_results.items():
    print(f"{name}:")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")
    print()


Linear Regression:
  MSE: 1825912.9915
  MAE: 858.7085
  R2: 0.8851
  MAPE: 0.3887

KNeighbors:
  MSE: 795139.3988
  MAE: 480.7688
  R2: 0.9500
  MAPE: 0.1368

Decision Tree:
  MSE: 517333.3253
  MAE: 351.4736
  R2: 0.9675
  MAPE: 0.0840

Gradient Boosting:
  MSE: 430010.6508
  MAE: 364.9068
  R2: 0.9729
  MAPE: 0.1165

XGBoost:
  MSE: 297651.2769
  MAE: 277.9413
  R2: 0.9813
  MAPE: 0.0726

Random Forest:
  MSE: 294995.0554
  MAE: 269.0502
  R2: 0.9814
  MAPE: 0.0642

