# Model Selection

In [1]:
#import libraries
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, SVR
from sklearn.metrics import accuracy_score

# Fetch Seaborn's bundled iris sample into df and preview its first five rows
df = sns.load_dataset('iris')
df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [2]:
# Print a structural summary of df (index range, columns, non-null counts, dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [17]:
# Separate the target column (species) from the remaining feature columns
y = df['species']
X = df.drop(columns='species')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# RBF-kernel support vector classifier, fitted on the training portion
# (fit() returns the estimator itself, so construction and fitting can be chained)
model = SVC(kernel='rbf', C=1, gamma=1).fit(X_train, y_train)

# Predict the held-out rows and report the fraction classified correctly
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 1.0


In [18]:
# Gaussian naive Bayes classifier; reuses the X_train/X_test/y_train/y_test
# variables already present in the kernel
from sklearn.naive_bayes import GaussianNB

model = GaussianNB().fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 1.0


In [23]:
# k-nearest-neighbours classifier (k = 5); reuses the X_train/X_test/y_train/y_test
# variables already present in the kernel
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=5)
# fit() returns the estimator, so fitting and predicting can be chained
y_pred = model.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 1.0


# Hyperparameter tuning

In [25]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [27]:
# Rebind df to Seaborn's bundled tips sample and preview its first five rows
df = sns.load_dataset('tips')
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [35]:
# Print a structural summary of df (index range, columns, non-null counts, dtypes).
# NOTE(review): the saved output lists sex/smoker/day/time as int64, and this cell's
# execution count (35) is higher than the label-encoding cell's (33), so it appears to
# have been executed after that cell. A fresh top-to-bottom run would presumably report
# different dtypes here -- re-run the notebook in order before sharing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    int64  
 3   smoker      244 non-null    int64  
 4   day         244 non-null    int64  
 5   time        244 non-null    int64  
 6   size        244 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 13.5 KB


In [33]:
# Integer-encode every categorical column of df in place.
from sklearn.preprocessing import LabelEncoder

# A fresh encoder is fitted per column and kept in label_encoders, so the integer
# codes can later be mapped back with label_encoders[col].inverse_transform(...).
# (Re-fitting one shared encoder would retain only the last column's classes.)
# NOTE: this overwrites df, so once the cell has run no 'category' columns remain;
# running it again encodes nothing and leaves label_encoders empty.
label_encoders = {}
for col in df.select_dtypes(include='category').columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [39]:
# Hold out 20% of the rows for testing; `tip` is the target, every other column a feature
from sklearn.model_selection import train_test_split

y = df['tip']
X = df.drop(columns='tip')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Manual tuning

In [42]:
# Candidate regressors: linear regression, support vector regression, k-nearest neighbours
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score

lr, svr, knn = LinearRegression(), SVR(), KNeighborsRegressor()

# 5-fold cross-validation on the training split for each candidate; no scoring
# argument is passed, so cross_val_score falls back to each estimator's own scorer
lr_score, svr_score, knn_score = [
    cross_val_score(estimator, X_train, y_train, cv=5)
    for estimator in (lr, svr, knn)
]

# Report the mean fold score per model
for label, fold_scores in (
    ("LinearRegression:", lr_score),
    ("SVR:", svr_score),
    ("KNN:", knn_score),
):
    print(label, fold_scores.mean())

LinearRegression: 0.3191967183987491
SVR: 0.263354109676803
KNN: 0.22266284311011927


# Automatic method

In [46]:
# Fit several regressors on the tips data, compare them on the held-out split,
# and pickle the one with the lowest test-set MSE.
import pickle

import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
# KNeighborsRegressor is used in `models` below; importing it here removes the
# hidden dependency on an earlier cell having been run first.
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVR

# Reload the dataset (rebinds df, discarding any earlier in-place encoding)
df = sns.load_dataset('tips')

# Integer-encode the categorical columns in place
le = LabelEncoder()
for i in df.columns:
    if df[i].dtype == 'category':
        df[i] = le.fit_transform(df[i])

X = df.drop('tip', axis=1)
y = df['tip']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The two ensemble models are stochastic; seeding them makes their scores (and
# therefore the selected model) repeatable from run to run.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'SVR': SVR(),
    'KNN': KNeighborsRegressor(),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)
}

# Lowest MSE seen so far, and the fitted estimator that achieved it
best_score = float('inf')
best_model = None

# NOTE(review): the winner is picked by its score on the test split, so the reported
# test metrics of the selected model are optimistically biased. Consider selecting
# with cross-validation on the training split and using the test split only for the
# final report.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    print(f'{name} R^2: {r2:.3f}, MSE: {mse:.3f}')

    if mse < best_score:
        best_score = mse
        best_model = model

print(f'Best model: {best_model}')

# Persist the selected, already-fitted estimator to disk
with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

Linear Regression R^2: 0.444, MSE: 0.695
Ridge Regression R^2: 0.446, MSE: 0.693
Lasso Regression R^2: 0.547, MSE: 0.567
SVR R^2: 0.569, MSE: 0.538
KNN R^2: 0.329, MSE: 0.838
Random Forest Regressor R^2: 0.294, MSE: 0.883
Gradient Boosting Regressor R^2: 0.348, MSE: 0.815
Best model: SVR()
