In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder


In [2]:
# import regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [4]:
df=sns.load_dataset('tips')
df.head()


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [5]:
X=df.drop('tip', axis=1)
y=df['tip']

In [6]:
# Label encode the categorical variables
le=LabelEncoder()
X["day"]=le.fit_transform(X["day"])
X["sex"]=le.fit_transform(X["sex"])
X["time"]=le.fit_transform(X["time"])
X["smoker"]=le.fit_transform(X["smoker"])


In [7]:
# split the data into train and test data with 80% training dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a dictionaries of list of models to evaluate performance
models = { 
          'LinearRegression' : LinearRegression(),
          'SVR' : SVR(),
          'DecisionTreeRegressor' : DecisionTreeRegressor(),
          'RandomForestRegressor' : RandomForestRegressor(),
          'KNeighborsRegressor' : KNeighborsRegressor(),
          'GradientBoostingRegressor' : GradientBoostingRegressor(),
          'XGBRegressor' : XGBRegressor()          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

model_scores = []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)

      # make prediction from each model
    y_pred = model.predict(X_test)
    metric = mean_absolute_error(y_test, y_pred)
    model_scores.append((name, metric))
    
    # # print the performing metric
    # print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    # print(name, 'R2: ', r2_score(y_test, y_pred))
    # print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    # print('\n')
# selecting the best model from all above models with evaluation metrics sorting method
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)
for model in sorted_models:
    print('Mean Absolute error for', f"{model[0]} is {model[1]: .2f}") 

Mean Absolute error for SVR is  0.57
Mean Absolute error for LinearRegression is  0.67
Mean Absolute error for XGBRegressor is  0.67
Mean Absolute error for GradientBoostingRegressor is  0.72
Mean Absolute error for KNeighborsRegressor is  0.73
Mean Absolute error for RandomForestRegressor is  0.76
Mean Absolute error for DecisionTreeRegressor is  0.89


In [8]:
# split the data into train and test data with 80% training dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a dictionaries of list of models to evaluate performance
models = { 
          'LinearRegression' : LinearRegression(),
          'SVR' : SVR(),
          'DecisionTreeRegressor' : DecisionTreeRegressor(),
          'RandomForestRegressor' : RandomForestRegressor(),
          'KNeighborsRegressor' : KNeighborsRegressor(),
          'GradientBoostingRegressor' : GradientBoostingRegressor(),
          'XGBRegressor' : XGBRegressor()          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

model_scores = []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)

      # make prediction from each model
    y_pred = model.predict(X_test)
    metric = r2_score(y_test, y_pred)
    model_scores.append((name, metric))
    
    # # print the performing metric
    # print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    # print(name, 'R2: ', r2_score(y_test, y_pred))
    # print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    # print('\n')
    # selecting the best model from all above models with evaluation metrics sorting method
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)
for model in sorted_models:
    print('R_squared Score', f"{model[0]} is {model[1]: .2f}") 

R_squared Score DecisionTreeRegressor is  0.11
R_squared Score RandomForestRegressor is  0.26
R_squared Score KNeighborsRegressor is  0.33
R_squared Score GradientBoostingRegressor is  0.36
R_squared Score XGBRegressor is  0.41
R_squared Score LinearRegression is  0.44
R_squared Score SVR is  0.57


In [9]:
# split the data into train and test data with 80% training dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a dictionaries of list of models to evaluate performance
models = { 
          'LinearRegression' : LinearRegression(),
          'SVR' : SVR(),
          'DecisionTreeRegressor' : DecisionTreeRegressor(),
          'RandomForestRegressor' : RandomForestRegressor(),
          'KNeighborsRegressor' : KNeighborsRegressor(),
          'GradientBoostingRegressor' : GradientBoostingRegressor(),
          'XGBRegressor' : XGBRegressor()          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

model_scores = []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)

      # make prediction from each model
    y_pred = model.predict(X_test)
    metric = mean_squared_error(y_test, y_pred)
    model_scores.append((name, metric))
    
    # # print the performing metric
    # print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    # print(name, 'R2: ', r2_score(y_test, y_pred))
    # print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    # print('\n')
    # selecting the best model from all above models with evaluation metrics sorting method
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)
for model in sorted_models:
    print('mean squared error', f"{model[0]} is {model[1]: .2f}") 

mean squared error SVR is  0.54
mean squared error LinearRegression is  0.69
mean squared error XGBRegressor is  0.74
mean squared error GradientBoostingRegressor is  0.81
mean squared error KNeighborsRegressor is  0.84
mean squared error RandomForestRegressor is  1.00
mean squared error DecisionTreeRegressor is  1.38


# For hyper parameter tuning

In [10]:
diamonds=sns.load_dataset('diamonds')
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [11]:
models={
"Linear Regression": (LinearRegression(), {}),
"Random Forest": (RandomForestRegressor(), {"n_estimators": [10,100]}),
"KNN": (KNeighborsRegressor(), {"n_neighbors": np.arange(1, 100,2)}),
"Decision Tree": (DecisionTreeRegressor(), {"max_depth": [None, 5, 10]}),
"SVR": (SVR(), {"kernel": ["rbf", "poly", "sigmoid"]}),
"Gradient Boosting": (GradientBoostingRegressor(), {"n_estimators": [10, 100]}),
"XGBRegressor": (XGBRegressor(), {"n_estimators": [10, 100]})
}

In [12]:
for name, (model, params) in models.items():
    pipeline = GridSearchCV(model, param_grid=params, cv=5, n_jobs=-1)
    # fit the pipeline with the training data
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
     # print the performing metric
    print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    print(name, 'R2: ', r2_score(y_test, y_pred))
    print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    print('\n')

Linear Regression MSE:  0.6948129686287711
Linear Regression R2:  0.4441368826121931
Linear Regression MAE:  0.6703807496461157


Random Forest MSE:  0.9864906842857158
Random Forest R2:  0.21078936087897537
Random Forest MAE:  0.7864306122448984


KNN MSE:  0.6640950568462677
KNN R2:  0.4687117753876745
KNN MAE:  0.6203721488595437


Decision Tree MSE:  0.8774153020453994
Decision Tree R2:  0.2980516670532909
Decision Tree MAE:  0.7189481629481629


SVR MSE:  1.460718141299992
SVR R2:  -0.1686013018011976
SVR MAE:  0.8935334948775431


Gradient Boosting MSE:  0.8106801524004928
Gradient Boosting R2:  0.351441010654877
Gradient Boosting MAE:  0.7657809818712309


XGBRegressor MSE:  0.6624107100882575
XGBRegressor R2:  0.4700592836840687
XGBRegressor MAE:  0.6549163442728472




In [13]:
# # Create a dictionaries of list of models to evaluate performance with hyperparameters
# models = { 
#           'LinearRegression' : (LinearRegression(), {}),
#           'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid'], 'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01], 'epsilon': [0.1, 0.01, 0.001]}),
#           'DecisionTreeRegressor' : (DecisionTreeRegressor(), {'max_depth': [None, 5, 10], 'splitter': ['best', 'random']}),
#           'RandomForestRegressor' : (RandomForestRegressor(), {'n_estimators': [10, 100, 1000], 'max_depth': [None, 5, 10]}),
#           'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2), 'weights': ['uniform', 'distance']}),
#           'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'loss': ['ls', 'lad', 'huber', 'quantile'], 'n_estimators': [10, 100, 1000]}),
#           'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100, 1000], 'learning_rate': [0.1, 0.01, 0.001]}),          
#           }

# # train and predict each model with evaluation metrics as well making a for loop to iterate over the models

# for name, (model, params) in models.items():
#     # create a pipline
#     pipeline = GridSearchCV(model, params, cv=5)
    
    # # fit the pipeline
    # pipeline.fit(X_train, y_train)
    
    # # make prediction from each model
    # y_pred = pipeline.predict(X_test)
    
      
    # # print the performing metric
    # print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    # print(name, 'R2: ', r2_score(y_test, y_pred))
    # print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    # print('\n')

# Add preprocessor inside the pipeline

In [None]:
# # make a preprocessor

# preprocessor = ColumnTransformer(
#     transformers=['numeric_scaling', StandardScaler(), ['total_bill', 'size']], remainder='passthrough')


# # Create a dictionaries of list of models to evaluate performance with hyperparameters
# models = { 
#           'LinearRegression' : (LinearRegression(), {}),
#           'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid'], 'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01], 'epsilon': [0.1, 0.01, 0.001]}),
#           'DecisionTreeRegressor' : (DecisionTreeRegressor(), {'max_depth': [None, 5, 10], 'splitter': ['best', 'random']}),
#           'RandomForestRegressor' : (RandomForestRegressor(), {'n_estimators': [10, 100, 1000], 'max_depth': [None, 5, 10]}),
#           'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2), 'weights': ['uniform', 'distance']}),
#           'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'loss': ['ls', 'lad', 'huber', 'quantile'], 'n_estimators': [10, 100, 1000]}),
#           'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100, 1000], 'learning_rate': [0.1, 0.01, 0.001]}),          
#           }

# # train and predict each model with evaluation metrics as well making a for loop to iterate over the models

# for name, (model, params) in models.items():
#     # create a pipline with preprocessor
#     pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)]) 
     
#     # make a grid search cv to tune the hyperparameter
#     grid_search = GridSearchCV(pipeline, params, cv=5)
    
    
#     # fit the pipeline
#     grid_search.fit(X_train, y_train)
    
#     # make prediction from each model
#     y_pred = grid_search.predict(X_test)
    
      
#     # print the performing metric
#     print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
#     print(name, 'R2: ', r2_score(y_test, y_pred))
#     print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
#     print('\n')  
    

In [14]:
best_model = None
best_score = float('inf')  # Initialize with a high value for comparison

# Create a dictionaries of list of models to evaluate performance with hyperparameters
models = { 
    'LinearRegression' : (LinearRegression(), {}),
    'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid']}),
    'DecisionTreeRegressor' : (DecisionTreeRegressor(), {'max_depth': [None, 5, 10]}),
    'RandomForestRegressor' : (RandomForestRegressor(), {'n_estimators': [10, 100, 1000]}),
    'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2)}),
    'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'n_estimators': [10, 100, 1000]}),
    'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100, 1000]}),          
}

# Train and predict each model with evaluation metrics as well making a for loop to iterate over the models
for name, (model, params) in models.items():
    # Create a pipeline
    pipeline = GridSearchCV(model, params, cv=5)
    
    # Fit the pipeline
    pipeline.fit(X_train, y_train)
    
    # Get the best parameters and score for the current model
    best_params = pipeline.best_params_
    best_score = pipeline.best_score_
    
    # Make prediction from the best model
    y_pred = pipeline.predict(X_test)
    
    # Print the performance metrics
    print(name, 'Best Parameters: ', best_params)
    print(name, 'Best Score: ', best_score)
    print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    print(name, 'R2: ', r2_score(y_test, y_pred))
    print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    print('\n')
    
    # Update the best model if the current model has a better score
    if best_score < best_score:
        best_model = pipeline

# Print the best model
print('Best Model:', best_model)


LinearRegression Best Parameters:  {}
LinearRegression Best Score:  0.3191967183987491
LinearRegression MSE:  0.6948129686287711
LinearRegression R2:  0.4441368826121931
LinearRegression MAE:  0.6703807496461157


SVR Best Parameters:  {'kernel': 'poly'}
SVR Best Score:  0.3132008857431346
SVR MSE:  1.460718141299992
SVR R2:  -0.1686013018011976
SVR MAE:  0.8935334948775431


DecisionTreeRegressor Best Parameters:  {'max_depth': 5}
DecisionTreeRegressor Best Score:  0.1152383554846556
DecisionTreeRegressor MSE:  0.8774153020453994
DecisionTreeRegressor R2:  0.2980516670532909
DecisionTreeRegressor MAE:  0.718948162948163


RandomForestRegressor Best Parameters:  {'n_estimators': 1000}
RandomForestRegressor Best Score:  0.33318256418167247
RandomForestRegressor MSE:  0.9330679599448861
RandomForestRegressor R2:  0.25352852009479754
RandomForestRegressor MAE:  0.7691720408163248


KNeighborsRegressor Best Parameters:  {'n_neighbors': 17}
KNeighborsRegressor Best Score:  0.335728494432567

# Classifiers:

In [25]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


In [27]:

# dont show warnings
import warnings
warnings.filterwarnings('ignore')

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target


In [28]:
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier()
}

KFold=KFold(n_splits=5, shuffle=True, random_state=42)
for name, classifier in classifiers.items():
    scores=cross_val_score(classifier, X_train, y_train, cv=KFold)
    Accuracy= np.mean(scores)
    print(f"{name} Accuracy: {Accuracy:.2f}")
   


Logistic Regression Accuracy: 0.95
Decision Tree Accuracy: 0.93
Random Forest Accuracy: 0.93
SVM Accuracy: 0.96
KNN Accuracy: 0.95
