In [None]:
#Import the dependancies
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,r2_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [26]:


# Load the Titanic dataset from Seaborn
titanic_data = sns.load_dataset('titanic')

# Select features and target variable
X = titanic_data[['pclass', 'sex', 'age', 'fare', 'embarked']]
y = titanic_data['survived']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a list of models to evaluate
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42)),
    ("LogisticRegression",LogisticRegression(random_state=42)),
    ("Support vector machine ",SVC(random_state=42)),
    
]

best_model = None
best_accuracy = 0.0

# Iterate over the models and evaluate their performance
for name, model in models:
    # Create a pipeline for each model
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ])
    
    # Perform cross-validation
    scores = cross_val_score(pipeline, X_train, y_train, cv=5)
    
    
    # Calculate mean accuracy
    mean_accuracy = scores.mean()
    
    # Fit the pipeline on the training data
    pipeline.fit(X_train, y_train)
    
    # Make predictions on the test data
    y_pred = pipeline.predict(X_test)
    
    # Calculate accuracy score
    accuracy = accuracy_score(y_test, y_pred)

    # Print the performance metrics
    print("Model:", name)
    print("Cross-validation Accuracy:", mean_accuracy)
    print("Test Accuracy / Predication:", accuracy)
    print()
    
    # Check if the current model has the best accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline

# Retrieve the best model
print("Best Model:", best_model,"with accuracy",best_accuracy)

Model: Random Forest
Cross-validation Accuracy: 0.7991529597163399
Test Accuracy / Predication: 0.8379888268156425

Model: Gradient Boosting
Cross-validation Accuracy: 0.8061952132374668
Test Accuracy / Predication: 0.7988826815642458

Model: XGBoost
Cross-validation Accuracy: 0.8076233625529401
Test Accuracy / Predication: 0.7932960893854749

Model: LogisticRegression
Cross-validation Accuracy: 0.7977839062346105
Test Accuracy / Predication: 0.8100558659217877

Model: Support vector machine 
Cross-validation Accuracy: 0.8160248202501723
Test Accuracy / Predication: 0.8044692737430168

Best Model: Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))]) with accuracy 0.8379888268156425


# Other  method

In [None]:
#For regression
#Import the library
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#For preprocessing 
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler,MaxAbsScaler
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.model_selection import train_test_split

#For models 
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.naive_bayes import GaussianNB

#Load the data set 
df=sns.load_dataset("tips")

#Step = 2
#Preprocessing the data

#Other method for label encoder
for i in df.columns:
    df[i]=LabelEncoder().fit_transform(df[i])
    #Apply the standardscaler on  total bill and tips 
    #Scaler decrease the MSE 
    df[["total_bill","tip"]]=MinMaxScaler().fit_transform(df[["total_bill","tip"]])#This is best second Max absolute
    
    #step =3
    #separate the feature and target value
    X=df.drop('tip',axis=1)
    Y=df['tip']
    
    X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42)
    
    #Step =4 
    #Create a dictionaries of list of models to evaluate performance

models={
    
    "LinearRegression": LinearRegression(),
    "SVR":SVR(),
    "KNN":KNeighborsRegressor(n_neighbors=5),
    "Decision Tree":DecisionTreeRegressor(random_state=42),
    "random forest":RandomForestRegressor(random_state=42),
    "GradientBoostingRegressor":GradientBoostingRegressor(random_state=42),
    "XGBRegressor":XGBRegressor()
}


#Steps=5

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models
model_best=None
best_accuracy = 0
for name,model in models.items():
    #Train the model
    model.fit(X_train,Y_train)
    
    #Predict the model
    Y_pred=model.predict(X_test)
    mse=mean_squared_error(Y_test,Y_pred)
    r2=r2_score(Y_test,Y_pred)
    #Evaluate the model
    print("Model = ",name)
    print(f"MSE = {mse:.3f}")
    print(f"R2 score = {r2:.3f} ")
    print()
    
    # Check if the current model has the best accuracy
    if r2 > best_accuracy:
        best_accuracy = r2
        model_best = model

# Retrieve the best model
print("Best Model: ", model_best,f"with accuracy = {best_accuracy:.04f} ")

Model =  LinearRegression
MSE = 0.035
R2 score = 0.500 

Model =  SVR
MSE = 0.038
R2 score = 0.465 

Model =  KNN
MSE = 0.056
R2 score = 0.209 

Model =  Decision Tree
MSE = 0.063
R2 score = 0.109 

Model =  random forest
MSE = 0.044
R2 score = 0.375 

Model =  GradientBoostingRegressor
MSE = 0.044
R2 score = 0.383 

Model =  XGBRegressor
MSE = 0.045
R2 score = 0.364 

Best Model:  LinearRegression() with accuracy = 0.5002 


# for classification

In [28]:
#For regression
#Import the library
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#For preprocessing 
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler,MaxAbsScaler
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import train_test_split

#For models 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from xgboost import XGBClassifier

#import the data set
data=sns.load_dataset("titanic")

#Step =2
#Preprocessing the data

#Remove the deck data
data.drop("deck",axis=1,inplace=True)

#Fill the missing value
data["age"]=data["age"].fillna(data["age"].mean())
for i in data.columns:
    if data[i].dtype=="category" or data[i].dtype=='object':
        data[i]=data[i].fillna(data[i].mode()[0])
        data[i]=LabelEncoder().fit_transform(data[i])
        data[['age','fare']]=MinMaxScaler().fit_transform(data[['age','fare']])
#step 3

#Train test and split the data
X=data.drop("survived",axis=1)
Y=data['survived']

X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

# step=4

#Create  dictonary of the model

model={
    "LogisticRegression ":LogisticRegression(random_state=42),
    "SVC":SVC(),
    "KNeighborsClassifier":KNeighborsClassifier(n_neighbors=5),
    "Decision Tree classifier":DecisionTreeClassifier(),
    "RandomForestClassifier":RandomForestClassifier(),
    "XGBClassifier":XGBClassifier(),
    "Gradient boost classifier":GradientBoostingClassifier()
    
}
#Step=6

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models
best_accuracy=0.0
best_model=None
for name,models in model.items():
    
    #Train the model
    models.fit(X_train,Y_train)
    
    #Predict the model
    Y_pred=models.predict(X_test)
    
    #evaluate 
    test_accuracy=accuracy_score(Y_test,Y_pred)
    
    class_report=classification_report(Y_test,Y_pred)
    
    conf_matrix=confusion_matrix(Y_test,Y_pred)
    
    print("*******************************************")
    print("model ",name)
    print("test accuracy = ",test_accuracy)
    print("classfication report \n",class_report)
    print("confusion matrix \n",conf_matrix)
    print("*************************************************")
        
if test_accuracy>best_accuracy:
    best_accuracy=test_accuracy
    best_model=models
print("Best model ",best_model, "acuuracy = ",test_accuracy)
    

*******************************************
model  LogisticRegression 
test accuracy =  1.0
classfication report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       105
           1       1.00      1.00      1.00        74

    accuracy                           1.00       179
   macro avg       1.00      1.00      1.00       179
weighted avg       1.00      1.00      1.00       179

confusion matrix 
 [[105   0]
 [  0  74]]
*************************************************
*******************************************
model  SVC
test accuracy =  0.994413407821229
classfication report 
               precision    recall  f1-score   support

           0       0.99      1.00      1.00       105
           1       1.00      0.99      0.99        74

    accuracy                           0.99       179
   macro avg       1.00      0.99      0.99       179
weighted avg       0.99      0.99      0.99       179

confusion matrix 
 [[105  