# Multiple Classification models

In this notebook we define helper functions that let us build several classification models and compare them.

In [1]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB



In [2]:
# Load the pre-processed Titanic data and preview its first five rows
titanic_df = pd.read_csv("Data/titanic_processed.csv")
titanic_df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,2,0,17.0,0,0,12.0,True,False,False
1,0,1,1,36.0,1,0,78.85,False,False,True
2,0,3,0,14.5,1,0,14.4542,True,False,False
3,1,2,0,42.0,0,0,13.0,False,False,True
4,1,2,0,41.0,0,1,19.5,False,False,True


In [3]:
# Every column except the target is a feature. Excluding "Survived" by name
# (instead of slicing off the first column by position) raises a KeyError if
# the target column is missing and cannot leak the target into the feature
# list if the column order of the CSV ever changes.
FEATURES = titanic_df.columns.drop("Survived").tolist()
FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [4]:
# Notebook-level registry: model label -> summary dict returned by build_model
result_dict = dict()

In [5]:
def summerize_classification(y_test,y_pred):
    """
    Summarize how well a set of predictions matches the actual labels.

    Parameters
    ----------
    y_test : array-like
        The actual labels.
    y_pred : array-like
        The labels predicted by a model.

    Returns
    -------
    dict
        Keys "Accuracy", "Precision", "Recall" and "Accuracy_count", each
        computed with the corresponding scikit-learn metric function.
    """
    scores = {
        "Accuracy": accuracy_score(y_test, y_pred, normalize=True),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
    }
    # Second accuracy call with normalize=False: the un-normalized score
    scores["Accuracy_count"] = accuracy_score(y_test, y_pred, normalize=False)
    return scores


In [6]:
def build_model(classifier_fn, name_of_y_col,names_of_x_cols,dataset,test_frac=0.2,random_state=None):
    """
    Train one classifier on a random train/test split of ``dataset`` and score it.

    Parameters
    ----------
    classifier_fn : callable
        Called as ``classifier_fn(x_train, y_train)``; must return a fitted
        object that exposes ``predict``.
    name_of_y_col : str
        Name of the target column in ``dataset``.
    names_of_x_cols : list of str
        Names of the feature columns in ``dataset``.
    dataset : pandas.DataFrame
        Data holding both the feature columns and the target column.
    test_frac : float, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. ``None`` keeps the previous
        behaviour (a different split on every call). Pass an integer to make
        the split reproducible and to score several models on the same split.

    Returns
    -------
    dict
        "training" and "test" hold the score dictionaries produced by
        ``summerize_classification``; "confusion_matrix" holds a cross-tab
        with predicted labels as rows and actual labels as columns.
    """
    features = dataset[names_of_x_cols]
    target = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=test_frac, random_state=random_state
    )

    model = classifier_fn(x_train, y_train)

    # Score on the held-out rows and on the training rows, so that a large
    # gap between the two summaries is visible to the caller.
    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    train_summary = summerize_classification(y_train, y_pred_train)
    test_summary = summerize_classification(y_test, y_pred)

    # y_pred is a plain array; placing it next to y_test in one frame pairs
    # the two row by row before they are cross-tabulated.
    pred_result = pd.DataFrame({'y_test': y_test,
                                'y_pred': y_pred})
    model_crosstab = pd.crosstab(pred_result.y_pred, pred_result.y_test)

    return {
        "training": train_summary,
        "test": test_summary,
        "confusion_matrix": model_crosstab
    }


In [7]:
def compare_result(results=None):
    """
    Print the training and test scores of every model in a results dictionary.

    Parameters
    ----------
    results : dict or None, default None
        Maps a model label to a dict with "training" and "test" entries, each
        itself a dict of score name -> value. When omitted, the notebook-level
        ``result_dict`` is used, so existing ``compare_result()`` calls behave
        exactly as before.
    """
    # Resolve the global lazily so that it is only required when no explicit
    # dictionary is supplied.
    if results is None:
        results = result_dict

    for model_name, summaries in results.items():
        print("Classfication ", model_name)
        print()

        # Same layout for both sections: heading, one line per score, blank line
        for heading, section in (("Training Data", "training"), ("Test Data", "test")):
            print(heading)
            for score_name, value in summaries[section].items():
                print(score_name, value)

            print()

In [8]:
def logistic_fn(x_train,y_train):
    """Fit a LogisticRegression (liblinear solver) on the training data and return it."""
    classifier = LogisticRegression(solver='liblinear')
    # scikit-learn's fit() returns the estimator itself
    return classifier.fit(x_train, y_train)

In [9]:
# Train the logistic-regression baseline, store its summary, and print it
result_dict['survived - logistic'] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_col="Survived",
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_result()

Classfication  survived - logistic

Training Data
Accuracy 0.7943760984182777
Precision 0.7761194029850746
Recall 0.6842105263157895
Accuracy_count 452.0

Test Data
Accuracy 0.8251748251748252
Precision 0.8431372549019608
Recall 0.7166666666666667
Accuracy_count 118.0



In [10]:
# Now let's try other models / estimators


def cla_SVC(x_train,y_train):
    """Fit an SVC with default settings on the training data and return it."""
    return SVC().fit(x_train, y_train)




def Guissian(x_train,y_train):
    """Fit a GaussianNB classifier with default settings and return it."""
    return GaussianNB().fit(x_train, y_train)



def Decision_Tree(x_train,y_train):
    """Fit a DecisionTreeClassifier with default settings and return it."""
    return DecisionTreeClassifier().fit(x_train, y_train)

def Random_Forest(x_train,y_train):
    """Fit a RandomForestClassifier with default settings and return it."""
    return RandomForestClassifier().fit(x_train, y_train)


def build_and_compare_models(classifiers, name_of_y_col, names_of_x_cols, dataset):
    """
    Run ``build_model`` once per classifier and collect the summaries.

    Parameters
    ----------
    classifiers : iterable of (callable, str)
        Pairs of a training function and the label to store its result under.
    name_of_y_col, names_of_x_cols, dataset
        Passed through unchanged to ``build_model``.

    Returns
    -------
    dict
        Label -> whatever ``build_model`` returned for that classifier. If a
        label appears more than once, the last result wins.
    """
    return {
        label: build_model(train_fn, name_of_y_col, names_of_x_cols, dataset)
        for train_fn, label in classifiers
    }

# Each entry pairs a training function with the label its results are stored under.
# Append further (function, label) pairs here to include more classifiers.
classifiers = [
    (logistic_fn, ' logistic'),
    (cla_SVC, ' SVC'),
    (Decision_Tree, ' DecisionTree'),
    (Random_Forest, ' RandomForest'),
    (Guissian, ' Guissian'),
]

# Train every classifier and replace the notebook-level results with the new summaries
result_dict = build_and_compare_models(
    classifiers=classifiers,
    name_of_y_col="Survived",
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

In [11]:
# Print the training and test scores of every model currently in result_dict
compare_result()

Classfication   logistic

Training Data
Accuracy 0.8101933216168717
Precision 0.806930693069307
Recall 0.7025862068965517
Accuracy_count 461.0

Test Data
Accuracy 0.7552447552447552
Precision 0.6779661016949152
Recall 0.7142857142857143
Accuracy_count 108.0

Classfication   SVC

Training Data
Accuracy 0.671353251318102
Precision 0.7264150943396226
Recall 0.3276595744680851
Accuracy_count 382.0

Test Data
Accuracy 0.6923076923076923
Precision 0.6363636363636364
Recall 0.39622641509433965
Accuracy_count 99.0

Classfication   DecisionTree

Training Data
Accuracy 0.9876977152899824
Precision 1.0
Recall 0.9698275862068966
Accuracy_count 562.0

Test Data
Accuracy 0.6923076923076923
Precision 0.5967741935483871
Recall 0.6607142857142857
Accuracy_count 99.0

Classfication   RandomForest

Training Data
Accuracy 0.9894551845342706
Precision 0.9911111111111112
Recall 0.9823788546255506
Accuracy_count 563.0

Test Data
Accuracy 0.8111888111888111
Precision 0.8269230769230769
Recall 0.70491803278688

This is very hard to read as plain text.
Let's visualize the results with the lovely Plotly instead.

In [12]:
import pandas as pd
import plotly.express as px

def plot_model_comparisons(result_dict):
    """
    Show a grouped bar chart of train/test accuracy, precision and recall per model.

    Parameters
    ----------
    result_dict : dict
        Maps a model label to a dict with "training" and "test" entries, each
        containing at least the keys "Accuracy", "Precision" and "Recall".
    """
    score_names = ('Accuracy', 'Precision', 'Recall')

    # One wide row per model: Model, Train <score>, Test <score>, ...
    rows = []
    for model_name, results in result_dict.items():
        row = {'Model': model_name}
        for score_name in score_names:
            row[f'Train {score_name}'] = results['training'][score_name]
            row[f'Test {score_name}'] = results['test'][score_name]
        rows.append(row)

    # Reshape to long form (Model, Metric, Score) so each metric becomes its own bar colour
    long_scores = pd.DataFrame(rows).melt(
        id_vars='Model', var_name='Metric', value_name='Score'
    )

    chart = px.bar(
        long_scores,
        x='Model',
        y='Score',
        color='Metric',
        barmode='group',
        title='Model Comparison',
        labels={'Score': 'Scores', 'Model': 'Models'},
        height=600,
    )
    chart.update_layout(xaxis_tickangle=-45)
    chart.show()

# Chart the scores of every model currently in result_dict
plot_model_comparisons(result_dict=result_dict)


As expected, the Decision Tree and Random Forest models overfit: their training scores are far higher than their test scores.

For more information about this problem, see:
- https://www.quora.com/Why-do-decision-trees-have-a-tendency-to-overfit-to-the-training-set
- https://datascience.stackexchange.com/questions/84254/why-is-large-decision-tree-likely-to-overfit

* Of course, you can ask ChatGPT :)