In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score,f1_score,fbeta_score,classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

## Apply ML Classification techniques on "Social_Network_Ads.csv" dataset

In [24]:
df=pd.read_csv('Assignment Datasets\Social_Network_Ads.csv')
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [25]:
df=df.drop(['User ID'],axis=1)

In [26]:
df=pd.get_dummies(df,columns=['Gender'],drop_first=True)
df

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,1
1,35,20000,0,1
2,26,43000,0,0
3,27,57000,0,0
4,19,76000,0,1
...,...,...,...,...
395,46,41000,1,0
396,51,23000,1,1
397,50,20000,1,0
398,36,33000,0,1


In [27]:
x=df[df.columns.difference(['Purchased'])]
y=df['Purchased']

In [28]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)

In [29]:
models={
    'LR':LogisticRegression(),
    'KNN':KNeighborsClassifier(),
    'DT':DecisionTreeClassifier(),
    'SVC':SVC(),
    'NB':GaussianNB(),
    'XGC':XGBClassifier(),
    'RF':RandomForestClassifier()
    
}

In [30]:
for name,model in  models.items():
    print(f'using {name}: ')
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    print(f'Training Accuracy :{accuracy_score(y_train,model.predict(x_train))}')
    print(f'Testing Accuracy :{accuracy_score(y_test,y_pred)}')
    print(f'Confusion matrix:\n {confusion_matrix(y_test,y_pred)}')
    print(f'Recall: {recall_score(y_test,y_pred)}')
    print(f'precision: {precision_score(y_test,y_pred)}')
    print(f'F1-score: {f1_score(y_test,y_pred)}')
    print(f'Fbeta-score: {fbeta_score(y_test,y_pred,beta=0.5)}')
    print(classification_report(y_test,y_pred))
    print('-'*33)
    

using LR: 
Training Accuracy :0.8466666666666667
Testing Accuracy :0.87
Confusion matrix:
 [[61  7]
 [ 6 26]]
Recall: 0.8125
precision: 0.7878787878787878
F1-score: 0.8
Fbeta-score: 0.7926829268292682
              precision    recall  f1-score   support

           0       0.91      0.90      0.90        68
           1       0.79      0.81      0.80        32

    accuracy                           0.87       100
   macro avg       0.85      0.85      0.85       100
weighted avg       0.87      0.87      0.87       100

---------------------------------
using KNN: 
Training Accuracy :0.9166666666666666
Testing Accuracy :0.94
Confusion matrix:
 [[63  5]
 [ 1 31]]
Recall: 0.96875
precision: 0.8611111111111112
F1-score: 0.911764705882353
Fbeta-score: 0.8806818181818183
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        68
           1       0.86      0.97      0.91        32

    accuracy                           0.94       100
   



Testing Accuracy :0.9
Confusion matrix:
 [[60  8]
 [ 2 30]]
Recall: 0.9375
precision: 0.7894736842105263
F1-score: 0.8571428571428572
Fbeta-score: 0.8152173913043479
              precision    recall  f1-score   support

           0       0.97      0.88      0.92        68
           1       0.79      0.94      0.86        32

    accuracy                           0.90       100
   macro avg       0.88      0.91      0.89       100
weighted avg       0.91      0.90      0.90       100

---------------------------------
using RF: 
Training Accuracy :0.9966666666666667
Testing Accuracy :0.92
Confusion matrix:
 [[62  6]
 [ 2 30]]
Recall: 0.9375
precision: 0.8333333333333334
F1-score: 0.8823529411764706
Fbeta-score: 0.8522727272727274
              precision    recall  f1-score   support

           0       0.97      0.91      0.94        68
           1       0.83      0.94      0.88        32

    accuracy                           0.92       100
   macro avg       0.90      0.92      

### Best model for this data is KNeighbors model and DecisionTree model

## Apply ML Classification techniques on " Titanic.csv" dataset

In [171]:
df=pd.read_csv('Assignment Datasets\Titanic.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [172]:
df=df.drop(['Ticket','Cabin'],axis=1)

In [173]:
df['Age'].fillna(df['Age'].mean(),inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0],inplace=True)

In [174]:
df=pd.get_dummies(df,columns=['Sex','Embarked'],drop_first=True)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.000000,1,0,7.2500,1,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.000000,1,0,71.2833,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.000000,0,0,7.9250,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.000000,1,0,53.1000,0,0,1
4,5,0,3,"Allen, Mr. William Henry",35.000000,0,0,8.0500,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",27.000000,0,0,13.0000,1,0,1
887,888,1,1,"Graham, Miss. Margaret Edith",19.000000,0,0,30.0000,0,0,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",29.699118,1,2,23.4500,0,0,1
889,890,1,1,"Behr, Mr. Karl Howell",26.000000,0,0,30.0000,1,0,0


In [175]:
x=df[df.columns.difference(['Survived','Name'])]
y=df['Survived']

Unnamed: 0,Age,Embarked_Q,Embarked_S,Fare,Parch,PassengerId,Pclass,Sex_male,SibSp
0,22.000000,0,1,7.2500,0,1,3,1,1
1,38.000000,0,0,71.2833,0,2,1,0,1
2,26.000000,0,1,7.9250,0,3,3,0,0
3,35.000000,0,1,53.1000,0,4,1,0,1
4,35.000000,0,1,8.0500,0,5,3,1,0
...,...,...,...,...,...,...,...,...,...
886,27.000000,0,1,13.0000,0,887,2,1,0
887,19.000000,0,1,30.0000,0,888,1,0,0
888,29.699118,0,1,23.4500,2,889,3,0,1
889,26.000000,0,0,30.0000,0,890,1,1,0


In [176]:
vector=vectorizer.fit_transform(df['Name'])
df_vect2=pd.DataFrame(vector.toarray(),columns=vectorizer.get_feature_names())
df_vect2

Unnamed: 0,aaron,abbing,abbott,abelson,abraham,achem,achille,achilles,ada,adahl,...,youseff,yousif,youssef,yousseff,yrois,zabour,zebley,zenni,zillah,zimmerman
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
887,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [177]:
x=pd.concat([df_vect2,x], axis=1)
x.head()

Unnamed: 0,aaron,abbing,abbott,abelson,abraham,achem,achille,achilles,ada,adahl,...,zimmerman,Age,Embarked_Q,Embarked_S,Fare,Parch,PassengerId,Pclass,Sex_male,SibSp
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,22.0,0,1,7.25,0,1,3,1,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,38.0,0,0,71.2833,0,2,1,0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,26.0,0,1,7.925,0,3,3,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,35.0,0,1,53.1,0,4,1,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,35.0,0,1,8.05,0,5,3,1,0


In [178]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)

In [179]:
models={
    'LR':LogisticRegression(),
    'KNN':KNeighborsClassifier(),
    'DT':DecisionTreeClassifier(),
    'SVC':SVC(),
    'NB':GaussianNB(),
    'XGC':XGBClassifier(),
    'RF':RandomForestClassifier()
    
}

In [180]:
for name,model in  models.items():
    print(f'using {name}: ')
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    print(f'Training Accuracy :{accuracy_score(y_train,model.predict(x_train))}')
    print(f'Testing Accuracy :{accuracy_score(y_test,y_pred)}')
    print(f'Confusion matrix:\n {confusion_matrix(y_test,y_pred)}')
    print(f'Recall: {recall_score(y_test,y_pred)}')
    print(f'precision: {precision_score(y_test,y_pred)}')
    print(f'F1-score: {f1_score(y_test,y_pred)}')
    print(f'Fbeta-score: {fbeta_score(y_test,y_pred,beta=0.5)}')
    print(classification_report(y_test,y_pred))
    print('-'*33)
    

using LR: 
Training Accuracy :1.0
Testing Accuracy :0.7982062780269058
Confusion matrix:
 [[118  22]
 [ 23  60]]
Recall: 0.7228915662650602
precision: 0.7317073170731707
F1-score: 0.7272727272727273
Fbeta-score: 0.72992700729927
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       140
           1       0.73      0.72      0.73        83

    accuracy                           0.80       223
   macro avg       0.78      0.78      0.78       223
weighted avg       0.80      0.80      0.80       223

---------------------------------
using KNN: 
Training Accuracy :0.7859281437125748
Testing Accuracy :0.6860986547085202
Confusion matrix:
 [[137   3]
 [ 67  16]]
Recall: 0.1927710843373494
precision: 0.8421052631578947
F1-score: 0.3137254901960785
Fbeta-score: 0.5031446540880503
              precision    recall  f1-score   support

           0       0.67      0.98      0.80       140
           1       0.84      0.19      0.31        83




Training Accuracy :1.0
Testing Accuracy :0.820627802690583
Confusion matrix:
 [[125  15]
 [ 25  58]]
Recall: 0.6987951807228916
precision: 0.7945205479452054
F1-score: 0.7435897435897436
Fbeta-score: 0.7733333333333333
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       140
           1       0.79      0.70      0.74        83

    accuracy                           0.82       223
   macro avg       0.81      0.80      0.80       223
weighted avg       0.82      0.82      0.82       223

---------------------------------
using RF: 
Training Accuracy :1.0
Testing Accuracy :0.8251121076233184
Confusion matrix:
 [[122  18]
 [ 21  62]]
Recall: 0.7469879518072289
precision: 0.775
F1-score: 0.7607361963190183
Fbeta-score: 0.7692307692307693
              precision    recall  f1-score   support

           0       0.85      0.87      0.86       140
           1       0.78      0.75      0.76        83

    accuracy                          

### Best model for this data is SVC