# Assignment

In [70]:
# Used libraries
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score,\
                            f1_score, fbeta_score, classification_report

## 1) Social Network Ads dataset

In [3]:
sn_df=pd.read_csv('Assignment Datasets/Social_Network_Ads.csv')
sn_df.sample(5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
31,15729054,Female,27,137000,1
327,15785170,Female,42,75000,0
10,15570769,Female,26,80000,0
335,15601550,Female,36,54000,0
107,15789863,Male,27,89000,0


In [4]:
sn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [5]:
sn_df.Gender.unique()

array(['Male', 'Female'], dtype=object)

In [8]:
sn_df=pd.get_dummies(sn_df, columns=['Gender'], drop_first=True)
sn_df.sample(3)

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Male
48,15727696,30,135000,1,1
163,15757837,35,38000,0,1
145,15746422,24,89000,0,0


In [20]:
# Split on independent and dependent data
X=sn_df.drop(columns=['User ID', 'Purchased'])
y=sn_df.Purchased

# Split on Train and Test parts
X_train, X_test, y_train, y_test=train_test_split(X, y)

y_train.value_counts() # Check on data balance

0    202
1     98
Name: Purchased, dtype: int64

Train data samples are not balanced

In [22]:
X_train, y_train = SMOTE().fit_resample(X_train, y_train)
y_train.value_counts() # Check

1    202
0    202
Name: Purchased, dtype: int64

In [23]:
# Scaling data for better processing in ML models
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [33]:
models={
    'LR':LogisticRegression(),
    'KNN':KNeighborsClassifier(),
    'NB':GaussianNB(),
    'SVM':SVC(),
    'DT':DecisionTreeClassifier(),
    'RF':RandomForestClassifier(),
    'XGB':XGBClassifier()
}

In [34]:
for name, model in models.items():
    print(f'Using {name}')
    model.fit(X_train, y_train)
    y_pred=model.predict(X_test)
    print(f'Training Accuracy: {accuracy_score(y_train, model.predict(X_train))}')
    print(f'Testing Accuracy: {accuracy_score(y_test, y_pred)}')
    print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')
    print(f'Recall: {recall_score(y_test, y_pred)}')
    print(f'Precision: {precision_score(y_test, y_pred)}')
    print(f'F1 score: {f1_score(y_test, y_pred)}')
    print(f'FBeta score: {fbeta_score(y_test, y_pred, beta=0.5)}')
    print(f'Classification Report: {classification_report(y_test, y_pred)}')
    print('--'*30)

Using LR
Training Accuracy: 0.8292079207920792
Testing Accuracy: 0.83
Confusion Matrix: [[43 12]
 [ 5 40]]
Recall: 0.8888888888888888
Precision: 0.7692307692307693
F1 score: 0.8247422680412372
FBeta score: 0.7905138339920948
Classification Report:               precision    recall  f1-score   support

           0       0.90      0.78      0.83        55
           1       0.77      0.89      0.82        45

    accuracy                           0.83       100
   macro avg       0.83      0.84      0.83       100
weighted avg       0.84      0.83      0.83       100

------------------------------------------------------------
Using KNN
Training Accuracy: 0.943069306930693
Testing Accuracy: 0.86
Confusion Matrix: [[48  7]
 [ 7 38]]
Recall: 0.8444444444444444
Precision: 0.8444444444444444
F1 score: 0.8444444444444444
FBeta score: 0.8444444444444444
Classification Report:               precision    recall  f1-score   support

           0       0.87      0.87      0.87        55
       



From Above results, it seems that **SVM** model has the highest accuracy with greater scores

In [36]:
model=SVC()
model.fit(X_train, y_train)
y_pred=model.predict(X_test)

___

## 2) Titanic dataset

In [53]:
tnc_df=pd.read_csv('Assignment Datasets/Titanic.csv')
tnc_df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
435,436,1,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0,B96 B98,S
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0,B51 B53 B55,S
809,810,1,1,"Chambers, Mrs. Norman Campbell (Bertha Griggs)",female,33.0,1,0,113806,53.1,E8,S
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S
452,453,0,1,"Foreman, Mr. Benjamin Laventall",male,30.0,0,0,113051,27.75,C111,C


In [38]:
tnc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [54]:
# Filling NaNs
tnc_df.Age.fillna(tnc_df.Age.mean(), inplace=True)
imp=SimpleImputer(strategy='most_frequent')
tnc_df.Embarked=imp.fit_transform(tnc_df[['Embarked']])

# Classifying categorical data
tnc_df=pd.get_dummies(tnc_df, columns=['Sex', 'Embarked'], drop_first=True)

In [55]:
# Split on independent and dependent data
X=tnc_df.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'])
y=tnc_df.Survived

# Split on Train and Test parts
X_train, X_test, y_train, y_test=train_test_split(X, y)

y_train.value_counts() # Check on data balance

0    409
1    259
Name: Survived, dtype: int64

In [56]:
# Scaling data for better processing in ML models
# [scaler] is defined above in Q1
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [60]:
# [models] is defined above in Q1
for name, model in models.items():
    print(f'Using {name}')
    model.fit(X_train, y_train)
    y_pred=model.predict(X_test)
    print(f'Training Accuracy: {accuracy_score(y_train, model.predict(X_train))}')
    print(f'Testing Accuracy: {accuracy_score(y_test, y_pred)}')
    print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')
    print(f'Recall: {recall_score(y_test, y_pred)}')
    print(f'Precision: {precision_score(y_test, y_pred)}')
    print(f'F1 score: {f1_score(y_test, y_pred)}')
    print(f'FBeta score: {fbeta_score(y_test, y_pred, beta=0.5)}')
    print(f'Classification Report: {classification_report(y_test, y_pred)}')
    print('--'*30)

Using LR
Training Accuracy: 0.7919161676646707
Testing Accuracy: 0.8340807174887892
Confusion Matrix: [[119  21]
 [ 16  67]]
Recall: 0.8072289156626506
Precision: 0.7613636363636364
F1 score: 0.783625730994152
FBeta score: 0.7701149425287356
Classification Report:               precision    recall  f1-score   support

           0       0.88      0.85      0.87       140
           1       0.76      0.81      0.78        83

    accuracy                           0.83       223
   macro avg       0.82      0.83      0.82       223
weighted avg       0.84      0.83      0.83       223

------------------------------------------------------------
Using KNN
Training Accuracy: 0.8532934131736527
Testing Accuracy: 0.852017937219731
Confusion Matrix: [[121  19]
 [ 14  69]]
Recall: 0.8313253012048193
Precision: 0.7840909090909091
F1 score: 0.8070175438596491
FBeta score: 0.7931034482758621
Classification Report:               precision    recall  f1-score   support

           0       0.90   



Training Accuracy: 0.9760479041916168
Testing Accuracy: 0.8026905829596412
Confusion Matrix: [[118  22]
 [ 22  61]]
Recall: 0.7349397590361446
Precision: 0.7349397590361446
F1 score: 0.7349397590361445
FBeta score: 0.7349397590361446
Classification Report:               precision    recall  f1-score   support

           0       0.84      0.84      0.84       140
           1       0.73      0.73      0.73        83

    accuracy                           0.80       223
   macro avg       0.79      0.79      0.79       223
weighted avg       0.80      0.80      0.80       223

------------------------------------------------------------


From Above results, it seems that **KNN** model has the highest accuracy with greater scores

In [61]:
model=KNeighborsClassifier()
model.fit(X_train, y_train)
y_pred=model.predict(X_test)

- Using Cross Validation

In [74]:
for name, model in models.items():
    print(f'Using {name}')
    scores=cross_validate(model, X, y, cv=6, n_jobs=-1, return_train_score=True)
    print("['test_score', 'train_score']")
    print([scores.get(scoring).mean() for scoring in ['test_score', 'train_score']])
    print('--'*30)

Using LR
['test_score', 'train_score']
[0.7946142451176007, 0.8026930597526599]
------------------------------------------------------------
Using KNN
['test_score', 'train_score']
[0.694789588245964, 0.8033690303872381]
------------------------------------------------------------
Using NB
['test_score', 'train_score']
[0.7856581413628394, 0.7950608796808548]
------------------------------------------------------------
Using SVM
['test_score', 'train_score']
[0.6734460971038153, 0.6803641413903229]
------------------------------------------------------------
Using DT
['test_score', 'train_score']
[0.7789542898603301, 0.9840635992836405]
------------------------------------------------------------
Using RF
['test_score', 'train_score']
[0.8148316101336235, 0.9840635992836405]
------------------------------------------------------------
Using XGB
['test_score', 'train_score']
[0.8092463268637765, 0.9699232368231073]
------------------------------------------------------------


From Above results, it seems that **Random Forest Classifier** model has the highest accuracy score.