# Logistic Regression

In [50]:
#1 Importing required Libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [51]:
#2 Read the hataset onto a variable
train = pd.read_csv("titanic_data.csv")
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [52]:
#3 Fix the Predictor Variables

df = train[['Survived','Pclass','Sex','Age','Fare']]

In [53]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.2500
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.9250
3,1,1,female,35.0,53.1000
4,0,3,male,35.0,8.0500
...,...,...,...,...,...
886,0,2,male,27.0,13.0000
887,1,1,female,19.0,30.0000
888,0,3,female,,23.4500
889,1,1,male,26.0,30.0000


In [54]:
print("Age:",df['Age'].isnull().sum())
print("Survived:",df['Survived'].isnull().sum())
print("Pclass:",df['Pclass'].isnull().sum())
print("Fare:",df['Fare'].isnull().sum())

Age: 177
Survived: 0
Pclass: 0
Fare: 0


In [55]:
#4 categorical data encoding male ->1 and female->0
df["Sex"] = df ["Sex"].apply(lambda sex:1 if sex=='male' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [56]:
#5 Handling missing values - Data Imputation
#fill all the missing values with the median of age
df["Age"] = df["Age"].fillna(df["Age"].median())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [57]:
#6 takig a look of the dataframe
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,1,22.0,7.25
1,1,1,0,38.0,71.2833
2,1,3,0,26.0,7.925
3,1,1,0,35.0,53.1
4,0,3,1,35.0,8.05


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int64  
 3   Age       891 non-null    float64
 4   Fare      891 non-null    float64
dtypes: float64(2), int64(3)
memory usage: 34.9 KB


In [59]:
print("Age:",df['Age'].isnull().sum())
print("Survived:",df['Survived'].isnull().sum())
print("Pclass:",df['Pclass'].isnull().sum())
print("Fare:",df['Fare'].isnull().sum())

Age: 0
Survived: 0
Pclass: 0
Fare: 0


In [60]:
#7 set the Predictor(X) and Response(Y) Variables
X = df.drop("Survived", axis =1)
Y = df["Survived"]

In [61]:
#8splitting into training and test set
from sklearn.model_selection import train_test_split
X_train,X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state = 25)

In [62]:
#9 call the Logistic Regression model
from sklearn.linear_model import LogisticRegression
logit = LogisticRegression()
logit.fit(X_train,Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [63]:
#10 compute Predictions or Y hat
Y_pred = logit.predict(X_test)

In [64]:
#11 cconfusion matrix
from sklearn.metrics import confusion_matrix
c = confusion_matrix(Y_test,Y_pred)
c

array([[136,  29],
       [ 31,  72]], dtype=int64)

In [65]:
#12 accuracy Score
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, Y_pred)

0.7761194029850746

In [66]:
#13 Classification Report
from sklearn.metrics import classification_report
report = classification_report(Y_test, Y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.81      0.82      0.82       165
           1       0.71      0.70      0.71       103

    accuracy                           0.78       268
   macro avg       0.76      0.76      0.76       268
weighted avg       0.78      0.78      0.78       268



# Implementing the Given task
Naive Bayes 

In [72]:
#14 importing Naive Bayes module
from sklearn.naive_bayes import GaussianNB

In [73]:
#15 Handler addition
mdl = GaussianNB()

In [74]:
#16 model fitting
mdl.fit(X_train,Y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [75]:
#17 Prediction
Y_pred_naive_bayes = mdl.predict(X_test)

In [76]:
Y_pred_naive_bayes

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1], dtype=int64)

In [77]:
#18 confusion matrix
cnf_mtrx_naive_bayes = confusion_matrix(Y_test,Y_pred_naive_bayes)

In [78]:
#19 display of confusion matrix
cnf_mtrx_naive_bayes

array([[130,  35],
       [ 27,  76]], dtype=int64)

In [79]:
#20 accuracy score of Naive bayes
accuracy_score(Y_test,Y_pred_naive_bayes)

0.7686567164179104

In [80]:
#21 classification report of Naive Bayes
report_naive_bayes = classification_report(Y_test, Y_pred_naive_bayes)
print(report_naive_bayes)

              precision    recall  f1-score   support

           0       0.83      0.79      0.81       165
           1       0.68      0.74      0.71       103

    accuracy                           0.77       268
   macro avg       0.76      0.76      0.76       268
weighted avg       0.77      0.77      0.77       268



# Conclusion: 
Logistic regression produced better output than Naive Bayes, the difference between the two output seem insignificant for this application. In terms of numbers Logistic regression produced best accuracy in this experiment.