# Importing the necessary Libraries

In [197]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Loading the Training and Test Datasets


In [198]:
# Read both Titanic splits from the competition input directory.
train_data, test_data = (
    pd.read_csv("../input/titanic/train.csv"),
    pd.read_csv("../input/titanic/test.csv"),
)

In [199]:
# Preview the first five training rows (five is head()'s default).
train_data.head()

In [200]:
# Preview the first five test rows (five is head()'s default).
test_data.head()

# Exploring the Training Data 

## Correlation Between Features

In [201]:
# Pairwise correlation of the numeric training columns only. The frame still
# holds text columns at this point (e.g. Sex is 'male'/'female'), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr = train_data.select_dtypes(include='number').corr()

# Colour the matrix so strong positive/negative correlations stand out.
corr.style.background_gradient(cmap='coolwarm')

## Information About the Training Dataset

In [202]:
# Column names, non-null counts and dtypes of the training frame.
train_data.info()

## Unique Values in the Training Dataset

In [203]:
# Number of distinct values per training column.
train_data.nunique()

## Checking for missing Values in Training Data

In [204]:
# Count of missing entries in each training column.
train_data.isna().sum()

## Handling Missing Values in Training Data

In [205]:
# Flag variable for Cabin: 1 when a cabin value is recorded, 0 when it is missing.
# Built in a single assignment; the previous chained form
# `df['Cabin_ind'][mask] = 0` writes through an intermediate object and is not
# guaranteed to reach the frame (it is a no-op under pandas copy-on-write).
train_data['Cabin_ind'] = train_data['Cabin'].notna().astype(int)

In [206]:
# How many training rows have / lack a cabin value.
train_data['Cabin_ind'].value_counts()

In [207]:
# The raw Cabin column is no longer needed once Cabin_ind exists.
train_data.drop(columns='Cabin', inplace=True)

In [208]:
# Fill only the missing ages with the median age.
# The median must be computed first and passed to fillna(); calling .median()
# on the result of fillna() collapsed the entire column to one scalar.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())

In [209]:
# Inspect the Age column after imputation.
print(train_data['Age'])

In [210]:
# Fill only the missing ports with the most frequent port.
# mode()[0] is evaluated first and passed to fillna(); applying it to the
# result of fillna() overwrote every row with that single value.
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])

## Checking for Outliers in Training Data

### Target Variable

In [211]:
# Class balance of the target: bar chart plus the exact counts.
sns.countplot(data=train_data, y='Survived')
train_data['Survived'].value_counts()


### Pclass

In [212]:
# Passenger-class distribution in the training data: bar chart plus counts.
sns.countplot(data=train_data, y='Pclass')
train_data['Pclass'].value_counts()


In [213]:
# Survivors vs. non-survivors within each passenger class.
sns.countplot(data=train_data, y='Pclass', hue='Survived')

### Sex

In [214]:
# Encode the categorical Sex column as integers: male -> 1, female -> 0.
# NOTE: this cell is not re-runnable; a second run maps the 0/1 codes to NaN
# and astype(int) then raises.
train_data['Sex'] = train_data['Sex'].map({'male': 1, 'female': 0}).astype(int)
sns.countplot(data=train_data, y='Sex')
train_data['Sex'].value_counts()


In [215]:
# Survivors vs. non-survivors for each encoded gender (1 = male, 0 = female).
sns.countplot(data=train_data, y='Sex', hue='Survived')

### SibSp

In [216]:
# Distribution of the SibSp column.
sns.countplot(data=train_data, y='SibSp')

In [217]:
# SibSp distribution split by survival outcome.
sns.countplot(data=train_data, y='SibSp', hue='Survived')

Passengers with exactly one sibling/spouse aboard have a higher survival rate than the other groups.

### Parch

In [218]:
# Distribution of the Parch column.
sns.countplot(data=train_data, y='Parch')

In [219]:
# Parch distribution split by survival outcome.
sns.countplot(data=train_data, y='Parch', hue='Survived')

### Fare

In [220]:
# Replace Fare outliers (outside the 1.5 * IQR fences) with the column mean,
# showing a box plot before and after for comparison.
# The series is passed as x= explicitly; newer seaborn releases no longer
# treat the first positional argument as x.
sns.boxplot(x=train_data['Fare'])
plt.title("Box Plot before mean imputation")
plt.show()

q1 = train_data['Fare'].quantile(0.25)
q3 = train_data['Fare'].quantile(0.75)
iqr = q3 - q1
Lower_tail = q1 - 1.5 * iqr
Upper_tail = q3 + 1.5 * iqr
# The replacement value is the mean of the original fares, outliers included.
m = train_data['Fare'].mean()

# One vectorised assignment instead of rewriting the whole column once per
# outlier from inside a Python loop.
fare_is_outlier = (train_data['Fare'] > Upper_tail) | (train_data['Fare'] < Lower_tail)
train_data.loc[fare_is_outlier, 'Fare'] = m

sns.boxplot(x=train_data['Fare'])
plt.title("Box Plot after mean imputation")
plt.show()

In [221]:
# Based on the exploration above, remove the columns that will not be used
# as model features.
train_data.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Embarked', 'Age'],
    inplace=True,
)

# Exploring the Test Data

## Correlation Between Features in the Test Data

In [222]:
# Pairwise correlation of the numeric test columns only. The frame still
# holds text columns at this point (e.g. Sex is 'male'/'female'), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr = test_data.select_dtypes(include='number').corr()

# Colour the matrix so strong positive/negative correlations stand out.
corr.style.background_gradient(cmap='coolwarm')

## Information about the Test Data

In [223]:
# Column names, non-null counts and dtypes of the test frame.
test_data.info()

## unique Values in Features of Test Data

In [224]:
# Number of distinct values per test column.
test_data.nunique()

## Checking for missing Values in Test Data

In [225]:
# Count of missing entries in each test column.
test_data.isna().sum()

In [226]:
## Handling Missing Values in Test Data

In [227]:
# Flag variable for Cabin: 1 when a cabin value is recorded, 0 when it is missing.
# Built in a single assignment; the previous chained form
# `df['Cabin_ind'][mask] = 0` writes through an intermediate object and is not
# guaranteed to reach the frame (it is a no-op under pandas copy-on-write).
test_data['Cabin_ind'] = test_data['Cabin'].notna().astype(int)

In [228]:
# How many test rows have / lack a cabin value.
test_data['Cabin_ind'].value_counts()

In [229]:
# The raw Cabin column is no longer needed once Cabin_ind exists.
test_data.drop(columns='Cabin', inplace=True)

In [230]:
# Fill only the missing ages with the mean age of the test set.
# The mean must be computed first and passed to fillna(); calling .mean() on
# the result of fillna() collapsed the entire column to one scalar.
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())

In [231]:
# Fill only the missing fares with the mean fare of the test set.
# The mean must be computed first and passed to fillna(); calling .mean() on
# the result of fillna() overwrote every fare with the same value.
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mean())

## Checking for Outliers in Test Data

## Pclass

In [232]:
# Passenger-class distribution in the test data: bar chart plus counts.
sns.countplot(data=test_data, y='Pclass')
test_data['Pclass'].value_counts()

In [233]:
# Encode the categorical Sex column as integers: male -> 1, female -> 0.
# NOTE: this cell is not re-runnable; a second run maps the 0/1 codes to NaN
# and astype(int) then raises.
test_data['Sex'] = test_data['Sex'].map({'male': 1, 'female': 0}).astype(int)
sns.countplot(data=test_data, y='Sex')
test_data['Sex'].value_counts()

## Fare

In [234]:
# Replace Fare outliers (outside the 1.5 * IQR fences) with the column mean,
# showing a box plot before and after for comparison.
# The series is passed as x= explicitly; newer seaborn releases no longer
# treat the first positional argument as x.
sns.boxplot(x=test_data['Fare'])
plt.title("Box Plot before mean imputation")
plt.show()

q1 = test_data['Fare'].quantile(0.25)
q3 = test_data['Fare'].quantile(0.75)
iqr = q3 - q1
Lower_tail = q1 - 1.5 * iqr
Upper_tail = q3 + 1.5 * iqr
# The replacement value is the mean of the original fares, outliers included.
m = test_data['Fare'].mean()

# One vectorised assignment instead of rewriting the whole column once per
# outlier from inside a Python loop.
fare_is_outlier = (test_data['Fare'] > Upper_tail) | (test_data['Fare'] < Lower_tail)
test_data.loc[fare_is_outlier, 'Fare'] = m

sns.boxplot(x=test_data['Fare'])
plt.title("Box Plot after mean imputation")
plt.show()

In [235]:
# Preview the test frame after cleaning (five rows is head()'s default).
test_data.head()

In [236]:
# Remove the same unused columns from the test frame as were removed from
# the training frame.
test_data.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Embarked', 'Age'],
    inplace=True,
)

# Preparing Data for Training

In [237]:
# Split the prepared training frame into the label vector (Survived) and the
# feature matrix (every other column).
y_train = train_data['Survived']
X_train = train_data.drop(columns='Survived')

In [238]:
# Work on a copy: X_test is later modified with in-place drops, and a plain
# alias would silently alter test_data as well.
X_test = test_data.copy()

In [239]:
# Preview the training feature matrix (five rows is head()'s default).
X_train.head()

In [240]:
# Preview the test feature matrix (five rows is head()'s default).
X_test.head()

# First Model

In [241]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with default hyper-parameters on all prepared
# features, followed by predictions for the test features.
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [242]:
# Accuracy of the baseline model, measured on the same rows it was fitted on.

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

log_train_acc = accuracy_score(y_true=y_train, y_pred=logreg.predict(X_train))

print(f"Training accuracy of Logistic Regression Classifier is : {log_train_acc}")


In [243]:
# for Test Dataset
# print(y_pred)
# sns.countplot(y_pred)

# Checking for Multicollinearity

In [244]:
import statsmodels.formula.api as sm

def vif_cal(input_data, dependent_col):
    """Print the variance inflation factor (VIF) of every predictor column.

    Each column other than ``dependent_col`` is regressed, by ordinary least
    squares with an intercept, on all remaining predictor columns, and
    ``VIF = 1 / (1 - R^2)`` of that regression is printed rounded to two
    decimals.

    The fit uses ``numpy.linalg.lstsq`` directly, so the function does not
    depend on which statsmodels module the notebook-global alias ``sm``
    currently points to, nor on formula name lookup.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame holding the target and the predictors; the predictor columns
        must be convertible to float.
    dependent_col : str
        Name of the target column; it is excluded from the calculation.

    Returns
    -------
    None
        One line per predictor is printed. A perfectly collinear predictor
        is reported as ``inf`` and a constant predictor as ``nan``.
    """
    x_vars = input_data.drop(columns=[dependent_col])
    for col_name in x_vars.columns:
        y = x_vars[col_name].to_numpy(dtype=float)
        others = x_vars.drop(columns=[col_name]).to_numpy(dtype=float)
        # Leading column of ones = intercept term.
        design = np.column_stack([np.ones(len(y)), others])
        coef = np.linalg.lstsq(design, y, rcond=None)[0]
        ss_res = float(np.sum((y - design @ coef) ** 2))
        ss_tot = float(np.sum((y - y.mean()) ** 2))
        if ss_tot == 0:
            # Constant column: R^2 is undefined, and so is the VIF.
            vif = float("nan")
        else:
            # ss_res / ss_tot equals 1 - R^2; computing it directly avoids
            # the cancellation in 1 - (1 - ss_res / ss_tot).
            unexplained = ss_res / ss_tot
            if unexplained > 1e-12:
                vif = round(1 / unexplained, 2)
            else:
                # R^2 == 1 up to rounding: perfect collinearity.
                vif = float("inf")
        print(col_name, " VIF = ", vif)

In [245]:
  # VIF of every training feature, with Survived excluded as the target.
  vif_cal(train_data, 'Survived')

No multicollinearity, as all the VIFs are less than 5.

# Checking P-values

In [246]:
# Imported under its own alias: `sm` is already bound to
# statsmodels.formula.api in the VIF cell, and rebinding it here makes that
# cell fail when it is re-run.
import statsmodels.api as sm_api

# statsmodels' Logit does not add an intercept by itself, so a constant column
# is added explicitly; without it the p-values describe a model forced
# through the origin.
Logit_model = sm_api.Logit(y_train, sm_api.add_constant(X_train))
result = Logit_model.fit()
# P>|z| should be below 0.1 for a feature to be kept; summary2() prints the
# summary table for all terms.
print(result.summary2())

Dropping the columns whose p-values are greater than 0.05 (i.e. not statistically significant)

In [247]:
# Remove the two features chosen for removal after the p-value check.
X_train.drop(columns=['Parch', 'Fare'], inplace=True)

In [248]:
# Keep the test features aligned with the reduced training features.
X_test.drop(columns=['Parch', 'Fare'], inplace=True)

# Final Model

In [249]:
from sklearn.linear_model import LogisticRegression

# Final model: logistic regression with default hyper-parameters, refitted on
# the reduced feature set, followed by predictions for the test features.
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [250]:
# Accuracy of the final model, measured on the same rows it was fitted on.

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

log_train_acc = accuracy_score(y_true=y_train, y_pred=logreg.predict(X_train))

print(f"Training accuracy of Logistic Regression Classifier is : {log_train_acc}")


In [251]:
# for Test Dataset
# print(y_pred)
# sns.countplot(y_pred)