<a href="https://colab.research.google.com/github/127-0-0-vvk/LogisticRegressionTitanic/blob/main/LogisticRegressionTitanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# make sure you download the file Titanic-train.csv to your google
# drive. This code should be able to access it if it is there! Place the file at
# the top level directory of google drive
from google.colab import drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Make sure file is listed - this code should show it 
listed = drive.ListFile({'q': "title contains 'titanic_train.csv'"}).GetList()
print ("fetching csv files...")
for file in listed:
  print('title {}, id {}'.format(file['title'], file['id']))

%pwd
print('listing content')
%ls /content/drive


Mounted at /content/drive
fetching csv files...
listing content
[0m[01;34mMyDrive[0m/


In [None]:

import pandas as pd
import numpy as np
from numpy import loadtxt
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#Import the data set
# NOTE(review): this path is outside the /content/drive mount point used by the
# previous cell - confirm the CSV is really expected directly under /content.
titanic_data = pd.read_csv('/content/titanic_train.csv')

#Exploratory data analysis
# Every chart gets its own figure. Without this, all of the calls below draw
# onto the same implicit "current" Axes and end up overlaid in a single plot.

# Where are the missing values?
fig, ax = plt.subplots()
sns.heatmap(titanic_data.isnull(), cbar=False, ax=ax)
ax.set_title('Missing values per column')
plt.show()

# Survival counts: overall, then broken down by sex and by passenger class.
fig, ax = plt.subplots()
sns.countplot(x='Survived', data=titanic_data, ax=ax)
ax.set_title('Survival counts')
plt.show()

fig, ax = plt.subplots()
sns.countplot(x='Survived', hue='Sex', data=titanic_data, ax=ax)
ax.set_title('Survival counts by sex')
plt.show()

fig, ax = plt.subplots()
sns.countplot(x='Survived', hue='Pclass', data=titanic_data, ax=ax)
ax.set_title('Survival counts by passenger class')
plt.show()

# Distributions of the two continuous features.
fig, ax = plt.subplots()
ax.hist(titanic_data['Age'].dropna())
ax.set(title='Age distribution', xlabel='Age', ylabel='Passengers')
plt.show()

fig, ax = plt.subplots()
ax.hist(titanic_data['Fare'])
ax.set(title='Fare distribution', xlabel='Fare', ylabel='Passengers')
plt.show()

# Age by class. x/y are passed by keyword: recent seaborn releases accept only
# `data` positionally.
fig, ax = plt.subplots()
sns.boxplot(x='Pclass', y='Age', data=titanic_data, ax=ax)
ax.set_title('Age by passenger class')
plt.show()

#Imputation function
def impute_missing_age(columns, class_mean_ages=None):
    """Return a passenger's age, filling a missing value with the class mean.

    Designed to be used with ``DataFrame.apply(..., axis=1)`` on the
    ``['Age', 'Pclass']`` columns.

    Parameters
    ----------
    columns : iterable
        The age followed by the passenger class (a row Series, list or
        tuple). Values are read by position, not by label.
    class_mean_ages : mapping, optional
        Mean age keyed by passenger class. When omitted, the means are
        computed from the module-level ``titanic_data`` frame.

    Returns
    -------
    The original age when it is present; otherwise the mean age of the
    passenger's class, or NaN when no mean is known for that class.
    """
    # Materialising the values reads them by position for a Series, list or
    # tuple alike. Integer keys such as ``columns[0]`` on a label-indexed
    # Series rely on a positional fallback that pandas has deprecated.
    age, passenger_class = list(columns)[:2]

    if not pd.isnull(age):
        return age

    if class_mean_ages is None:
        # mean() skips missing ages, so this matches filtering per class.
        class_mean_ages = titanic_data.groupby('Pclass')['Age'].mean()

    # A plain dict lookup treats 3 and 3.0 as the same key; the class arrives
    # as a float when the row Series has been upcast alongside Age.
    return dict(class_mean_ages).get(passenger_class, np.nan)

#Impute the missing Age data
titanic_data['Age'] = titanic_data[['Age', 'Pclass']].apply(impute_missing_age, axis = 1)

#Reinvestigate missing data
# Drawn on a fresh figure so it is not overlaid on an earlier plot.
fig, ax = plt.subplots()
sns.heatmap(titanic_data.isnull(), cbar=False, ax=ax)
ax.set_title('Missing values per column after Age imputation')
plt.show()

#Drop null data
# Cabin is removed as a whole column first; dropna() then discards the rows
# that still contain a missing value. Reassignment is used instead of
# inplace=True so each step yields an explicit new frame.
titanic_data = titanic_data.drop('Cabin', axis=1).dropna()

#Create dummy variables for Sex and Embarked columns
# drop_first removes one level per feature to avoid perfectly collinear columns.
sex_data = pd.get_dummies(titanic_data['Sex'], drop_first = True)
embarked_data = pd.get_dummies(titanic_data['Embarked'], drop_first = True)

#Add dummy variables to the DataFrame and drop non-numeric data
titanic_data = pd.concat([titanic_data, sex_data, embarked_data], axis = 1)
titanic_data = titanic_data.drop(['Name', 'PassengerId', 'Ticket', 'Sex', 'Embarked'], axis = 1)

#Print the finalized data set
print(titanic_data.head())

#Split the data set into x and y data
# y is the Survived target; x is every remaining column.
y_data = titanic_data['Survived']
x_data = titanic_data.drop('Survived', axis = 1)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

#Split the data set into training data and test data
# 30% of the rows are held out for testing. The fixed random_state makes the
# split, and therefore the metrics printed below, identical on every run.
x_training_data, x_test_data, y_training_data, y_test_data = train_test_split(
    x_data, y_data, test_size = 0.3, random_state = 42)

#Create the model
# max_iter is raised to 1000 to give the solver more iterations to converge.
model = LogisticRegression(max_iter=1000)

#Train the model and create predictions
model.fit(x_training_data, y_training_data)
predictions = model.predict(x_test_data)

#Calculate performance metrics
print(classification_report(y_test_data, predictions))

#Generate a confusion matrix
# Rows are the true classes, columns the predicted classes.
print('*******confusion matrix below*******')
print(confusion_matrix(y_test_data, predictions))