In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the training data from train.csv into a DataFrame.
train = pd.read_csv('train.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
train.head()

# Missing Data

In [None]:
# Fraction of missing values per column. The raw boolean mask from isnull()
# is as large as the frame itself and cannot be read at a glance, so it is
# reduced to one number per column.
train.isnull().mean()

In [None]:
# Draw the null mask as a heatmap (no row labels, no colour bar) so that
# missing entries stand out column by column.
sns.heatmap(
    train.isnull(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)

Roughly 20 percent of the Age data is missing. That proportion is likely small enough for reasonable replacement with some form of imputation. Looking at the Cabin column, it looks like we are missing too much of that data to do anything useful with it at a basic level. We'll probably drop it later, or change it to another feature like "Cabin Known: 1 or 0".

Let's continue on by visualizing some more of the data! Check out the video for full explanations of these plots; this code just serves as a reference.

In [None]:
# Bar chart of how many passengers fall into each Survived value.
sns.set_style('whitegrid')
sns.countplot(data=train, x='Survived')

In [None]:
# Survived counts, with one bar per Sex inside each Survived value.
sns.set_style('whitegrid')
sns.countplot(
    data=train,
    x='Survived',
    hue='Sex',
    palette='RdBu_r',
)

In [None]:
# Survived counts, with one bar per Pclass inside each Survived value.
sns.set_style('whitegrid')
sns.countplot(
    data=train,
    x='Survived',
    hue='Pclass',
    palette='rainbow',
)

In [None]:
# Histogram of the recorded ages (nulls removed first), 40 bins, no KDE curve.
sns.displot(
    train['Age'].dropna(),
    bins=40,
    color='darkred',
    kde=False,
)

In [None]:
# Same age histogram with default binning and semi-transparent bars.
sns.displot(
    train['Age'].dropna(),
    alpha=0.3,
    color='darkred',
    kde=False,
)

In [None]:
# Number of passengers for each SibSp value.
sns.countplot(data=train, x='SibSp')

In [None]:
# Fare distribution drawn with the pandas histogram helper (40 bins).
train['Fare'].hist(
    bins=40,
    color='g',
    figsize=(8, 4),
)

# Data Cleaning

In [None]:
# Open a 12x8 figure and compare the Age distribution across Pclass values.
plt.figure(figsize=(12, 8))
sns.boxplot(data=train, x='Pclass', y='Age')

In [None]:
def impute_age(cols):
    """Return a passenger's age, substituting a per-class default when missing.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order (Age, Pclass), e.g. one row of
        ``train[['Age', 'Pclass']]`` as handed over by
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The original age when it is not null; otherwise 37 for class 1,
    29 for class 2 and 24 for any other class.
    """
    # Read both fields by position explicitly. Plain ``cols[0]`` on a Series
    # with string labels relies on a deprecated positional fallback in pandas,
    # so use ``.iloc`` when it is available and plain indexing otherwise.
    if hasattr(cols, 'iloc'):
        age, pclass = cols.iloc[0], cols.iloc[1]
    else:
        age, pclass = cols[0], cols[1]

    if not pd.isnull(age):
        return age

    # Fallback ages per class -- presumably read off the Age-by-Pclass
    # boxplot; confirm against the data.
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24

In [None]:
# Run impute_age over every (Age, Pclass) row and write the result back to Age.
train['Age'] = train[['Age', 'Pclass']].apply(
    impute_age,
    axis=1,
)

In [None]:
# Re-draw the null mask to check which columns still contain missing values.
sns.heatmap(
    train.isnull(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)

In [None]:
# Cabin is missing for most passengers, so remove that column before dropping
# incomplete rows; otherwise dropna() would discard every passenger without a
# recorded cabin. errors='ignore' keeps the cell safe to re-run once Cabin is
# already gone. Rows that still contain a null are then removed.
train = train.drop(columns='Cabin', errors='ignore').dropna()

In [None]:
# Preview the frame after the null-handling step.
train.head()

# Converting Categorical Features

We'll need to convert categorical features to dummy variables using pandas! Otherwise our machine learning algorithm won't be able to directly take in those features as inputs.

In [None]:
# Summarise columns, non-null counts and dtypes to spot the non-numeric columns.
train.info()

In [None]:
# Peek at the dummy encoding of Embarked; drop_first leaves out the first level.
pd.get_dummies(
    train['Embarked'],
    drop_first=True,
).head()

In [None]:
# Dummy-encode Sex and Embarked, leaving out the first level of each.
sex = pd.get_dummies(train['Sex'], drop_first=True)
embark = pd.get_dummies(train['Embarked'], drop_first=True)

In [None]:
# Swap the raw text columns for the dummy-encoded `sex` and `embark` frames
# built from them. Without the concat those encodings are never attached to
# `train`, and the Sex/Embarked information would be lost to the model.
train = pd.concat(
    [train.drop(['Sex', 'Embarked', 'Name', 'Ticket'], axis=1), sex, embark],
    axis=1,
)

In [None]:
# Preview the frame after the raw text columns have been removed.
train.head()

# Building a Logistic Regression model

Let's start by splitting our data into a training set and a test set (there is another test.csv file that you can play around with in case you want to use all this data for training).

# Train Test Split

In [None]:
# Feature matrix preview: every column except the Survived target.
train.drop(columns='Survived').head()

In [None]:
# Target preview: the Survived column on its own.
train['Survived'].head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns='Survived'),
    train['Survived'],
    test_size=0.30,
    random_state=101,
)

# Training and Predicting

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic regression on the training split. The features are not
# scaled, so the solver's default cap of 100 iterations may be too low to
# converge; allow more iterations.
logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train, y_train)