## Import Necessary Modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler


## Import Data to dataframe
Since the data is already split into train and test files, there is no need to use `train_test_split` from scikit-learn.

In [2]:
# Read the pre-split train and test CSVs (paths are relative to the
# notebook's working directory) into two separate dataframes.
train, test = (pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv"))

## Creating X and Y training values

In [3]:
# Target: kept as a single-column DataFrame (double brackets), not a Series.
train_y = train[["Survived"]]

# Features: an explicit list of the input columns, so the target can never
# leak into the feature matrix.
train_x = train[
    [
        "PassengerId",
        "Pclass",
        "Name",
        "Sex",
        "Age",
        "SibSp",
        "Parch",
        "Ticket",
        "Fare",
        "Cabin",
        "Embarked",
    ]
]

## Cleaning the dataset
1. Handling null values, starting by dropping columns in which more than 55% of the values are null.
2. Dropping columns that are irrelevant to the prediction.

In [4]:
# Percentage of null values in every training column.
null_percentage = train_x.isnull().sum() / train_x.shape[0] * 100

# Columns that are more than 55% null; the decision is made on the training
# data only and then applied identically to the test data.
col_to_drop = null_percentage[null_percentage > 55].index.tolist()

# Name and Ticket are dropped as well because they are treated as irrelevant.
col_to_drop.extend(["Name", "Ticket"])

# drop() returns new frames, so the raw `train` / `test` frames stay untouched.
train_x = train_x.drop(columns=col_to_drop)
test_x = test.drop(columns=col_to_drop)

# Convert string categories into integer classes.
# One encoder per column, fitted on the TRAINING data only and then reused to
# transform the test data, so a given category always maps to the same integer
# in both sets. (Re-fitting on the test set can silently produce a different
# mapping whenever the two sets do not contain the same categories.)
label_encoders = {}
for col in ["Sex", "Embarked"]:
    # Fill missing categories before encoding: a NaN handed to LabelEncoder is
    # not a real category, and once encoded it could no longer be repaired by
    # a later numeric fillna. The most frequent training value is used for
    # both sets.
    most_frequent = train_x[col].mode().iloc[0]
    train_x[col] = train_x[col].fillna(most_frequent)
    test_x[col] = test_x[col].fillna(most_frequent)

    le_x = LabelEncoder()
    train_x[col] = le_x.fit_transform(train_x[col])
    # transform() raises ValueError on a category unseen during fit, which is
    # preferable to silently assigning it an arbitrary code.
    test_x[col] = le_x.transform(test_x[col])
    label_encoders[col] = le_x



## Filling null values

In [5]:
# Impute remaining nulls with the column means of the TRAINING set.
# The same training statistics are applied to the test set so both sets go
# through identical preprocessing and no test-set information is used.
# numeric_only=True keeps this working even if a non-numeric column is present.
train_means = train_x.mean(numeric_only=True)

# Reassign instead of inplace=True so the cell has no hidden in-place mutation.
train_x = train_x.fillna(train_means)
test_x = test_x.fillna(train_means)

## Scaling the data

In [8]:
# Standardize features using statistics learned from the training set only.
std_x = StandardScaler()

# fit_transform learns the per-column mean / std on the training data and
# applies them; the result is a NumPy array, not a DataFrame.
train_x = std_x.fit_transform(train_x)

# The test set must be scaled with the SAME fitted parameters: calling
# fit_transform here would re-fit the scaler on the test data and put the two
# sets on different scales.
test_x = std_x.transform(test_x)