# Import Libraries



In [1]:
# Core data libraries, one import per line.
import pandas as pd
import numpy as np

# Colab-specific helper: mounting exposes Google Drive under /content/gdrive.
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Load Dataset

In [3]:
# Location of the Titanic CSV on the mounted Google Drive.
path = '/content/gdrive/MyDrive/DigitalSkola/Dataset/titanic.csv'

# Read the file into a DataFrame and preview its last five rows.
data = pd.read_csv(filepath_or_buffer=path)
data.tail(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


# Data Quality Checking

In [7]:
# Dataset dimensions as a (rows, columns) tuple
data.shape

(891, 12)

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [33]:
# Count the missing values in each column
missing_per_column = data.isnull().sum()
print(missing_per_column)

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [25]:
# Count the rows that exactly repeat an earlier row
duplicate_rows = data.duplicated().sum()
print(duplicate_rows)

0


In [26]:
# Keep only the first occurrence of any fully repeated row
data = data.drop_duplicates(keep='first')

In [15]:
# Impute missing ages with the mean age.
# NOTE(review): the mean is taken over every row, including rows that the
# later train/test split assigns to the test set — confirm this is acceptable.
age_mean = data['Age'].mean()
data['Age'] = data['Age'].fillna(age_mean)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [30]:
# Impute missing ports of embarkation with the most frequent port.
# `mode()` returns a Series (it can hold several values on a tie), so the
# first entry is taken. The computed mode is what gets filled in, rather than
# a hard-coded 'S'; 'S' remains only as a fallback for the degenerate case
# where every Embarked value is missing and the mode is therefore empty.
emb_mode = data['Embarked'].mode()
emb_fill = emb_mode.iloc[0] if not emb_mode.empty else 'S'
data['Embarked'] = data['Embarked'].fillna(emb_fill)

In [32]:
# Remove the Cabin column from the dataset
data = data.drop(columns='Cabin')

# One-Hot Encoding and Label Encoding

In [34]:
# One-hot encode the categorical columns Sex and Embarked.
# drop_first=True drops the first level of each column to avoid a redundant indicator.
# dtype=int keeps the indicators as 0/1 on every pandas version
# (pandas >= 2.0 would otherwise return True/False columns).
data = pd.get_dummies(data, columns=['Sex', 'Embarked'], drop_first=True, dtype=int)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,1,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,0,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,1,0,1


In [36]:
# Label Encoder
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
data['Pclass'] = label.fit_transform(data['Pclass'])

In [38]:
# Drop the identifier / free-text columns, then preview the result
unused_columns = ['PassengerId', 'Name', 'Ticket']
data = data.drop(columns=unused_columns)
data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,2,22.0,1,0,7.25,1,0,1
1,1,0,38.0,1,0,71.2833,0,0,0
2,1,2,26.0,0,0,7.925,0,0,1
3,1,0,35.0,1,0,53.1,0,0,1
4,0,2,35.0,0,0,8.05,1,0,1


# Train Test Split

In [39]:
from sklearn.model_selection import train_test_split

# The target is the Survived column; every remaining column is a feature
y = data['Survived']
X = data.drop(columns=['Survived'])

In [42]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)

# Standardization and Normalization

In [47]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics on the training set only, then apply them to both sets
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [49]:
# Wrap the scaled data in DataFrames again, reusing the feature names from X.
# NOTE(review): the rebuilt frames get a fresh 0..n-1 index, while y_train /
# y_test presumably keep their original row labels — verify before relying on
# any index-based alignment between them.
feature_names = X.columns
X_train, X_test = (
    pd.DataFrame(scaled, columns=feature_names) for scaled in (X_train, X_test)
)
X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0.84105,-0.200197,-0.449916,2.101396,-0.416075,-1.39361,-0.306108,0.609811
1,-0.366264,1.694513,-0.449916,-0.4562,-0.388502,0.717561,-0.306108,0.609811
2,-0.366264,-0.200197,-0.449916,-0.4562,-0.379091,0.717561,-0.306108,0.609811
3,0.84105,0.036642,-0.449916,-0.4562,-0.493013,0.717561,-0.306108,0.609811
4,0.84105,0.012888,6.872147,2.101396,0.741305,0.717561,-0.306108,0.609811


# Handling Class Imbalance

In [50]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly resample the majority class of the training set (sampling_strategy='majority').
# random_state is fixed (same seed as the train/test split) so the sampled rows
# are identical on every Restart & Run All; without it each run differs.
# NOTE: despite the `_over` suffix these variables hold the *under*-sampled
# data; the names are kept unchanged in case later cells refer to them.
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=12)
X_over, y_over = undersample.fit_resample(X_train, y_train)

