In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib

In [None]:
# Read the raw test split, then work on an independent deep copy so the
# frame as loaded from disk stays available unchanged in `test_df`.
test_df = pd.read_csv('./dataset/test.csv')
copy_test_df = test_df.copy(deep=True)

In [None]:
# Number of missing values in each column.
copy_test_df.isna().sum()

In [None]:
# Impute missing fares instead of dropping the rows: this frame is the set of
# passengers to be scored, so a dropped row is a passenger with no prediction.
# Use the median fare of the passenger's class, falling back to the overall
# median when a class has no known fare at all.
fare_class_median = copy_test_df.groupby('Pclass')['Fare'].transform('median')
copy_test_df = copy_test_df.assign(
    Fare=copy_test_df['Fare']
    .fillna(fare_class_median)
    .fillna(copy_test_df['Fare'].median())
)

In [None]:
# Impute missing Age values with a linear regression fitted on the passengers
# whose age is known, using Pclass, Sex, Fare and Embarked as predictors.
age_features = ['Pclass', 'Sex', 'Fare', 'Embarked']
age_missing = copy_test_df['Age'].isna()

train_data = copy_test_df[~age_missing]
test_data = copy_test_df[age_missing]

# One-hot encode the categorical predictors once, on the whole frame, and split
# afterwards. Encoding the two subsets separately can yield different dummy
# columns (or drop a different "first" level) when a category is absent from
# one subset, which silently misaligns the features passed to predict().
age_design = pd.get_dummies(copy_test_df[age_features], drop_first=True)

X_train = age_design[~age_missing]
y_train = train_data['Age']

X_test = age_design[age_missing]

model = LinearRegression()
model.fit(X_train, y_train)

if len(X_test) > 0:
    # A linear model can extrapolate to impossible (zero or negative) ages;
    # keep the imputed values inside the range of ages actually observed.
    predicted_ages = np.clip(model.predict(X_test), y_train.min(), y_train.max())
    copy_test_df.loc[age_missing, 'Age'] = predicted_ages
else:
    # Nothing to impute; predict() would raise on an empty feature matrix.
    predicted_ages = np.array([])

In [None]:
# Impute missing Cabin values with a random-forest classifier trained on the
# rows whose cabin is recorded. Rows without a cabin are first tagged with the
# placeholder 'Unknown' and then overwritten with the predicted cabin label.
copy_test_df['Cabin'] = copy_test_df['Cabin'].fillna('Unknown')
cabin_unknown = copy_test_df['Cabin'] == 'Unknown'

dummy_train_df = pd.get_dummies(copy_test_df, columns=['Sex', 'Embarked'], drop_first=True)

# Predictors: passenger class, fare and the Sex/Embarked indicator columns.
# The indicator names are collected from the encoded frame rather than
# hard-coded, so a category that is absent from this file cannot cause a
# KeyError. The cabin letter is deliberately NOT a predictor: it is derived
# from the target itself and is always 'U' on exactly the rows being predicted,
# a value the classifier would never have seen during fitting.
cabin_feature_columns = ['Pclass', 'Fare'] + [
    column for column in dummy_train_df.columns
    if column.startswith('Sex_') or column.startswith('Embarked_')
]

X_train_cabin = dummy_train_df.loc[~cabin_unknown, cabin_feature_columns]
y_train_cabin = dummy_train_df.loc[~cabin_unknown, 'Cabin']

X_test_cabin = dummy_train_df.loc[cabin_unknown, cabin_feature_columns]

# Fixed seed so that re-running the notebook reproduces the same imputations.
cabin_model = RandomForestClassifier(random_state=42)

if len(X_train_cabin) > 0 and len(X_test_cabin) > 0:
    cabin_model.fit(X_train_cabin, y_train_cabin)
    predicted_cabin = cabin_model.predict(X_test_cabin)
    copy_test_df.loc[cabin_unknown, 'Cabin'] = predicted_cabin
else:
    # Either nothing to learn from or nothing to fill; leave any placeholder
    # values as they are instead of failing inside fit()/predict().
    predicted_cabin = np.array([], dtype=object)

In [None]:
# Derive the engineered features. The keyword order below is the order in
# which the new columns are appended, and later entries read columns created by
# earlier ones (FarePerPerson and IsAlone both use FamilySize).

# How many rows share each ticket value.
tickets_count = copy_test_df['Ticket'].value_counts()

copy_test_df = copy_test_df.assign(
    # Relatives travelling with the passenger (siblings/spouses + parents/children).
    FamilySize=lambda frame: frame['SibSp'].add(frame['Parch']),
    # Fare divided by the travelling party size (relatives plus the passenger).
    FarePerPerson=lambda frame: frame['Fare'].div(frame['FamilySize'].add(1)),
    # 1 when the passenger has no relatives aboard, else 0.
    IsAlone=lambda frame: frame['FamilySize'].eq(0).astype(int),
    # Letters directly preceding a period in the name, e.g. "Mr" from " Mr.".
    Title=lambda frame: frame['Name'].str.extract(r' ([A-Za-z]+)\.', expand = False),
    # Age bucketed into four labelled intervals.
    AgeGroup=lambda frame: pd.cut(
        frame['Age'],
        bins=[0, 12, 18, 60, 100],
        labels=['Child', 'Teen', 'Adult', 'Senior'],
    ),
    # First character of the cabin label.
    Deck=lambda frame: frame['Cabin'].str.get(0),
    # Fare split into quartiles computed from this frame.
    FareGroup=lambda frame: pd.qcut(
        frame['Fare'],
        q=4,
        labels=['Low', 'Medium', 'High', 'Very High'],
    ),
    # Occurrence count of the passenger's ticket value.
    TickerFrequency=lambda frame: frame['Ticket'].map(tickets_count),
    # Interaction term between passenger class and age.
    Pclass_Age=lambda frame: frame['Pclass'].mul(frame['Age']),
    # Combined port/class label such as "S_3".
    Embarked_Pclass=lambda frame: frame['Embarked'].astype(str) + '_' + frame['Pclass'].astype(str),
)

In [None]:
# Remove the raw columns listed below from the working frame.
columns_to_drop = ['Name', 'Ticket', 'Cabin', 'Fare', 'Parch', 'SibSp']
copy_test_df = copy_test_df.drop(columns_to_drop, axis=1)

In [None]:
# Encode Sex as an integer using a fixed vocabulary. Fitting the encoder on the
# data being scored makes the codes depend on which values happen to be present
# (a file containing a single sex would map it to 0 whatever it is). Fitting on
# the explicit pair pins female -> 0 and male -> 1, which is what fit_transform
# produces whenever both values occur, and makes any unexpected value raise
# instead of being silently assigned a code.
le = LabelEncoder()
le.fit(['female', 'male'])
copy_test_df['Sex'] = le.transform(copy_test_df['Sex'])

In [None]:
# One-hot encode the remaining categorical columns; drop_first omits the first
# level of each column.
# NOTE(review): the resulting column set depends on the categories present in
# this file - confirm it matches the columns the saved model was fitted on.
categorical_columns = ['AgeGroup', 'FareGroup', 'Embarked', 'Title', 'Deck', 'Embarked_Pclass']
copy_test_df = pd.get_dummies(
    copy_test_df,
    columns=categorical_columns,
    drop_first=True,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the numeric feature columns to the [0, 1] range.
# NOTE(review): the scaler is fitted on this frame, so the min/max come from
# the data being scored - confirm this matches how the saved model's training
# features were scaled.
columns_to_normalize = [
    'Pclass',
    'Age',
    'FamilySize',
    'FarePerPerson',
    'IsAlone',
    'TickerFrequency',
    'Pclass_Age',
]
scaler = MinMaxScaler()
scaler.fit(copy_test_df[columns_to_normalize])
copy_test_df[columns_to_normalize] = scaler.transform(copy_test_df[columns_to_normalize])

In [None]:
# Preview the first five rows of the processed frame.
copy_test_df.head(n=5)

In [None]:
# Score the processed passengers with the saved model and collect the
# predictions next to their PassengerId.
X_test = copy_test_df.drop(columns=['PassengerId'])

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
model = joblib.load('decision_tree_model.pkl')

# The dummy columns built above depend on the categories present in this file,
# so they need not match the columns the model was fitted on. When the model
# recorded its feature names, reorder to that layout, add absent columns as 0
# and discard columns the model never saw.
# Caveat: a level removed by drop_first here that the model does have a column
# for is encoded as all zeros by this step.
expected_columns = getattr(model, 'feature_names_in_', None)
if expected_columns is not None:
    X_test = X_test.reindex(columns=list(expected_columns), fill_value=0)

y_pred = model.predict(X_test)

# Column name fixed from the misspelled 'Survivied' to 'Survived'.
output = pd.DataFrame({'PassengerId': copy_test_df['PassengerId'], 'Survived': y_pred})

In [None]:
# Display the column labels of the processed frame.
copy_test_df.columns