## Imports

In [52]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics import mean_squared_error, accuracy_score

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

## load data

In [62]:
raw_train_data = pd.read_csv('./../../dataset/titanic/train.csv')
raw_test_data = pd.read_csv('./../../dataset/titanic/test.csv')
original_test_data = pd.read_csv('./../../dataset/titanic/test.csv')

In [96]:
print(raw_train_data.shape)
print(raw_test_data.shape)

(891, 12)
(418, 11)


In [31]:
print(raw_train_data.info())
print('-'*30)
print(raw_test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None
------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null obje

In [32]:
raw_train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [33]:
raw_test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# The 4C's Correcting, Completing, Creating and Converting

### Correcting: delete unreasonable values or useless columns

In [34]:
delete_columns = ['PassengerId', 'Ticket', 'Cabin']
raw_train_data.drop(delete_columns, axis=1, inplace=True)
raw_test_data.drop(delete_columns, axis=1, inplace=True)

# The 4C's Correcting, Completing, Creating and Converting

### Correcting: delete unreasonable values or useless columns

In [62]:
raw_train_data = pd.read_csv('./../../dataset/titanic/train.csv')
raw_test_data = pd.read_csv('./../../dataset/titanic/test.csv')
original_test_data = pd.read_csv('./../../dataset/titanic/test.csv')

### Completing: filling null values

In [35]:
print(raw_train_data.isnull().sum())
print('-'*30)
print(raw_test_data.isnull().sum())

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64
------------------------------
Pclass       0
Name         0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64


In [44]:
# Null Age values filled with the median value 
raw_train_data['Age'].fillna(raw_train_data['Age'].median(), inplace=True)
raw_test_data['Age'].fillna(raw_train_data['Age'].median(), inplace=True)

# Null Fare values filled with the median value
raw_train_data['Fare'].fillna(raw_train_data['Fare'].median(), inplace=True)
raw_test_data['Fare'].fillna(raw_train_data['Fare'].median(), inplace=True)



# Null Embarked values filled with the mode
raw_train_data['Embarked'].fillna(raw_train_data['Embarked'].mode()[0], inplace=True)
raw_test_data['Embarked'].fillna(raw_train_data['Embarked'].mode()[0], inplace=True)

### Creating: Feature creation

- **Title:** extract the title from the name column
- **Bins:** create bins for features like Age, or Fare
- **FamilySize:** use columns like **SibSp** and **Parch** to know the number of family members
- **IsAlone:** if the passenger had any family member aboard

In [45]:
raw_train_data['Title'] = raw_train_data['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
title_filter = (raw_train_data['Title'].value_counts() < 10)
raw_train_data['Title'] = raw_train_data['Title'].apply(lambda x: 'Misc' if title_filter.loc[x] == True else x)

raw_train_data['AgeBin'] = pd.cut(raw_train_data['Age'], 5)
raw_train_data['FareBin'] = pd.qcut(raw_train_data['Fare'], 4)

raw_train_data['FamilySize'] = raw_train_data['SibSp'] + raw_train_data['Parch'] + 1

raw_train_data['IsAlone'] = 1
raw_train_data['IsAlone'].loc[raw_train_data['FamilySize'] > 1] = 0

train_data = raw_train_data.drop(['Name', 'Age', 'Fare'], axis=1)
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,Title,AgeBin,FareBin,FamilySize,IsAlone
0,0,3,male,1,0,S,Mr,"(16.336, 32.252]","(-0.001, 7.91]",2,0
1,1,1,female,1,0,C,Mrs,"(32.252, 48.168]","(31.0, 512.329]",2,0
2,1,3,female,0,0,S,Miss,"(16.336, 32.252]","(7.91, 14.454]",1,1
3,1,1,female,1,0,S,Mrs,"(32.252, 48.168]","(31.0, 512.329]",2,0
4,0,3,male,0,0,S,Mr,"(32.252, 48.168]","(7.91, 14.454]",1,1


In [46]:
raw_test_data['Title'] = raw_test_data['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
title_filter = (raw_test_data['Title'].value_counts() < 10)
raw_test_data['Title'] = raw_test_data['Title'].apply(lambda x: 'Misc' if title_filter.loc[x] == True else x)

raw_test_data['AgeBin'] = pd.cut(raw_test_data['Age'], 5)
raw_test_data['FareBin'] = pd.qcut(raw_test_data['Fare'], 4)

raw_test_data['FamilySize'] = raw_test_data['SibSp'] + raw_test_data['Parch'] + 1

raw_test_data['IsAlone'] = 1
raw_test_data['IsAlone'].loc[raw_test_data['FamilySize'] > 1] = 0

test_data = raw_test_data.drop(['Name', 'Age', 'Fare'], axis=1)
test_data.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,Title,AgeBin,FareBin,FamilySize,IsAlone
0,3,male,0,0,Q,Mr,"(30.502, 45.668]","(-0.001, 7.896]",1,1
1,3,female,1,0,S,Mrs,"(45.668, 60.834]","(-0.001, 7.896]",2,0
2,2,male,0,0,Q,Mr,"(60.834, 76.0]","(7.896, 14.454]",1,1
3,3,male,0,0,S,Mr,"(15.336, 30.502]","(7.896, 14.454]",1,1
4,3,female,1,1,S,Mrs,"(15.336, 30.502]","(7.896, 14.454]",3,0


In [41]:
print(train_data.isnull().sum())

Survived      0
Pclass        0
Sex           0
SibSp         0
Parch         0
Embarked      2
Title         0
AgeBin        0
FareBin       0
FamilySize    0
IsAlone       0
Sex_Code      0
dtype: int64


### Converting: Creating Dummy/Encoded Variables

In [49]:
encoder = LabelEncoder()
train_data['Sex_Code'] = encoder.fit_transform(train_data['Sex']) 
train_data['Embarked_Code'] = encoder.fit_transform(train_data['Embarked']) 
train_data['AgeBin_Code'] = encoder.fit_transform(train_data['AgeBin']) 
train_data['FareBin_Code'] = encoder.fit_transform(train_data['FareBin'])
train_data['Title_Code'] = encoder.fit_transform(train_data['Title'])

train_data.drop(['Sex', 'Embarked', 'Title', 'AgeBin', 'FareBin'], axis=1, inplace=True)
train_data.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,FamilySize,IsAlone,Sex_Code,Embarked_Code,AgeBin_Code,FareBin_Code,Title_Code
0,0,3,1,0,2,0,1,2,1,0,3
1,1,1,1,0,2,0,0,0,2,3,4
2,1,3,0,0,1,1,0,2,1,1,2
3,1,1,1,0,2,0,0,2,2,3,4
4,0,3,0,0,1,1,1,2,2,1,3


In [50]:
encoder2 = LabelEncoder()
test_data['Sex_Code'] = encoder2.fit_transform(test_data['Sex']) 
test_data['Embarked_Code'] = encoder2.fit_transform(test_data['Embarked']) 
test_data['AgeBin_Code'] = encoder2.fit_transform(test_data['AgeBin']) 
test_data['FareBin_Code'] = encoder2.fit_transform(test_data['FareBin'])
test_data['Title_Code'] = encoder2.fit_transform(test_data['Title'])

test_data.drop(['Sex', 'Embarked', 'Title', 'AgeBin', 'FareBin'], axis=1, inplace=True)

test_data.head()

Unnamed: 0,Pclass,SibSp,Parch,FamilySize,IsAlone,Sex_Code,Embarked_Code,AgeBin_Code,FareBin_Code,Title_Code
0,3,0,0,1,1,1,1,2,0,3
1,3,1,0,2,0,0,2,3,0,4
2,2,0,0,1,1,1,1,4,1,3
3,3,0,0,1,1,1,2,1,1,3
4,3,1,1,3,0,0,2,1,1,4


# Train Test Split

In [90]:
X = train_data.drop(['Survived'], axis=1)
y = train_data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Tree Classifier

In [76]:
tree_clf = DecisionTreeClassifier(max_depth=5)
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [77]:
y_predict = tree_clf.predict(X_test)
print('Accuracy score', accuracy_score(y_test, y_predict))

Accuracy score 0.8161434977578476


In [78]:
y_predict = tree_clf.predict(test_data)

In [79]:
submission = pd.DataFrame({'PassengerId' : original_test_data['PassengerId'], 'Survived':y_predict})
submission.to_csv('./submissions/submission.csv', index=False)

## KNN

In [86]:
KNN = KNeighborsClassifier(n_neighbors=5)
scaler = StandardScaler() 
X_norm = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.25, random_state=42)
KNN.fit(X_train, y_train)

test_data_norm = scaler.transform(test_data)
y_predict_knn = KNN.predict(test_data_norm)

submission = pd.DataFrame({'PassengerId' : original_test_data['PassengerId'], 'Survived':y_predict_knn})
submission.to_csv('./submissions/submission_knn.csv', index=False)

## Logistic Regression

In [95]:
log_reg = LogisticRegression(solver='lbfgs', penalty='l2')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
log_reg.fit(X_train, y_train)
y_predict_log = log_reg.predict(test_data)

submission = pd.DataFrame({'PassengerId' : original_test_data['PassengerId'], 'Survived':y_predict_log})
submission.to_csv('./submissions/submission_log_reg.csv', index=False)

In [38]:
train_data = train_data.drop(['Name', 'Cabin', 'Ticket', 'PassengerId'], axis=1)

KeyError: "['Name' 'Cabin' 'Ticket' 'PassengerId'] not found in axis"

In [None]:
test_data = test_data.drop(['Name', 'Cabin', 'Ticket', 'PassengerId'], axis=1)

In [None]:
#train_data['Cabin'] = train_data['Cabin'].replace(np.nan, 'C')
train_data['Embarked'] = train_data['Embarked'].replace(np.nan, 'X')
train_data['Age'] = train_data['Age'].replace(np.nan, -1)
train_data['Fare'] = train_data['Fare'].replace(np.nan, -1)

In [None]:
#test_data['Cabin'] = test_data['Cabin'].replace(np.nan, 'C')
test_data['Embarked'] = test_data['Embarked'].replace(np.nan, 'X')
test_data['Age'] = test_data['Age'].replace(np.nan, -1)
test_data['Fare'] = test_data['Fare'].replace(np.nan, -1)

In [None]:
test_data.head()

In [None]:
train_data.info()

In [None]:
test_data.info()

In [None]:
train_data['Sex'] = np.where(train_data['Sex']=='male',1,0)
test_data['Sex'] = np.where(test_data['Sex']=='male',1,0)

## Creating Dummy Variables

In [None]:
train_data['Embarked'] = pd.get_dummies(train_data['Embarked'], prefix_sep='_')
test_data['Embarked'] = pd.get_dummies(test_data['Embarked'], prefix_sep='_')
train_data.head()

In [None]:
test_data.head()

In [None]:
test_data.info()

## Train - Test - Split

In [None]:
X = train_data.drop(['Survived'], axis=1)
y = train_data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## Decision Tree Regressor

In [None]:
tree_rg = DecisionTreeClassifier()
tree_rg.fit(X_train, y_train)

In [None]:
y_predict = tree_rg.predict(X_test)

In [None]:
scores = cross_val_score(tree_rg, X_train, y_train, scoring='accuracy', cv=5)
scores

In [None]:
print('Accuracy score', accuracy_score(y_test, y_predict))

## Logistic Regression

In [None]:
log_reg = LogisticRegression(solver='lbfgs', penalty='l2')

In [None]:
log_reg.fit(X_train, y_train)

In [None]:
y_predict_lr = log_reg.predict(X_test)

In [None]:
print('Accuracy score', accuracy_score(y_test, y_predict_lr))

## KNN

In [None]:
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.25, random_state=42)
KNN = KNeighborsClassifier(n_neighbors=20)

KNN.fit(X_train, y_train)

y_predict_knn = KNN.predict(X_test)

print('Accuracy score', accuracy_score(y_test, y_predict_knn))

## Getting categorical data columns

In [None]:
categorical = train_data.select_dtypes(include=[object])
categorical.shape

In [None]:
categorical.head()

In [None]:
cat_columns = categorical.columns
cat_columns

## Label Encoding

In [None]:
label_encoder = LabelEncoder()
features = categorical.apply(label_encoder.fit_transform)
features.info()

## One Hot Encoding Categorical Data 

In [None]:
oneh_encoder = OneHotEncoder()
oneh_encoder.fit(features)

In [None]:
#one_hot_encoded = oneh_encoder.transform(features).toarray()
#one_hot_encoded.shape

In [None]:
#print(label_encoder.classes_)
OneHot = pd.DataFrame(oneh_encoder.transform(features),columns=list(label_encoder.classes_))
#OneHot = OneHot.set_index(train_data.index)
#categorical = pd.concat([train_data, OneHot], axis=1)
#OneHot.info()

In [None]:
train_data = train_data.drop(cat_columns, axis=1)
train_data.head()

In [None]:
train_data.join(one_hot_labels)