In [1]:
# https://www.kaggle.com/c/titanic/data --> Kaggle Titanic Dataset 
import pandas as pd # import pandas for dataframe
from sklearn import preprocessing 

In [2]:
train = pd.read_csv('../Titanic Data/train.csv')
test = pd.read_csv('../Titanic Data/test.csv')
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [3]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html
train = train.drop(columns=['Ticket', 'PassengerId', 'Name', 'Cabin'])
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


# Why I dropped the above columns 
- Ticket: hard to process (each ticket is unique), does not add any new information that Pclass does not provide. 
- PassengerID: effectively is just the row number in the data, which is not helpful. 
- Name: hard to process (each name is unique), does not add any new information that Sex does not provide. 
- Cabin: hard to process (mainly just null values), does not add any new information that Pclass does not provide.

In [4]:
def convert_sex(sex):
    if sex == 'male':
        return 0
    elif sex == 'female':
        return 1
    else:
        print(f"ERROR: SEX WAS NEITHER MALE NOR FEMALE {sex}")
        
train['Sex'] = train['Sex'].apply(convert_sex)
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,S
1,1,1,1,38.0,1,0,71.2833,C
2,1,3,1,26.0,0,0,7.925,S
3,1,1,1,35.0,1,0,53.1,S
4,0,3,0,35.0,0,0,8.05,S


In [5]:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder
# https://stackoverflow.com/questions/43588679/issue-with-onehotencoder-for-categorical-features/43589167#43589167
ohe_class = preprocessing.OneHotEncoder(sparse=False) 
one_hot_encoded_class = ohe_class.fit_transform(train['Pclass'].values.reshape(-1, 1))
train['Upper Class'] = one_hot_encoded_class[:, 0]
train['Middle Class'] = one_hot_encoded_class[:, 1]
train['Lower Class'] = one_hot_encoded_class[:, 2]
train = train.drop(columns=['Pclass'])
train.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Embarked,Upper Class,Middle Class,Lower Class
0,0,0,22.0,1,0,7.25,S,0.0,0.0,1.0
1,1,1,38.0,1,0,71.2833,C,1.0,0.0,0.0
2,1,1,26.0,0,0,7.925,S,0.0,0.0,1.0
3,1,1,35.0,1,0,53.1,S,1.0,0.0,0.0
4,0,0,35.0,0,0,8.05,S,0.0,0.0,1.0


In [6]:
embarked_vals = train['Embarked'].values
print(embarked_vals) 
# lovely, there are two entries that are NaN that have to be dealt with... 

['S' 'C' 'S' 'S' 'S' 'Q' 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'Q' 'S'
 'S' 'C' 'S' 'S' 'Q' 'S' 'S' 'S' 'C' 'S' 'Q' 'S' 'C' 'C' 'Q' 'S' 'C' 'S'
 'C' 'S' 'S' 'C' 'S' 'S' 'C' 'C' 'Q' 'S' 'Q' 'Q' 'C' 'S' 'S' 'S' 'C' 'S'
 'C' 'S' 'S' 'C' 'S' 'S' 'C' nan 'S' 'S' 'C' 'C' 'S' 'S' 'S' 'S' 'S' 'S'
 'S' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'Q' 'S' 'S' 'S' 'S' 'S' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'S' 'C' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S'
 'S' 'Q' 'S' 'C' 'S' 'S' 'C' 'S' 'Q' 'S' 'C' 'S' 'S' 'S' 'C' 'S' 'S' 'C'
 'Q' 'S' 'C' 'S' 'C' 'S' 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'C' 'C' 'S' 'S' 'Q'
 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'C' 'Q' 'S' 'S' 'S' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'Q' 'S' 'S' 'C' 'S' 'S' 'C' 'S' 'S'
 'S' 'C' 'S' 'S' 'S' 'S' 'Q' 'S' 'Q' 'S' 'S' 'S' 'S' 'S' 'C' 'C' 'Q' 'S'
 'Q' 'S' 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'C' 'Q' 'C' 'S' 'S' 'S' 'S' 'Q' 'C'
 'S' 'S' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'S' 'C' 'Q' 'S' 'S' 'C' 'Q' 'S

In [7]:
# below code determines which port is the mode, and sets the NaN datapoints to it
C = 0
Q = 0
S = 0
typical_port = 'C'
NaN_indexes = []
index = 0
for port in train['Embarked'].values:
    if port == 'C':
        C += 1
    elif port == 'Q':
        Q += 1
    elif port == 'S':
        S += 1
    else:
        NaN_indexes.append(index)
    index += 1
if Q > C and Q > S:
    typical_port = 'Q'
elif S > Q and S > C:
    typical_port = 'S'
for NaN_index in NaN_indexes:
    embarked_vals[NaN_index] = typical_port
print(embarked_vals)

['S' 'C' 'S' 'S' 'S' 'Q' 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'Q' 'S'
 'S' 'C' 'S' 'S' 'Q' 'S' 'S' 'S' 'C' 'S' 'Q' 'S' 'C' 'C' 'Q' 'S' 'C' 'S'
 'C' 'S' 'S' 'C' 'S' 'S' 'C' 'C' 'Q' 'S' 'Q' 'Q' 'C' 'S' 'S' 'S' 'C' 'S'
 'C' 'S' 'S' 'C' 'S' 'S' 'C' 'S' 'S' 'S' 'C' 'C' 'S' 'S' 'S' 'S' 'S' 'S'
 'S' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'Q' 'S' 'S' 'S' 'S' 'S' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'S' 'C' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S'
 'S' 'Q' 'S' 'C' 'S' 'S' 'C' 'S' 'Q' 'S' 'C' 'S' 'S' 'S' 'C' 'S' 'S' 'C'
 'Q' 'S' 'C' 'S' 'C' 'S' 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'C' 'C' 'S' 'S' 'Q'
 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'C' 'Q' 'S' 'S' 'S' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'Q' 'S' 'S' 'C' 'S' 'S' 'C' 'S' 'S'
 'S' 'C' 'S' 'S' 'S' 'S' 'Q' 'S' 'Q' 'S' 'S' 'S' 'S' 'S' 'C' 'C' 'Q' 'S'
 'Q' 'S' 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'C' 'Q' 'C' 'S' 'S' 'S' 'S' 'Q' 'C'
 'S' 'S' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'S' 'C' 'Q' 'S' 'S' 'C' 'Q' 'S

In [8]:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder.fit_transform
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder
# https://stackoverflow.com/questions/43588679/issue-with-onehotencoder-for-categorical-features/43589167#43589167
embarked_label_encoded = preprocessing.LabelEncoder().fit_transform(embarked_vals).reshape(-1, 1)
ohe_embarked = preprocessing.OneHotEncoder(sparse=False)
one_hot_encoded_embarked = ohe_embarked.fit_transform(embarked_label_encoded)
train['Embarked Cherbourg'] = one_hot_encoded_embarked[:, 0]
train['Embarked Queenstown'] = one_hot_encoded_embarked[:, 1]
train['Embarked Southampton'] = one_hot_encoded_embarked[:, 2]
train.drop(columns=['Embarked'])
train.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Embarked,Upper Class,Middle Class,Lower Class,Embarked Cherbourg,Embarked Queenstown,Embarked Southampton
0,0,0,22.0,1,0,7.25,S,0.0,0.0,1.0,0.0,0.0,1.0
1,1,1,38.0,1,0,71.2833,C,1.0,0.0,0.0,1.0,0.0,0.0
2,1,1,26.0,0,0,7.925,S,0.0,0.0,1.0,0.0,0.0,1.0
3,1,1,35.0,1,0,53.1,S,1.0,0.0,0.0,0.0,0.0,1.0
4,0,0,35.0,0,0,8.05,S,0.0,0.0,1.0,0.0,0.0,1.0
