## IMPORTING NECESSARY MODELS

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import MinMaxScaler

## CLEANING DATA AND NORMALIZING IT

#### Getting Train and Test Data

In [2]:
train_data = pd.read_csv("../../titanic/train.csv")
test_data = pd.read_csv("../../titanic/test.csv")

In [3]:
print(train_data.info(), test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass  

#### Adding NaN data

- Removing "Age" with mode.
- Removing "Cabin" with 0.
- Removing "Embarked" with mode.
- Removing "Fare" with mean

In [4]:
train_data["Age"] = train_data["Age"].fillna(train_data["Age"].mode()[0])
train_data["Cabin"] = train_data["Cabin"].fillna(0)
train_data["Embarked"] = train_data["Embarked"].fillna(train_data["Embarked"].mode()[0])
train_data["Fare"] = train_data["Fare"].fillna(train_data["Fare"].mean())
test_data["Age"] = test_data["Age"].fillna(test_data["Age"].mode()[0])
test_data["Cabin"] = test_data["Cabin"].fillna(0)
test_data["Embarked"] = test_data["Embarked"].fillna(test_data["Embarked"].mode()[0])
test_data["Fare"] = test_data["Fare"].fillna(test_data["Fare"].mean())

In [5]:
print(train_data.info(), test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass  

#### Removing Data that is not numbers

- We can replace "Name", "Ticket", "Cabin" with replace the number of letter

In [6]:
def replace_object(col):
    res = []
    for i in col:
        res.append(len(str(i)))
    return res

In [7]:
train_data["Name"] = replace_object(train_data["Name"])
train_data["Ticket"] = replace_object(train_data["Ticket"])
train_data["Cabin"] = replace_object(train_data["Cabin"])
test_data["Name"] = replace_object(test_data["Name"])
test_data["Ticket"] = replace_object(test_data["Ticket"])
test_data["Cabin"] = replace_object(test_data["Cabin"])

- We can replace "Sex" and "Embarked" with numbers for each class

In [8]:
def replace_classes(classes, col):
    res = []
    for i in col:
        res.append(classes.index(i))
    return res

In [9]:
train_data["Sex"] = replace_classes(["male", "female"], train_data["Sex"])
test_data["Sex"] = replace_classes(["male", "female"], test_data["Sex"])
train_data["Embarked"] = replace_classes(["C", "S", "Q"], train_data["Embarked"])
test_data["Embarked"] = replace_classes(["C", "S", "Q"], test_data["Embarked"])

In [10]:
print(train_data.info(), test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    int64  
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    int64  
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    int64  
 11  Embarked     891 non-null    int64  
dtypes: float64(2), int64(10)
memory usage: 83.7 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 no

In [11]:
train_data.to_csv("../../data_man/final_train.csv")

#### Normalizing Data

In [12]:
scaler = MinMaxScaler()
norm = scaler.fit_transform(train_data)
train_norm_data = pd.DataFrame(norm, columns=train_data.columns)
norm = scaler.fit_transform(test_data)
test_norm_data = pd.DataFrame(norm, columns=test_data.columns)

In [13]:
print(train_norm_data.info(), test_norm_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    float64
 1   Survived     891 non-null    float64
 2   Pclass       891 non-null    float64
 3   Name         891 non-null    float64
 4   Sex          891 non-null    float64
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    float64
 7   Parch        891 non-null    float64
 8   Ticket       891 non-null    float64
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    float64
 11  Embarked     891 non-null    float64
dtypes: float64(12)
memory usage: 83.7 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    float64
 1   Pclass       418 non-null    

#### Data Cleaning Complete

## Splitting Data and Preparing For Training and testing

In [14]:
X, Y = train_norm_data.drop(["Survived"], axis=1), train_norm_data["Survived"]
x, x_test, y, y_test = train_test_split(X,Y,test_size=0.2,train_size=0.8)

In [26]:
perf = []
for i in range(1,100):
    mlp_class = MLPClassifier(hidden_layer_sizes=(10,i*2,i*4), max_iter=2000)
    mlp_class.fit(x_new,y)
    perf.append((mlp_class.score(x_new_test,y_test),i))
max(perf)

(0.8212290502793296, 66)

In [27]:
from sklearn.feature_selection import SequentialFeatureSelector
mlp_class = MLPClassifier(hidden_layer_sizes=(10,66*2,66*4), max_iter=2000)
sfs = SequentialFeatureSelector(mlp_class)
sfs.fit(x,y)
sfs.get_feature_names_out()

array(['Pclass', 'Name', 'Sex', 'Age', 'Embarked'], dtype=object)

In [36]:
x_new = sfs.transform(x)
mlp_class = MLPClassifier(hidden_layer_sizes=(10,66*2,66*4), max_iter=5000)
mlp_class.fit(x_new, y)
x_new_test = sfs.transform(x_test)
mlp_class.score(x_new_test,y_test)

0.8044692737430168

In [37]:
x_final = sfs.transform(X)
mlp_class = MLPClassifier(hidden_layer_sizes=(10,66*2,66*4), max_iter=50000)
mlp_class.fit(x_final, Y)

In [43]:
new_test = sfs.transform(test_norm_data)
final = mlp_class.predict(new_test)

In [46]:
res = []
for i in final:
    res.append(int(i))

In [48]:
pred_csv = pd.read_csv("../../data_man/predictions.csv")
pred_csv = pred_csv.drop(["Unnamed: 0"], axis=1)
pred_csv.insert(1, "Survived", res, True)
pred_csv.to_csv("pred.csv",index=False)