# Titanic Machine Learning Project

### Add Imports

In [57]:
#Imports

#data imports
import pandas as pd
import numpy as np

#machine learning imports
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

#visualization imports
import matplotlib.pyplot as plt
import seaborn as sns


### Create the Dataframe

In [58]:
# Load the raw passenger records from the project's data folder.
data = pd.read_csv("data/titanic.csv")

# Column dtypes and non-null counts, followed by the number of missing
# values in each column.
data.info()
print(data.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


### Fill in missing ages

In [59]:
def fill_missing_ages(df):
    """Fill missing ``Age`` values in place with the median age of the row's ``Pclass``.

    Rows that already have an age are left untouched. If a class has no
    observed ages at all (so its median is NaN), the overall median of the
    observed ages is used instead so that no gap is left behind.

    Args:
        df: DataFrame containing at least the ``Pclass`` and ``Age`` columns.
            The ``Age`` column is overwritten on this object.

    Returns:
        None. The frame is modified in place.
    """
    observed_ages = df["Age"]

    # One median per passenger class, broadcast back to every row of that
    # class. This replaces a Python-level lookup executed once per row.
    class_medians = df.groupby("Pclass")["Age"].transform("median")

    # Computed from the original column, before any filling, so the fallback
    # is not influenced by the class-level fill.
    overall_median = observed_ages.median()

    df["Age"] = observed_ages.fillna(class_medians).fillna(overall_median)


### Data Cleaning and Feature Engineering

In [60]:
def preprocess_data(df, age_bins=(0, 12, 20, 40, 60, np.inf), fare_quantiles=4):
    """Clean the passenger frame and add engineered feature columns.

    The frame is modified in place and the same object is returned, so both
    ``preprocess_data(df)`` and ``df = preprocess_data(df)`` work.

    Steps:
        1. Drop ``PassengerId``, ``Name``, ``Ticket``, ``Cabin`` and ``Embarked``.
        2. Fill missing ``Age`` values via ``fill_missing_ages``.
        3. Encode ``Sex`` as ``male -> 1`` and ``female -> 0``.
        4. Add ``FamilySize``, ``IsAlone``, ``FareBin`` and ``AgeBin``.

    Args:
        df: Raw passenger DataFrame containing the columns named above plus
            ``Pclass``, ``SibSp``, ``Parch`` and ``Fare``.
        age_bins: Ascending bin edges used for ``AgeBin``. The lowest edge is
            inclusive, so an age equal to it lands in the first bin.
        fare_quantiles: Number of equal-frequency buckets used for ``FareBin``.

    Returns:
        The same DataFrame object, after modification.

    Raises:
        KeyError: If any of the columns to drop is absent, e.g. when the
            function is called a second time on an already processed frame.

    Note:
        The ``FareBin`` edges are computed from whatever frame is passed in.
        When that frame is later split into train and test sets, the bin
        edges have been derived from both parts.
    """
    # Remove identifier / free-text columns that are not used as features.
    df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin", "Embarked"], inplace=True)

    fill_missing_ages(df)

    # Binary-encode gender. Any value other than "male"/"female" becomes NaN.
    df["Sex"] = df["Sex"].map({"male": 1, "female": 0})

    # Number of relatives travelling with the passenger (siblings/spouses
    # plus parents/children), and a flag for passengers with none.
    df["FamilySize"] = df["SibSp"] + df["Parch"]
    df["IsAlone"] = np.where(df["FamilySize"] == 0, 1, 0)

    # Equal-frequency fare buckets. duplicates="drop" merges buckets whose
    # quantile edges coincide instead of raising ValueError; when all edges
    # are distinct the result is unchanged.
    df["FareBin"] = pd.qcut(df["Fare"], fare_quantiles, labels=False, duplicates="drop")

    # Fixed-width age groups. include_lowest=True closes the first interval on
    # the left; without it an age exactly equal to the lowest edge would fall
    # outside every bin and be labelled NaN.
    df["AgeBin"] = pd.cut(df["Age"], bins=list(age_bins), labels=False, include_lowest=True)

    return df




### Update and check the Data

In [61]:
# Run the cleaning / feature-engineering step on the loaded frame.
# preprocess_data drops columns from the frame it is given, so re-running this
# cell without re-running the load cell first raises a KeyError.
data = preprocess_data(data)

# Spot-check ten rows. The sample is seeded so the preview shows the same rows
# on every run of the notebook.
display(data.sample(10, random_state=42))

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,FamilySize,IsAlone,FareBin,AgeBin
316,1,2,0,24.0,1,0,26.0,1,0,2,2
519,0,3,1,32.0,0,0,7.8958,0,1,0,2
304,0,3,1,24.0,0,0,8.05,0,1,1,2
886,0,2,1,27.0,0,0,13.0,0,1,1,2
860,0,3,1,41.0,2,0,14.1083,2,0,1,3
47,1,3,0,24.0,0,0,7.75,0,1,0,2
400,1,3,1,39.0,0,0,7.925,0,1,1,2
734,0,2,1,23.0,0,0,13.0,0,1,1,2
62,0,1,1,45.0,1,0,83.475,1,0,3,3
782,0,1,1,29.0,0,0,30.0,0,1,2,2


### Create Features / Target Variables (Make Flashcards)

In [62]:
# Target: the survival label. Features: every remaining column.
y = data["Survived"]
X = data.drop("Survived", axis=1)

# Hold out a quarter of the rows for evaluation; the fixed seed keeps the
# partition the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

### ML Preprocessing
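Here we rescale every feature to a common numeric range so the model can compare them on an equal footing. The scaler is fitted on the training split only and then applied to both the training and test splits.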

In [63]:
# Learn each column's minimum and maximum from the training split only, then
# apply that same mapping to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)