In [None]:
import pandas as pd

# Load the Titanic training and test sets into pandas DataFrames
train = pd.read_csv("/kaggle/input/titanicdataset/train.csv")
test = pd.read_csv("/kaggle/input/titanicdataset/test.csv")

# First look at the training data: sample rows, column summary, descriptive statistics
print(train.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
train.info()
print(train.describe())

## Data Cleaning

In [None]:
# Count the missing entries in every column of the training frame
missing_counts = train.isna().sum()
print(missing_counts)

There are missing values in Age, Cabin and Embarked:

* Age has 177 missing values, which is relatively few; we can replace them with the median.
* Cabin has 687 missing values out of 891, which is too many. I think it makes sense to remove this column.
* Embarked has 2 missing values. That's negligible; we can replace them with the mode (the most frequent category).

**Replacing missing Age values with the median:**
Why not use the mean? Because the mean is sensitive to extreme values, whereas the median is robust to outliers and better represents a typical passenger.

In [None]:
# Impute missing ages with the median age of the training set.
# The median is taken once, up front: filling the gaps with the median does not
# move it, so the test frame receives exactly the value it would get from the
# already-filled training column.
age_median = train["Age"].median()
train["Age"] = train["Age"].fillna(age_median)
test["Age"] = test["Age"].fillna(age_median)

**Removing Cabin due to excessive missing values.** Why remove it? Because over 75% of its values are missing, which leaves too little information for the feature to be useful for prediction.

In [None]:
# The Cabin column is discarded from both frames
train = train.drop("Cabin", axis=1)
test = test.drop("Cabin", axis=1)


**Replacing missing Embarked values with the most frequent value.** Why? Since Embarked has only 3 possible values and only 2 entries are missing, filling them with the most frequent category (the mode) ensures minimal impact on the dataset.

In [None]:
# Impute missing Embarked values with the most frequent port of the training set.
# mode() returns a Series (several values can tie); [0] takes its first entry.
embarked_mode = train["Embarked"].mode()[0]
train["Embarked"] = train["Embarked"].fillna(embarked_mode)

# The test frame is filled with the training mode as well, so that every
# imputation value is derived from the training data only (as is done for Age).
test["Embarked"] = test["Embarked"].fillna(embarked_mode)

In [None]:
# Fill the missing Fare value(s) of the test frame with the median fare of the
# training set, so that imputation values come from the training data only
# (as is done for Age).
test["Fare"] = test["Fare"].fillna(train["Fare"].median())

**Checking if Data cleaning worked**

In [None]:
# Recount the missing values per column in both frames;
# after the cleaning steps every count is expected to be zero
train_missing = train.isna().sum()
test_missing = test.isna().sum()
print(train_missing)
print(test_missing)

Machine learning models cannot work directly with string data, so we must transform the remaining text columns into numeric types.

In [None]:
# List the dtype of every column to spot the non-numeric ones that still need encoding
train_dtypes = train.dtypes
test_dtypes = test.dtypes
print(train_dtypes)
print(test_dtypes)

## Feature Engineering

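We need to convert 'Sex' and 'Embarked' into numerical representations that our model can process ('Pclass' is already stored as a number, so it is left as is). We will also extract a new 'Title' feature from 'Name' and encode it the same way.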

In [None]:
# Encode Sex as a binary flag: male -> 0, female -> 1
sex_codes = {"male": 0, "female": 1}
for frame in (train, test):
    frame["Sex"] = frame["Sex"].map(sex_codes)

# Show the encoded column of each frame first, then the full frames
for frame in (train, test):
    print(frame["Sex"].head())
for frame in (train, test):
    print(frame.head())

In [None]:
# Embarked (one-hot encoding)
# => Separate binary columns, because none of the possible values is better than the others.
#
# Both frames are given the same category list (taken from train) before encoding.
# get_dummies derives its columns from the values a frame happens to contain, so
# encoding train and test independently could silently produce different columns
# or a different dropped baseline. With a shared categorical dtype the resulting
# columns are identical by construction.
# NOTE: a test value that never occurs in train becomes NaN here and is encoded
# as all zeros, i.e. like the dropped baseline category.
embarked_levels = sorted(train["Embarked"].dropna().unique())
train["Embarked"] = pd.Categorical(train["Embarked"], categories=embarked_levels)
test["Embarked"] = pd.Categorical(test["Embarked"], categories=embarked_levels)

train = pd.get_dummies(train, columns=["Embarked"], drop_first=True)
test = pd.get_dummies(test, columns=["Embarked"], drop_first=True)

# Show the result for both frames
print(train.head())
print(test.head())

In [None]:
# What does the Name column look like?
print(train["Name"])

# The name itself says nothing about survival, but the honorific inside it
# ("Mr.", "Mrs.", "Miss.", ...) can still be extracted and could improve the model.

# The title is the word that follows a space and ends with a dot.
# The pattern is a raw string so that "\." reaches the regex engine as a literal
# dot instead of being an invalid Python string escape, and the very same
# pattern is applied to both frames.
TITLE_PATTERN = r" ([A-Za-z]+)\."
train["Title"] = train["Name"].str.extract(TITLE_PATTERN, expand=False)
test["Title"] = test["Name"].str.extract(TITLE_PATTERN, expand=False)
print(train["Title"])
print(train["Name"])

# Both frames now have a new "Title" column holding the word found right before the "."

In [None]:
# Preview the frame: the Title column should read "Mr" for the first row,
# "Mrs" for the second, and so on
print(train.head())

# How often each extracted title occurs in the training set
title_counts = train["Title"].value_counts()
print(title_counts)

Titles like Jonkheer, Don, Mme, Lady… appear only once. All these rare titles will be grouped under a single "Rare" category.

In [None]:
# Frequent titles keep their own category; the infrequent ones are folded into "Rare"
title_mapping = {
    "Mr": "Mr",
    "Miss": "Miss",
    "Mrs": "Mrs",
    "Master": "Master",
    "Dr": "Dr",
    "Rev": "Rare",
    "Mlle": "Rare",
    "Major": "Rare",
    "Col": "Rare",
    "Countess": "Rare",
    "Capt": "Rare",
    "Ms": "Rare",
    "Sir": "Rare",
    "Lady": "Rare",
    "Mme": "Rare",
    "Don": "Rare",
    "Jonkheer": "Rare",
}

# Series.map() returns NaN for every value that is not a key of the mapping.
# The mapping was built from the titles seen in train, so a title that only
# occurs in test would otherwise turn into a missing value; such titles
# (and names where no title was found) are treated as "Rare" too.
train["Title"] = train["Title"].map(title_mapping).fillna("Rare")
test["Title"] = test["Title"].map(title_mapping).fillna("Rare")

# Check the grouped categories in both frames
print(train["Title"].value_counts())
print(test["Title"].value_counts())

In [None]:
# Sanity check of the training frame after regrouping the titles:
# column names first, then the first rows
column_index = train.columns
print(column_index)
print(train.head())

In [None]:
# One-hot encode the Title column.
#
# Both frames are given the same category list (taken from train) before encoding.
# get_dummies derives its columns from the values a frame happens to contain, so
# encoding train and test independently could silently produce different columns
# or a different dropped baseline. With a shared categorical dtype the resulting
# columns are identical by construction.
# NOTE: a test title that never occurs in train becomes NaN here and is encoded
# as all zeros, i.e. like the dropped baseline category.
title_levels = sorted(train["Title"].dropna().unique())
train["Title"] = pd.Categorical(train["Title"], categories=title_levels)
test["Title"] = pd.Categorical(test["Title"], categories=title_levels)

train = pd.get_dummies(train, columns=["Title"], drop_first=True)
test = pd.get_dummies(test, columns=["Title"], drop_first=True)

# Compare the resulting column sets, then preview the training frame
print(train.columns)
print(test.columns)
print(train.head())

Our dataset is ready: the PassengerId, Ticket and Name columns won't be used by the model (they are removed in the next cell), so all the remaining columns are numeric and can be used for training.

In [None]:
# PassengerId is set aside first: it is needed again when the submission table is built
test_ID = test["PassengerId"]

# These columns are not used as model features, so they are removed from both frames
unused_columns = ["Name", "Ticket", "PassengerId"]
train = train.drop(unused_columns, axis=1)
test = test.drop(unused_columns, axis=1)

# Remaining columns and their dtypes
print(train.dtypes)
print(test.dtypes)

## Splitting Data : Training and Validation Sets

To evaluate our model properly, we divide the dataset into a training set (80%) and a validation set (20%).

In [None]:
# Separate the target column (Survived) from the feature columns
Y_train = train["Survived"]
X_train = train.drop("Survived", axis=1)

print(Y_train.head())
print(X_train.head())

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation (80/20 split).
# The seed is fixed (42, the "meaning of life") so the split is the same on every run.
split_parts = train_test_split(
    X_train,
    Y_train,
    test_size=0.2,
    random_state=42,
)
X_train_sub, X_val, Y_train_sub, Y_val = split_parts

## Logistic Regression

Why? It is simple and efficient for binary classification.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Create the classifier (iteration cap raised to 1200)
model = LogisticRegression(max_iter=1200)

# Fit on the training subset only; the validation rows stay unseen
model.fit(X=X_train_sub, y=Y_train_sub)

# Predict the held-out validation rows
Y_pred = model.predict(X=X_val)

# Share of validation rows that were predicted correctly
accuracy = accuracy_score(y_true=Y_val, y_pred=Y_pred)
print(f"Precision du modèl : {accuracy}")

I'm happy to get 81.5% accuracy on the validation set. I'll consider this model successful.

In [None]:
# Compare the test frame with the training features before predicting:
# shapes first, then column names
for frame in (test, X_train_sub):
    print(frame.shape)
for frame in (test, X_train_sub):
    print(frame.columns)

# The two column lists are expected to be identical (only the row counts differ)

In [None]:
# Predict the survival label of every row of the test frame with the fitted model
Y_test_pred = model.predict(X=test)

In [None]:
# Build the submission table: one row per test passenger with its predicted label.
# The id column is spelled "PassengerId", exactly like the column it was read from
# in the input data; a differently cased header ("PassengerID") would not match
# the expected submission format.
submission = pd.DataFrame({
    "PassengerId": test_ID,
    "Survived": Y_test_pred,
})

In [None]:
# Write the predictions to submission.csv, leaving out the DataFrame index
submission.to_csv(path_or_buf="submission.csv", index=False)