In [12]:
"""Titanic dataset analysis."""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np

In [13]:
data = pd.read_csv('datasets/Titanic-Dataset.csv')
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [14]:
# Drop unnecessary columns
data = data.drop(['PassengerId'], axis=1)
data = data.drop(['Ticket'], axis=1)
data = data.drop(['Cabin'], axis=1)
data = data.drop(['Embarked'], axis=1)
data = data.drop(['Parch'], axis=1)
data = data.drop(['Name'], axis=1)
data = data.drop(['Age'], axis=1)
data = data.drop(['SibSp'], axis=1)
data

Unnamed: 0,Survived,Pclass,Sex,Fare
0,0,3,male,7.2500
1,1,1,female,71.2833
2,1,3,female,7.9250
3,1,1,female,53.1000
4,0,3,male,8.0500
...,...,...,...,...
886,0,2,male,13.0000
887,1,1,female,30.0000
888,0,3,female,23.4500
889,1,1,male,30.0000


In [15]:
data['Sex'] = data['Sex'].apply(lambda x: 1 if x == 'male' else 0)
data

Unnamed: 0,Survived,Pclass,Sex,Fare
0,0,3,1,7.2500
1,1,1,0,71.2833
2,1,3,0,7.9250
3,1,1,0,53.1000
4,0,3,1,8.0500
...,...,...,...,...
886,0,2,1,13.0000
887,1,1,0,30.0000
888,0,3,0,23.4500
889,1,1,1,30.0000


In [16]:
# Mean value for Fare
mean_fare = data['Fare'].mean()
data['Fare'] = data['Fare'].fillna(mean_fare)
data

Unnamed: 0,Survived,Pclass,Sex,Fare
0,0,3,1,7.2500
1,1,1,0,71.2833
2,1,3,0,7.9250
3,1,1,0,53.1000
4,0,3,1,8.0500
...,...,...,...,...
886,0,2,1,13.0000
887,1,1,0,30.0000
888,0,3,0,23.4500
889,1,1,1,30.0000


In [17]:
# Apply log transformation to Fare to reduce skewness
data['Fare'] = np.log1p(data['Fare'])

In [18]:
# Feature selection
X = data.drop('Survived', axis=1)
y = data['Survived']
selector = SelectKBest(score_func=f_classif, k=4)
X_selected = selector.fit_transform(X, y)
X = pd.DataFrame(X_selected, columns=X.columns[selector.get_support()])



In [19]:
# Split the data into features and target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
models = []

# Logistic Regression
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)
logistic_predictions = logistic_model.predict(X_test)
logistic_f1 = f1_score(y_test, logistic_predictions)
models.append((logistic_model, logistic_f1))

# Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_f1 = f1_score(y_test, rf_predictions)
models.append((rf_model, rf_f1))

# Sort results by F1 score in descending order
models.sort(key=lambda x: x[1])

# Display results
print("\nModel Performance (sorted by F1 Score):")
for model, f1 in models:
    print(f"Model: {model.__class__.__name__}, F1 Score: {f1:.4f}")


Model Performance (sorted by F1 Score):
Model: LogisticRegression, F1 Score: 0.7273
Model: RandomForestClassifier, F1 Score: 0.7692


In [None]:
### Titanic Disaster Prediction Review

By analyzing the data, we can observe how different characteristics, such as class, gender, and fare, impacted the likelihood of survival.

One of the most significant factors is **gender**. Female passengers had a much higher probability of survival compared to males, as women and children were prioritized during evacuation. This is evident from the transformation of the `Sex` column, where females were encoded as `0` and males as `1`.

Another critical factor is **passenger class (Pclass)**. First-class passengers had a higher survival rate compared to those in second and third class. This disparity reflects the social hierarchy and access to lifeboats, as first-class passengers were closer to the deck and had better accommodations.

The **fare** paid by passengers also correlates with survival. Higher fares, often associated with first-class tickets, indicate a greater likelihood of survival. To reduce skewness in the data, a log transformation was applied to the `Fare` column, ensuring a more balanced analysis.

The dataset underwent significant preprocessing, including the removal of irrelevant columns like `Name`, `Ticket`, and `Cabin`, as well as handling missing values. For example, the mean fare was used to fill missing values in the `Fare` column.

Feature selection was performed to identify the most influential variables, resulting in the selection of `Pclass`, `Sex`, and `Fare`. These features were used to train machine learning models, including Logistic Regression and Random Forest Classifier.

The **Random Forest Classifier** outperformed **Logistic Regression** in terms of **F1 score**, achieving a score of **0.769** compared to **0.727**.

In conclusion, the analysis highlights the importance of gender, class, and fare in determining survival probabilities.

SyntaxError: unterminated string literal (detected at line 3) (3088916973.py, line 3)