In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
import seaborn as sns
import os

import tensorflow as tf

# Data

## Data EDA

In [None]:
# Load the training set from the working directory and preview the first rows.
train_csv_path = "train.csv"
data_train = pd.read_csv(train_csv_path)
data_train.head()

In [None]:
# Column dtypes and non-null counts, to spot columns with missing values.
data_train.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data_train.describe()

In [None]:
# Split "Last, First ..." on the first ', ' only, then store each
# whitespace-trimmed half in its own column.
name_parts = data_train['Name'].str.split(', ', n=1, expand=True)
for part_position, part_column in enumerate(['Last_name', 'First_name']):
    data_train[part_column] = name_parts[part_position].str.strip()

In [None]:
# 'Name' has been split into Last_name / First_name, so it is no longer needed.
# Rebind instead of mutating in place, and ignore an already-missing column,
# so re-running this cell does not raise a KeyError.
data_train = data_train.drop(columns='Name', errors='ignore')

In [None]:
# List the distinct Cabin values (a missing value, if present, shows up as NaN).
data_train['Cabin'].unique()

In [None]:
# One-hot encode the categorical columns as 0/1 integer indicator columns.
categorical_columns = ['Sex', 'Embarked']
data_train_encoded = pd.get_dummies(data_train, columns=categorical_columns, dtype=int)

In [None]:
# Keep everything except the free-text columns, leaving a frame suitable for .corr().
text_columns = ['Ticket', 'Cabin', 'Last_name', 'First_name']
data_train_numerical = data_train_encoded.drop(columns=text_columns)

In [None]:
# Pairwise correlations between all remaining columns.
corr_matrix = data_train_numerical.corr()

heatmap_options = dict(
    vmin=-1,            # lower bound of the colour scale
    vmax=1,             # upper bound of the colour scale
    center=0,           # zero correlation sits at the middle of the colormap
    cmap='coolwarm',    # diverging colormap
    square=True,        # square cells
    annot=True,         # write the correlation value in each cell
    fmt='.2f',          # two decimal places for the annotations
    linewidths=.5,      # thin separator between cells
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title('Correlation Map of Variables')
plt.show()

## Feature Engineering with column 'Age'

In [None]:
# Age distribution of the passengers who did NOT survive (Survived == 0).
# The title previously read "Survived", contradicting the filter below.
non_survivor_ages = data_train.loc[data_train['Survived'] == 0, 'Age']

plt.figure(figsize=(6,4))
sns.histplot(non_survivor_ages, kde=True)

plt.title("Titanic's Non-Survived Passenger's Age Distribution")
plt.xlabel("Age")
plt.ylabel("Frequency")

plt.show()

In [None]:
# Copy of the numeric frame in which Age is replaced by its 20-quantile interval.
age_quantile_bins = pd.qcut(data_train_numerical['Age'], q=20)
data_train_numerical_new = data_train_numerical.assign(Age=age_quantile_bins)

In [None]:
# Inspect the distinct age intervals produced by the 20-quantile binning.
data_train_numerical_new['Age'].unique()

In [None]:
# Age distribution of the passengers who survived (Survived == 1).
survivor_ages = data_train_encoded.loc[data_train_encoded['Survived'] == 1, 'Age']

fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(survivor_ages, kde=True, ax=ax)
ax.set_title("Titanic's Survived Passenger's Age Distribution")
ax.set_xlabel("Age")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
# Adding TITLE column
# The title is the first run of letters that is immediately followed by a dot
# (e.g. "Mr." -> "Mr"). The pattern is a raw string so that `\.` reaches the
# regex engine as an escaped dot instead of being an invalid Python string
# escape (which triggers a SyntaxWarning on recent Python versions).
title_pattern = r'([A-Za-z]+)\.'
data_train_encoded['Title'] = data_train_encoded['First_name'].str.extract(title_pattern, expand=False)

# Fold spelling variants into their common equivalents.
data_train_encoded['Title'] = data_train_encoded['Title'].replace({'Mlle':'Miss', 'Ms':'Miss', 'Mme':'Mrs'})

###  Missing Values

In [None]:
# Fill missing ages with the group median, falling back from the most specific
# grouping to the least specific one so that groups without any known age are
# still covered by a coarser grouping.
age_grouping_fallbacks = (
    ['Pclass', 'Sex_male', 'Title'],
    ['Sex_male', 'Title'],
    ['Title'],
)
for group_keys in age_grouping_fallbacks:
    data_train_encoded['Age'] = (
        data_train_encoded
        .groupby(group_keys)['Age']
        .transform(lambda ages: ages.fillna(ages.median()))
    )

In [None]:
# Collapse the listed titles into a single 'Rare' category.
rare_titles = [
    'Lady', 'Countess', 'Capt', 'Col', 'Don',
    'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona',
]
data_train_encoded['Title'] = data_train_encoded['Title'].replace(rare_titles, 'Rare')

In [None]:
# Survivors' age distribution again, now that missing ages have been filled in.
is_survivor = data_train_encoded['Survived'] == 1
imputed_survivor_ages = data_train_encoded.loc[is_survivor, 'Age']

fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(imputed_survivor_ages, kde=True, ax=ax)
ax.set_title("Titanic's Survived Passenger's Age Distribution")
ax.set_xlabel("Age")
ax.set_ylabel("Frequency")

plt.show()

## Feature Engineering with column 'Ticket'

In [None]:
def _leading_token(ticket):
    """Return the first whitespace-separated token of a multi-token ticket, else 'None'."""
    tokens = ticket.split()
    if len(tokens) > 1:
        return tokens[0]
    return 'None'


# Normalise the prefix: drop '.' and '/', then upper-case.
prefix = (
    data_train_encoded['Ticket']
    .apply(_leading_token)
    .str.replace(r'[./]', '', regex=True)
    .str.upper()
)

# Show how many unique prefixes there are and their counts
prefix_counts = prefix.value_counts()
print(prefix_counts)
print("\nTotal unique prefixes in training set:", len(prefix_counts))

In [None]:
# How often each ticket string occurs.
ticket_counts = data_train_encoded['Ticket'].value_counts()

# Tickets shared by two or more passengers.
duplicates = ticket_counts.loc[ticket_counts.ge(2)]
shared_ticket_passengers = duplicates.sum()
single_ticket_passengers = len(data_train_encoded) - shared_ticket_passengers

print(f"Tickets that appear 2 or more times       : {len(duplicates)}")
print(f"Passengers traveling with someone (same ticket): {shared_ticket_passengers}")
print(f"Passengers with completely unique ticket : {single_ticket_passengers}")

In [None]:
# The heading used to say "Top 15" while 20 rows were printed; a single
# constant now drives both so the label always matches the output.
top_n = 20
print(f"\nTop {top_n} most common tickets:")
print(duplicates.head(top_n).to_string())

In [None]:
# Look up the passenger(s) holding ticket 'A/5 21171'.
# Author's observation: many of the duplicated tickets belong to passengers in
# the lower class (Pclass 3).
holds_target_ticket = data_train_encoded['Ticket'].eq('A/5 21171')
data_train_encoded[holds_target_ticket]

In [None]:
# 1. Ticket prefix: the first token of a multi-token ticket, 'None' otherwise.
data_train_encoded['Ticket_prefix'] = data_train_encoded['Ticket'].apply(
    lambda x: x.split()[0] if len(x.split()) > 1 else 'None'
)
# Strip '.' and '/' and upper-case. The pattern is an explicit regex character
# class with regex=True, because the default of `regex` for a bare
# `.str.replace('.', '')` differs between pandas versions.
data_train_encoded['Ticket_prefix'] = (
    data_train_encoded['Ticket_prefix']
    .str.replace(r'[./]', '', regex=True)
    .str.upper()
)
# Prefixes folded into a single 'Rare' category (one de-duplicated list).
rare_ticket_prefixes = [
    'A5', 'A4', 'AQ3', 'AQ4', 'AS', 'C', 'CA', 'CASOTON', 'FC', 'FCC',
    'FA', 'LP', 'PP', 'PPP', 'SC', 'SCA3', 'SCA4', 'SCAH', 'SCOW',
    'SOP', 'SOPP', 'SOTONO', 'SP', 'STONO', 'SWPP', 'WEP', 'WC',
    'A', 'SOTONOQ', 'STONO2', 'SOTONO2', 'STONOQ', 'SCPARIS', 'SOC',
]
data_train_encoded['Ticket_prefix'] = data_train_encoded['Ticket_prefix'].replace(rare_ticket_prefixes, 'Rare')

# 2. Ticket length: number of characters left after removing spaces, dots and
#    slashes (prefix letters, if any, are included in the count).
data_train_encoded['Ticket_len'] = data_train_encoded['Ticket'].apply(
    lambda x: len(x.replace(' ', '').replace('.', '').replace('/', ''))
)

# 3. Purely numeric ticket? 1 when the ticket consists only of digits
#    (ignoring spaces), else 0.
data_train_encoded['Ticket_numeric'] = data_train_encoded['Ticket'].apply(
    lambda x: 1 if x.replace(' ', '').isdigit() else 0
)

# Preview the new columns without dumping the entire frame.
data_train_encoded.head()

In [None]:
# Frequency of each ticket-prefix category after uncommon prefixes were grouped under 'Rare'.
data_train_encoded['Ticket_prefix'].value_counts()

## Feature Engineering with column 'Cabin'

In [None]:
# Binary flag: 1 when a cabin is recorded for the passenger, 0 when it is missing.
cabin_is_missing = data_train_encoded['Cabin'].isna()
data_train_encoded['HasCabin'] = (~cabin_is_missing).astype(int)
data_train_encoded

## Finishing

In [None]:
def plot_spread(col, bins=30, q=30, data=None):
    """Plot the spread of a numeric column in three side-by-side panels.

    Panels, left to right: histogram with KDE, boxplot, and mean survival rate
    per quantile bin of ``col``. The third panel is drawn only when the frame
    has a 'Survived' column; otherwise it is hidden.

    Parameters
    ----------
    col : str
        Name of the numeric column to visualise.
    bins : int, default 30
        Number of histogram bins.
    q : int, default 30
        Number of quantile bins requested for the survival-rate panel.
        Duplicate bin edges are dropped, so fewer bins may be drawn.
    data : pandas.DataFrame, optional
        Frame to plot. Defaults to the notebook-level ``data_train_encoded``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    frame = data_train_encoded if data is None else data

    fig, axes = plt.subplots(1, 3, figsize=(16, 4))
    hist_ax, box_ax, surv_ax = axes

    # Histogram + KDE
    sns.histplot(data=frame, x=col, kde=True, bins=bins, ax=hist_ax, alpha=0.7)
    hist_ax.set_title(f'{col} - Distribution')

    # Boxplot
    sns.boxplot(y=frame[col], ax=box_ax)
    box_ax.set_title(f'{col} - Boxplot')

    # Survival rate by quantile (if Survived exists)
    if 'Survived' in frame.columns:
        # Only the target column is copied; the bins are attached alongside it.
        binned = frame[['Survived']].assign(
            q=pd.qcut(frame[col], q=q, duplicates='drop')
        )
        # observed=False keeps one row per interval category, so the tick
        # labels built from `categories.mid` always line up with the bars.
        surv = binned.groupby('q', observed=False)['Survived'].mean()
        surv.plot(kind='bar', ax=surv_ax, color='salmon')
        surv_ax.set_title(f'Survival rate by {col}')
        surv_ax.set_xticklabels(
            [f'{midpoint:.1f}' for midpoint in surv.index.categories.mid],
            rotation=45,
        )
    else:
        # Nothing to draw: hide the otherwise empty third panel.
        surv_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Example: inspect how the Fare column is spread.
plot_spread(col='Fare')

In [None]:
# Drop the raw text columns and the redundant Sex_female indicator; columns that
# are already absent are ignored.
columns_to_discard = ['Cabin', 'Ticket', 'Sex_female', 'First_name', 'Last_name']
data_train_final = data_train_encoded.drop(columns=columns_to_discard, errors='ignore')

In [None]:
# One-hot encode the engineered categorical features as 0/1 integer columns.
engineered_categoricals = ['Title', 'Ticket_prefix']
data_train_final = pd.get_dummies(
    data_train_final,
    columns=engineered_categoricals,
    dtype=int,
)

In [None]:
# Rows with a fare above 500, to inspect the extreme fare values.
is_extreme_fare = data_train_final['Fare'].gt(500)
data_train_final.loc[is_extreme_fare]

# Model

## Logistic Regression

In [None]:
# Feature matrix and target.
# PassengerId is a row identifier, not a passenger attribute, so it is excluded
# from the features (it was previously fed to the model as a large unscaled
# numeric column). It stays available in data_train_final.
X = (
    data_train_final
    .drop(columns='Survived')
    .drop(columns='PassengerId', errors='ignore')
    .copy()
)
y = data_train_final['Survived'].copy()

In [None]:
# L2-regularised logistic regression with the liblinear solver; fixed seed for
# reproducible fits.
logreg_params = {
    'penalty': 'l2',
    'C': 0.9,
    'solver': 'liblinear',
    'max_iter': 1000,
    'random_state': 42,
}
logreg = LogisticRegression(**logreg_params)

In [None]:
# 80/20 train/validation split, stratified on the target and seeded for
# reproducibility.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_val, y_train, y_val = split_parts

In [None]:
# Fit on the training split, then score the validation split.
logreg.fit(X_train, y_train)
val_pred = logreg.predict(X_val)
# Second column of predict_proba: probability of the second class in logreg.classes_.
val_proba = logreg.predict_proba(X_val)[:, 1]

val_accuracy = accuracy_score(y_val, val_pred)
val_roc_auc = roc_auc_score(y_val, val_proba)

print("Logistic Regression Results")
print(f"Accuracy : {val_accuracy:.5f}")
print(f"ROC AUC  : {val_roc_auc:.5f}")

In [None]:
# Confusion matrix of the validation predictions.
# The class labels are given explicitly so the matrix is always 2x2 (even if a
# class were absent from the validation predictions) and the tick labels are a
# real sequence rather than the string '01' iterated character by character.
class_labels = [0, 1]

print("Confusion Matrix:")
cm = confusion_matrix(y_val, val_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
plt.show()

# Prediction

In [None]:
# data_train_final["Fare"] = data_train_final["Fare"].fillna(data_train_final["Fare"].median())

In [None]:
# data_train_final.info()

In [None]:
# predictions = logreg.predict(data_train_final)

In [None]:
# submission = pd.DataFrame({
#     "PassengerId": data_train_final["PassengerId"],
#     "Survived": predictions
# })

In [None]:
# submission.to_csv("submission.csv", index=False)
# print("submission.csv saved! 🎉")
# print(submission["Survived"].value_counts())