In [1]:
import pandas as pd
import joblib


In [2]:
df = pd.read_csv("train.csv")

In [3]:
import numpy as np
numeric_cols = df.select_dtypes(include=[np.number])
skew_values = numeric_cols.skew()
# print(skew_values)

In [None]:
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])
df["Cabin"] = df["Cabin"].fillna("Unknown")
df["CabinDeck"] = df["Cabin"].apply(lambda x: x[0] if x != "Unknown" else "Unknown")
# df["CabinDeck"].unique()
# df["Cabin"] != "Unknown" # agr unknonw k braber nh hy tu wo true hoga
df["HasCabin"] = (df["Cabin"] != "Unknown").astype(int)

# Handled missing value age, Embarked and cabins
df["Sex_mapped"] = df["Sex"].map({"male": 0, "female": 1})
print(df["Sex_mapped"])
df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
df["IsAlone"] = 0
df.loc[df["FamilySize"] == 1, "IsAlone"] = 1


# 1) extract Title (text before the dot, after a space)
df['Title'] = df['Name'].str.extract(r',\s*([^\.]+)\.', expand=False).str.strip()
# 2) standardize some titles
df['Title'] = df['Title'].replace({
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Mme': 'Mrs'
})
# 3) group rare titles into 'Rare'
rare_titles = ['Dr','Rev','Col','Major','Capt','Lady','Sir','Don','Dona','Countess','Jonkheer','the Countess']
df['Title'] = df['Title'].apply(lambda x: 'Rare' if x in rare_titles else x)


# now you can inspect unique titles and survival rates
# print(df['Title'].unique())
# print(df.groupby('Title')['Survived'].mean().sort_values(ascending=False))

# df = pd.get_dummies(df, columns=['Title'], drop_first=True)


0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    1
889    0
890    0
Name: Sex_mapped, Length: 891, dtype: int64


In [5]:
df = pd.get_dummies(df, columns=['Embarked', 'CabinDeck'], drop_first=False)

# 1. Pclass → convert to category
df["Pclass"] = df["Pclass"].astype("category")

# 4. HasCabin → ensure int64 (for consistency)
df["HasCabin"] = df["HasCabin"].astype("int64")

# 5. Title dummies → convert from bool → int64
df[["Title_Miss", "Title_Mr", "Title_Mrs", "Title_Rare"]] = \
    df[["Title_Miss", "Title_Mr", "Title_Mrs", "Title_Rare"]].astype("int64")

# Convert Embarked dummy columns from bool → int64
df[["Embarked_C", "Embarked_Q", "Embarked_S"]] = \
    df[["Embarked_C", "Embarked_Q", "Embarked_S"]].astype("int64")

# Convert CabinDeck dummy columns from bool → int64
df[["CabinDeck_A", "CabinDeck_B", "CabinDeck_C"]] = \
    df[["CabinDeck_A", "CabinDeck_B", "CabinDeck_C"]].astype("int64")

# Find all boolean columns
bool_cols = df.select_dtypes(include=["bool"]).columns

# Convert them to int64
df[bool_cols] = df[bool_cols].astype("int64")




In [6]:
# print(df.info())
# df.isnull().sum()
# df.describe()
# df.shape
# df.duplicated().sum()
# df[df.duplicated()]
# dupes = df[df.duplicated(keep=False)]  # show all duplicates, not just first
# print(dupes.head(20))



In [7]:
df = df.drop(columns=['Name'])
df = df.drop(columns=['PassengerId'])
df = df.drop(columns=['Cabin'])
df = df.drop(columns=['Ticket'])
df = df.drop(columns=['Sex'])
# print(df.columns.value_counts())


In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[["Age", "Fare"]] = scaler.fit_transform(df[["Age", "Fare"]])
# df["Fare"]

In [9]:
from sklearn.model_selection import train_test_split

X = df.drop("Survived", axis=1)
y = df["Survived"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)



In [10]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = model.predict(X_test)

accuracy  = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall    = recall_score(y_test, y_pred)
f1        = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.8268156424581006
Precision: 0.8064516129032258
Recall: 0.7246376811594203
F1 Score: 0.7633587786259542


In [12]:
import json

metrics = {
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1": f1
}

with open("metrics.json", "w") as f:
    json.dump(metrics, f)


In [13]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[98 12]
 [19 50]]
              precision    recall  f1-score   support

           0       0.84      0.89      0.86       110
           1       0.81      0.72      0.76        69

    accuracy                           0.83       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.83      0.83      0.82       179



In [14]:
import joblib
joblib.dump(model, "titanic_model.pkl")
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [15]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler

# Load your trained model
model = joblib.load("titanic_model.pkl")
scaler = joblib.load("scaler.pkl")


In [16]:
example = {
    'Pclass': 3,
    'Age': 18,
    'SibSp': 1,
    'Parch': 0,
    'Fare': 200.25,
    'HasCabin': 0,
    'Sex_mapped': 1,   # 1 = male, 0 = female (example)
    'FamilySize': 1,
    'IsAlone': 0,
    'Title_Miss': 1,
    'Title_Mr': 0,
    'Title_Mrs': 0,
    'Title_Rare': 0,
    'Embarked_C': 0,
    'Embarked_Q': 0,
    'Embarked_S': 1,
    'CabinDeck_A': 1,
    'CabinDeck_B': 0,
    'CabinDeck_C': 0,
    'CabinDeck_D': 0,
    'CabinDeck_E': 0,
    'CabinDeck_F': 0,
    'CabinDeck_G': 0,
    'CabinDeck_T': 0,
    'CabinDeck_Unknown': 0
}
# Convert into DataFrame
X_new = pd.DataFrame([example])

# Ensure same column order as training
X_new = X_new[model.feature_names_in_]

X_new[['Age', 'Fare']] = scaler.transform(X_new[['Age', 'Fare']])


# Predict
pred_class = model.predict(X_new)[0]
pred_prob  = model.predict_proba(X_new)[0]

print("Predicted class:", pred_class)   # 0 = did not survive, 1 = survived
print("Predicted probabilities:", pred_prob)

Predicted class: 1
Predicted probabilities: [0.16985145 0.83014855]
