In [1]:
# Brijesh Kumar
# ASU ID: 1235332269
# CSE 572 - Data Mining

In [2]:
import re
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Models (same 9 as the assignment)
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [3]:
# ------------------------
# 1) Load & target split
# ------------------------
train_df = pd.read_csv("Data/train.csv")
y = train_df["Survived"].astype(int)
X = train_df.drop(columns=["Survived"]).copy()

In [4]:
# ------------------------
# 2) Tiny-but-mighty features
# ------------------------
def extract_title(name: str) -> str:
    if not isinstance(name, str):
        return "Unknown"
    m = re.search(r",\s*([^.,]+)\.", name)
    t = (m.group(1).strip() if m else "Unknown")
    if t in {"Mlle", "Ms"}: t = "Miss"
    if t == "Mme": t = "Mrs"
    if t in {"Lady","Countess","Capt","Col","Don","Dr","Major","Rev","Sir","Jonkheer","Dona"}:
        t = "Rare"
    return t

X["Title"] = X["Name"].apply(extract_title)
X["FamilySize"] = X["SibSp"] + X["Parch"] + 1
X["IsAlone"] = (X["FamilySize"] == 1).astype(int)
X["CabinPresent"] = X["Cabin"].notna().astype(int)
X["FarePerPerson"] = X["Fare"] / X["FamilySize"]
X["Pclass"] = X["Pclass"].astype("category")  # treat as categorical

In [5]:
# ---------------------------------------
# 3) Simple, effective group imputations
#    (kept minimal; big win vs global medians)
# ---------------------------------------
# Embarked -> most frequent (global)
X["Embarked"] = X["Embarked"].fillna(X["Embarked"].mode().iloc[0])

# Fare -> median within (Pclass, Embarked)
X["Fare"] = X["Fare"].fillna(
    X.groupby(["Pclass","Embarked"])["Fare"].transform("median")
)

# Age -> median within (Title, Pclass)
X["Age"] = X["Age"].fillna(
    X.groupby(["Title","Pclass"])["Age"].transform("median")
)

# Safety: any stragglers
for col, val in [("Fare", X["Fare"].median()), ("Age", X["Age"].median())]:
    X[col] = X[col].fillna(val)

In [6]:
# ------------------------
# 4) Columns & transformer
# ------------------------
num_cols = ["Age","SibSp","Parch","Fare","FamilySize","IsAlone","FarePerPerson"]
cat_cols = ["Sex","Embarked","Pclass","Title","CabinPresent"]

num_pipe = SimpleImputer(strategy="median")  # already imputed; this is safety
cat_pipe = make_pipeline(SimpleImputer(strategy="most_frequent"),
                         OneHotEncoder(handle_unknown="ignore", sparse_output=False))

prep_no_scale = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False
)

# For margin/distance models, we add StandardScaler to numerics only
prep_with_scale = ColumnTransformer(
    transformers=[
        ("num", make_pipeline(SimpleImputer(strategy="median"), StandardScaler()), num_cols),
        ("cat", cat_pipe, cat_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False
)

In [7]:
# ------------------------
# 5) Models (tiny nudges, not "tuning")
#    - scaled where it matters
#    - everything else stays simple
# ------------------------
models = {
    "Support Vector Machines": make_pipeline(prep_with_scale, SVC(kernel="rbf", C=2.0, gamma="scale", random_state=42)),
    "KNN": make_pipeline(prep_with_scale, KNeighborsClassifier(n_neighbors=7, weights="distance")),
    "Logistic Regression": make_pipeline(prep_with_scale, LogisticRegression(max_iter=1000, random_state=42)),
    "Random Forest": make_pipeline(prep_no_scale, RandomForestClassifier(n_estimators=400, random_state=42)),
    "Naive Bayes": make_pipeline(prep_no_scale, GaussianNB()),
    "Perceptron": make_pipeline(prep_with_scale, Perceptron(alpha=1e-4, max_iter=2000, random_state=42)),
    "Stochastic Gradient Descent": make_pipeline(prep_with_scale, SGDClassifier(loss="log_loss", alpha=1e-4, max_iter=2000, random_state=42)),
    "Linear SVC": make_pipeline(prep_with_scale, LinearSVC(C=1.5, max_iter=5000, random_state=42)),
    "Decision Tree": make_pipeline(prep_no_scale, DecisionTreeClassifier(max_depth=6, random_state=42)),
}

In [8]:
# ------------------------
# 6) Train-set scores 
# ------------------------
scores = []
for name, pipe in models.items():
    pipe.fit(X, y)
    acc = round(pipe.score(X, y) * 100, 2)  
    scores.append((name, acc))

results = (pd.DataFrame(scores, columns=["Model","Improved_Score_%"])
           .sort_values("Improved_Score_%", ascending=False)
           .reset_index(drop=True))
print(results.to_string(index=False))

                      Model  Improved_Score_%
                        KNN             98.77
              Random Forest             98.77
              Decision Tree             87.77
    Support Vector Machines             84.85
                 Linear SVC             83.50
        Logistic Regression             83.39
                Naive Bayes             82.15
Stochastic Gradient Descent             80.81
                 Perceptron             79.35
