In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
train_df = pd.read_csv(r'Data/train.csv')
test_df = pd.read_csv(r'Data/test.csv')
combine = [train_df, test_df]

In [3]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
print(train_df.columns.values)

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


In [6]:
train_df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [7]:
train_df.info()
print('_'*40)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Passenger

In [8]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [9]:
train_df.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


#### Step 1:

Now we will extract the titles from the names column, for example Mlle, Mme and Ms and replace them with the names known to us like Miss and Mrs. and other titles like Dr. Colnel and all will be replaced with the name "Special" because of their special important status.

In [10]:
def add_title(df: pd.DataFrame) -> pd.DataFrame:

    # We will select only the Name column and the string before the . to get the titles of each person.
    title = df['Name'].str.extract(r',\s*([^.]*)\.', expand=False).str.strip()

    # Now we will replace some of the titles like Mlle and Ms to Miss and Mme to Mrs to make it easy for us to read.
    title = title.replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    # These are the titles which will not be changed and if the column contains these values, we will create a new column titled "title".
    unchanged_names = {'Mr', 'Mrs', 'Miss', 'Master'}
    title = title.where(title.astype(str).isin(unchanged_names), 'Special')

    df['Title'] = title
    return df

In [11]:
# Applying the function to the train and test dataframes.

train_df = add_title(train_df)
test_df  = add_title(test_df)

In [12]:
# checking if the changes have been applied.

print(train_df['Title'].value_counts(dropna=False))

Title
Mr         517
Miss       185
Mrs        126
Master      40
Special     23
Name: count, dtype: int64


In [13]:
#  checking if the changes have been applied.

train_df[['Name','Title']].head()

Unnamed: 0,Name,Title
0,"Braund, Mr. Owen Harris",Mr
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Mrs
2,"Heikkinen, Miss. Laina",Miss
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Mrs
4,"Allen, Mr. William Henry",Mr


### Step 2

In this step, we will combine the SibSp column and the Parch column, we do this because it will make it easier for the model to learn the pattern, that is it will be easy to learn from one column rather than 2 different columns.

In [14]:
def add_family_features(df):
    
    # We now combine the SibSp and Parch columns to FamilySize and add 1 to it for the passenger themselves.
    df['FamilySize'] = df['SibSp'].fillna(0) + df['Parch'].fillna(0) + 1

    # If the FamilySize column turns out to be 1, that means the passenger is travelling alone.
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    return df

In [15]:
# Applying the function to the dataframes.

train_df = add_family_features(train_df)
test_df  = add_family_features(test_df)

In [16]:
# Checking if the column is created or not

train_df[['SibSp','Parch','FamilySize','IsAlone']].head()

Unnamed: 0,SibSp,Parch,FamilySize,IsAlone
0,1,0,2,0
1,1,0,2,0
2,0,0,1,1
3,1,0,2,0
4,0,0,1,1


### Step 3

Now we will take care of the missing values in the Age column and replace them with the median value of the Age column.

In [17]:
for df in (train_df, test_df):
    df['Age'].fillna(df['Age'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)


In [18]:
# Checking if there are no missing values after replacing the values

print("Missing Age in train:", train_df['Age'].isna().sum())
print("Missing Age in test :", test_df['Age'].isna().sum())

Missing Age in train: 0
Missing Age in test : 0


### Step 4

Imputing the Embarked and Fare values with the most commonly appeared value in that column.

In [19]:
for df in (train_df, test_df):
    # Embarked column is now filled with the mode
    if df['Embarked'].isna().sum() > 0:
        most_common_port = df['Embarked'].mode()[0]
        df['Embarked'].fillna(most_common_port, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(most_common_port, inplace=True)


In [20]:
# For the missing values in the Fares column, we compute those values with the median of all the values

test_df['Fare'].fillna(test_df['Fare'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Fare'].fillna(test_df['Fare'].median(), inplace=True)


In [21]:
# Checking if there are no missing values after replacing the values

print("Missing Embarked in train:", train_df['Embarked'].isna().sum())
print("Missing Embarked in test :", test_df['Embarked'].isna().sum())
print("Missing Fare in test     :", test_df['Fare'].isna().sum())

Missing Embarked in train: 0
Missing Embarked in test : 0
Missing Fare in test     : 0


### Step 5

Performing numerical encoding on the "Sex" "Embarked" and "Title" columns so that the model can learn these features in a better way.

In [22]:
for df in (train_df, test_df):
    # Sex: male=0, female=1
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1}).astype(int)
    
    # Embarked: map to integers
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
    
    # Title: map to integers (Mr=1, Miss=2, Mrs=3, Master=4, Special=5)
    df['Title'] = df['Title'].map({'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Special': 5}).astype(int)

In [23]:
# Checking if the values have been calculated propely or not

print(train_df[['Sex','Embarked','Title']].head())

   Sex  Embarked  Title
0    0         0      1
1    1         1      3
2    1         0      2
3    1         0      3
4    0         0      1


### Step 6

Removing the unnecessary columns from the dataset

In [24]:
# Dropping 'PassengerId', 'Name', 'Ticket' and 'Cabin' as these columns have string values or values which are not important for our prediction

train_df = train_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [25]:
# We will save the PassengerIds for submission.

test_passenger_ids = test_df['PassengerId'] 

In [26]:
# Dropping the same values from test dataset as well

test_df = test_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [27]:
# Checking if the columns were dropped

print("Train columns:", train_df.columns.tolist())
print("Test columns :", test_df.columns.tolist())

Train columns: ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title', 'FamilySize', 'IsAlone']
Test columns : ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title', 'FamilySize', 'IsAlone']


### Step 7

Now the final step remaining is to Train the models and compare accuracies with the already implemented models

In [28]:
# Splitting the target and feature columns

X_train = train_df.drop("Survived", axis=1)
y_train = train_df["Survived"]

In [29]:
# Preparing the data for Linear Metric models like SVM, Logistic Regression, Linear Regression and Perceptron 

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [30]:
# Differentiating the Numerical and Catergorical Features

numeric_features = ['Age', 'Fare', 'SibSp', 'Parch', 'FamilySize']
categorical_features = ['Pclass', 'Sex', 'Embarked', 'Title', 'IsAlone']

In [31]:
# Applying the standard scaler on only the numeric features, this will help the model to recognize patterns in a better way

numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [32]:
# Applying the one-hot encoding technique to the categorical features 

categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [33]:
# Combining the two pipelines

preprocess_linear = ColumnTransformer([
    ('num', numeric_pipe, numeric_features),
    ('cat', categorical_pipe, categorical_features)
], remainder='drop')

In [34]:
# Importing the libraries needed to train the models

from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [35]:
# Training the models

models = {
    "SVM": Pipeline([
        ('prep', preprocess_linear),
        ('model', SVC(kernel='rbf', C=1.0, gamma='scale',
                      class_weight='balanced', random_state=42))
    ]),
    "Perceptron (tuned)": Pipeline([
        ('prep', preprocess_linear),
        # ← EXACT config that gave you ~80% (no class_weight)
        ('model', Perceptron(max_iter=2000, eta0=0.1, penalty='l2', random_state=42))
    ]),
    "Logistic Regression": Pipeline([
        ('prep', preprocess_linear),
        ('model', LogisticRegression(max_iter=1000, class_weight='balanced',
                                     random_state=42))
    ]),
    "Linear SVC": Pipeline([
        ('prep', preprocess_linear),
        ('model', LinearSVC(max_iter=2000, class_weight='balanced',
                            random_state=42))
    ]),
    "KNN": Pipeline([
        ('prep', preprocess_linear),
        ('model', KNeighborsClassifier(n_neighbors=7))
    ]),
    "SGD (tuned)": Pipeline([
    ('prep', preprocess_linear),
    ('model', SGDClassifier(
        loss='modified_huber',
        penalty='elasticnet',
        alpha=0.0005,
        l1_ratio=0.15,
        learning_rate='adaptive',
        eta0=0.1,
        max_iter=4000,
        tol=1e-4,
        random_state=42
    ))
]),

    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes": GaussianNB()
}

### Step 8

Reporting the accuracies of each and every model

In [36]:
results = {}
for name, clf in models.items():
    if isinstance(clf, Pipeline):
        clf.fit(X_train, y_train)
        acc = clf.score(X_train, y_train) * 100
    else:
        clf.fit(X_train, y_train)
        acc = clf.score(X_train, y_train) * 100
    results[name] = acc
    print(f"{name}: {acc:.2f}%")

SVM: 84.06%
Perceptron (tuned): 80.47%
Logistic Regression: 82.27%
Linear SVC: 82.38%
KNN: 85.52%
SGD (tuned): 83.05%
Random Forest: 98.20%
Decision Tree: 98.20%
Naive Bayes: 80.36%


In [37]:
sorted_results = dict(sorted(results.items(), key=lambda kv: kv[1], reverse=True))
print("\nSorted Results (%):")
for k, v in sorted_results.items():
    print(f"{k:>20}: {v:5.2f}%")


Sorted Results (%):
       Random Forest: 98.20%
       Decision Tree: 98.20%
                 KNN: 85.52%
                 SVM: 84.06%
         SGD (tuned): 83.05%
          Linear SVC: 82.38%
 Logistic Regression: 82.27%
  Perceptron (tuned): 80.47%
         Naive Bayes: 80.36%
