In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [25]:
# Load the Titanic training set (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv("titanic_data/train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


So it's a binary classification problem: the target column `Survived` is either 0 or 1.

In [26]:
# Row/column count first, then per-column dtypes and non-null counts to spot
# columns with missing data.
print(df.shape)
df.info()

(891, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


The data for `Cabin` is scant (only 204 of the 891 rows have a value). `Name` is not directly useful either.

In [27]:
# List the column names so the ones to drop can be picked out.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [59]:
# Separate the label from the features. Besides the target itself, the
# PassengerId, Name, Cabin and Ticket columns are left out of the feature set.
y = df['Survived']
X = df.drop(columns=['Survived', 'PassengerId', 'Name', 'Cabin', 'Ticket'])
print(X.shape, y.shape)

(891, 7) (891,)


In [60]:
# Confirm which feature columns remain after the drop.
X.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')

In [61]:
# Number of distinct values per column; low counts hint at categorical features.
X.nunique()

Pclass        3
Sex           2
Age          88
SibSp         7
Parch         7
Fare        248
Embarked      3
dtype: int64

### Preprocessing

- `Age` has some null entries; we can replace them with the median.
- `Embarked` and `Sex` are categorical columns which need to be transformed through one-hot encoding. `Embarked` also has two missing values, which are filled with the most frequent category first.
- Finally, the data is standard-scaled, after which it is ready for training.


In [96]:
# Split the features by kind: the two string-valued columns go to X_cat,
# every remaining column goes to X_num.
X_num = X.drop(columns=["Embarked", "Sex"])
X_cat = X.loc[:, ["Embarked", "Sex"]]
X_cat.shape, X_num.shape

((891, 2), (891, 5))

In [97]:
# Learn the median of each numeric column; it is used to fill missing values.
imputer = SimpleImputer(strategy='median').fit(X_num)

# fit() returns the estimator itself, so displaying it shows the fitted imputer.
imputer

In [98]:
# Fill values learned during fit, one per column of X_num.
imputer.statistics_

array([ 3.    , 28.    ,  0.    ,  0.    , 14.4542])

In [99]:
# Replace missing numeric entries with the learned medians.
X_num_transformed = imputer.transform(X_num)

In [108]:
# Compare the container type before and after imputation.
type(X_num), type(X_num_transformed)

(pandas.core.frame.DataFrame, numpy.ndarray)

So the numerical columns have been imputed and stored in __an array__: `X_num_transformed` (the original `X_num` is still a DataFrame).

In [101]:
# Fill gaps in the categorical columns with each column's most common value:
# learn the modes first, then apply them.
cat_imputer = SimpleImputer(strategy='most_frequent').fit(X_cat)
X_cat_transformed = cat_imputer.transform(X_cat)

In [103]:
# The untransformed categorical slice is still a DataFrame.
type(X_cat)

pandas.core.frame.DataFrame

In [93]:
# Most frequent value found for each categorical column.
cat_imputer.statistics_

array(['S', 'male'], dtype=object)

In [105]:
# One-hot encode the imputed categorical values: learn the category sets,
# then expand every column into one indicator column per category.
cat_encoder = OneHotEncoder().fit(X_cat_transformed)
X_cat1hot = cat_encoder.transform(X_cat_transformed)

In [106]:
# Inspect the encoder output (its repr reports storage format and shape).
X_cat1hot

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1782 stored elements and shape (891, 5)>

In [107]:
# Categories the encoder learned, one array per input column.
cat_encoder.categories_

[array(['C', 'Q', 'S'], dtype=object), array(['female', 'male'], dtype=object)]

Recovering the column names takes a surprising amount of effort: convert the sparse array to a dense array, wrap it in a DataFrame, and then concatenate.

In [111]:
from scipy.sparse import csr_matrix, issparse  # csr_matrix is unused here; kept importable for other cells

# OneHotEncoder returns a SciPy sparse matrix by default; densify it so it can
# be wrapped in a DataFrame. issparse() recognises every sparse format, not only CSR.
if issparse(X_cat1hot):
    X_cat1hot = X_cat1hot.toarray()

# Take the one-hot column labels from the fitted encoder rather than typing
# them by hand, so they always match the encoder's actual column order.
onehot_labels = np.concatenate(cat_encoder.categories_)
X_cat_T = pd.DataFrame(X_cat1hot, columns=onehot_labels, index=X_num.index)

# Build X_num_T unconditionally so the name is always defined for the concat below.
X_num_T = pd.DataFrame(
    np.asarray(X_num_transformed), columns=X_num.columns, index=X_num.index
)

# Both frames carry X_num's index, so the column-wise concat lines up row for row.
XT = pd.concat([X_num_T, X_cat_T], axis=1)

# Check final shape
XT.shape

(891, 10)

Ok the battle has kinda begun _now_

In [114]:
# Standardise every column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows, before the cross-validation
# cells further down split the data, so validation folds influence the scaling
# statistics. Also note XT is rebound from a DataFrame to an ndarray here.
scaler = StandardScaler().fit(XT)
XT = scaler.transform(XT)
type(XT)

numpy.ndarray

In [121]:
# Shapes of the feature matrix and the target, to confirm the row counts agree.
dim = [arr.shape for arr in (XT, y)]

dim

[(891, 10), (891,)]

<span style="color:#FF0000; font-family: 'Bebas Neue'; font-size: 01em;">NOTE TO SELF:</span>

Work with NumPy arrays throughout and convert to a DataFrame only once, at the final concatenation step. When working with pipelines this is naturally much easier, as such small details take care of themselves.

For historical reasons (NumPy predates pandas), the outputs of the scaler, the imputer, and similar transformers are NumPy arrays. Also:

If you're building real-world ML models, you'll often end up using NumPy anyway, because:

- Many ML libraries (XGBoost, TensorFlow, PyTorch) work better with NumPy arrays.

- NumPy operations are faster than Pandas for large datasets.

- Sparse matrices (such as the output of `OneHotEncoder`) are stored as SciPy sparse objects rather than DataFrames.

In [126]:
# select baseline model as SGD classifier. 

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import f1_score, confusion_matrix

In [123]:
# Baseline linear classifier. SGD shuffles the training data between epochs, so
# the seed is fixed to make the cross-validated predictions and every later
# score computed from this estimator reproducible across runs.
sgd_clf = SGDClassifier(random_state=42)

# Out-of-fold predictions: each row is predicted by a model fitted on the other folds.
y_sgd_predict = cross_val_predict(sgd_clf, XT, y, cv=5)

In [140]:
# Per-fold F1 of the SGD baseline under 5-fold cross-validation.
sgd_score = cross_val_score(estimator=sgd_clf, X=XT, y=y, scoring='f1', cv=5)
sgd_score

array([0.65306122, 0.70866142, 0.74647887, 0.67669173, 0.71428571])

In [141]:
# Confusion matrix of the out-of-fold SGD predictions (rows: true class,
# columns: predicted class).
confx_matrix = confusion_matrix(y_true=y, y_pred=y_sgd_predict)
confx_matrix

array([[433, 116],
       [119, 223]], dtype=int64)

Wow, the performance is remarkably poor: the F1 score is only about 0.65–0.75 across the five folds.

In [137]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with k=4; weights='distance' makes closer neighbours
# count for more in the vote than distant ones.
knn_clf = KNeighborsClassifier(n_neighbors=4, weights='distance')

In [138]:
# Out-of-fold KNN predictions from 5-fold cross-validation.
y_knn_pred = cross_val_predict(estimator=knn_clf, X=XT, y=y, cv=5)

In [None]:
# Per-fold accuracy of the KNN classifier.
knn_score1 = cross_val_score(estimator=knn_clf, X=XT, y=y, scoring='accuracy', cv=5)
knn_score1

array([0.77094972, 0.76404494, 0.82022472, 0.78089888, 0.78089888])

In [149]:
# Per-fold F1 of the KNN classifier, directly comparable with the SGD F1 scores.
knn_score2 = cross_val_score(estimator=knn_clf, X=XT, y=y, scoring='f1', cv=5)
knn_score2

array([0.70503597, 0.68656716, 0.76470588, 0.69291339, 0.71532847])

In [143]:
# Confusion matrix of the out-of-fold KNN predictions (rows: true class,
# columns: predicted class).
conf_matx2 = confusion_matrix(y_true=y, y_pred=y_knn_pred)
conf_matx2

array([[458,  91],
       [102, 240]], dtype=int64)

KNN is marginally better than SGD. 

So far: 

- We have discarded the ['PassengerId', 'Name', 'Cabin', 'Ticket'] columns. Dropping PassengerId makes sense, since it is an arbitrarily assigned identifier. 
Cabin data is available for only 204 of the 891 passengers. Name and Ticket should ideally _not_ contribute anything; making any deductions from these strings is quite difficult anyway, 
- unless we can extract the ticket number explicitly while stripping prefixes such as `A/5` or `STON/O2.`. 