In [9]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
# Settings
import sys
sys.path.append('../src')
# Import feature engineering functions
from preprocessing import preprocess_data, drop_columns

In [10]:
# DATA INGESTION

# Fetch the data
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

In [11]:
# PREPROCESSING

# Create a fake Survived column for test data
test_df.loc[:,"Survived"] = -1

In [12]:
# Concatenate both training and test data
data = pd.concat([train_df,test_df]).reset_index(drop=True)

In [13]:
# Check the DataFrame
data.tail(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1299,1300,-1,3,"Riordan, Miss. Johanna Hannah""""",female,,0,0,334915,7.7208,,Q
1300,1301,-1,3,"Peacock, Miss. Treasteall",female,3.0,1,1,SOTON/O.Q. 3101315,13.775,,S
1301,1302,-1,3,"Naughton, Miss. Hannah",female,,0,0,365237,7.75,,Q
1302,1303,-1,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37.0,1,0,19928,90.0,C78,Q
1303,1304,-1,3,"Henriksson, Miss. Jenny Lovisa",female,28.0,0,0,347086,7.775,,S
1304,1305,-1,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
1305,1306,-1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
1306,1307,-1,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
1307,1308,-1,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
1308,1309,-1,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


In [14]:
# FEATURE ENGINEERING
data = preprocess_data(data)

In [15]:
data

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Embarked_S,FamilySize,Sex_male,Pclass_2,Pclass_3
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,,S,1,2,1,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,0,2,0,0,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,,S,1,1,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,C123,S,1,2,0,0,0
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,,S,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,-1,"Spector, Mr. Woolf",28.0,0,0,A.5. 3236,8.0500,,S,1,1,1,0,1
1305,1306,-1,"Oliva y Ocana, Dona. Fermina",39.0,0,0,PC 17758,108.9000,C105,C,0,1,0,0,0
1306,1307,-1,"Saether, Mr. Simon Sivertsen",38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,1,1,1,0,1
1307,1308,-1,"Ware, Mr. Frederick",28.0,0,0,359309,8.0500,,S,1,1,1,0,1


In [16]:
data.isnull().sum()

PassengerId       0
Survived          0
Name              0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin          1014
Embarked          0
Embarked_S        0
FamilySize        0
Sex_male          0
Pclass_2          0
Pclass_3          0
dtype: int64

In [17]:
data = drop_columns(data)

In [18]:
data

Unnamed: 0,PassengerId,Survived,Age,Fare,Embarked_S,FamilySize,Sex_male,Pclass_2,Pclass_3
0,1,0,22.0,7.2500,1,2,1,0,1
1,2,1,38.0,71.2833,0,2,0,0,0
2,3,1,26.0,7.9250,1,1,0,0,1
3,4,1,35.0,53.1000,1,2,0,0,0
4,5,0,35.0,8.0500,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...
1304,1305,-1,28.0,8.0500,1,1,1,0,1
1305,1306,-1,39.0,108.9000,0,1,0,0,0
1306,1307,-1,38.5,7.2500,1,1,1,0,1
1307,1308,-1,28.0,8.0500,1,1,1,0,1


In [19]:

# Drop the columns
#data.drop(['Embarked', 'SibSp', 'Parch'], axis=1, inplace=True) #,'SibSp','Parch'

In [20]:

# Convert categorical variables or the ones that represent such ('Pclass') to dummy/numerical variables (one-hot encoding)
#data = pd.get_dummies(data, columns=['Sex','Pclass'], dtype=int, drop_first=True) # drop_first to avoid collinearity #'Embarked', 'Sex', 'AgeGroup','Pclass'

In [21]:
# split the training and test data again
train_df = data[data.Survived != -1].reset_index(drop=True) 
test_df = data[data.Survived == -1].reset_index(drop=True)

In [22]:
test_df.head(5)

Unnamed: 0,PassengerId,Survived,Age,Fare,Embarked_S,FamilySize,Sex_male,Pclass_2,Pclass_3
0,892,-1,34.5,7.8292,0,1,1,0,1
1,893,-1,47.0,7.0,1,2,0,0,1
2,894,-1,62.0,9.6875,0,1,1,1,0
3,895,-1,27.0,8.6625,1,1,1,0,1
4,896,-1,22.0,12.2875,1,3,0,0,1


In [23]:
train_df.head(5)

Unnamed: 0,PassengerId,Survived,Age,Fare,Embarked_S,FamilySize,Sex_male,Pclass_2,Pclass_3
0,1,0,22.0,7.25,1,2,1,0,1
1,2,1,38.0,71.2833,0,2,0,0,0
2,3,1,26.0,7.925,1,1,0,0,1
3,4,1,35.0,53.1,1,2,0,0,0
4,5,0,35.0,8.05,1,1,1,0,1


In [24]:
# CROSS-VALIDATION
# We create a new column called kfold and fill it with -1
train_df['kfold'] = -1

# The next step is to randomize the rows of the data
train_df = train_df.sample(frac=1,random_state=32).reset_index(drop=True)

# Fetch the targets
y = train_df.Survived.values

# Inititate the kfold class
kf = model_selection.StratifiedKFold(n_splits=5)

# fill the new kfold column
for f, (t_, v_) in enumerate(kf.split(X=train_df,y=y)):
    train_df.loc[v_,'kfold'] = f

In [25]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Age,Fare,Embarked_S,FamilySize,Sex_male,Pclass_2,Pclass_3,kfold
0,838,0,28.0,8.05,1,1,1,0,1,0
1,335,1,28.0,133.65,1,2,0,0,0,0
2,850,1,28.0,89.1042,0,2,0,0,0,0
3,779,0,28.0,7.7375,0,1,1,0,1,0
4,414,0,28.0,0.0,1,1,1,1,0,0


In [26]:
# Collect accuracies
lst = []

# Loop folds
for fold in range(0,5):
    # Training data is where kfold is not equal to provided fold
    df_train = train_df[train_df.kfold != fold].reset_index(drop=True)
    
    # Validation data is where kfold is equal to provided fold
    df_valid = train_df[train_df.kfold == fold].reset_index(drop=True)

    # Drop the Survived and kfold column from dataframe and convert it to a numpy array
    x_train = df_train.drop(['Survived','kfold'],axis=1).values
    y_train = df_train.Survived.values

    # Similarly, for validation
    x_valid = df_valid.drop(['Survived','kfold'],axis=1).values
    y_valid = df_valid.Survived.values

    scaler = StandardScaler() # Solved ConvergenceWarning
    x_train = scaler.fit_transform(x_train)
    x_valid = scaler.transform(x_valid)

    # INITIALIZE THE MODEL & FINE-TUNING
    model = LogisticRegression(max_iter=1000, random_state=32)
    # Fit the model on training data
    model.fit(x_train,y_train)

    # Create predictions for validations samples
    preds = model.predict(x_valid)

    # Calculate & print accuracy
    accuracy = metrics.accuracy_score(y_valid,preds)
    print(f"Fold = {fold}, Accuracy = {accuracy}")

    lst.append(accuracy)

Fold = 0, Accuracy = 0.776536312849162
Fold = 1, Accuracy = 0.7921348314606742
Fold = 2, Accuracy = 0.8033707865168539
Fold = 3, Accuracy = 0.8089887640449438
Fold = 4, Accuracy = 0.7865168539325843


In [27]:
Average = sum(lst) / len(lst) 
print(f"Average accuracy = {Average}")

Average accuracy = 0.7935095097608437


In [28]:
# Make predictions on the test data
#test_predictions = model.predict(test_df.drop('Survived',axis=1).values)

In [29]:
# Make predictions on the test data using predict_proba to get probabilities
test_probabilities = model.predict_proba(test_df.drop('Survived', axis=1).values)[:, 1]

# Set a custom threshold (e.g., 0.3) to predict class labels instead of default 0.5
custom_threshold = 0.5 # lower threshold - capturing more survivors; higher threshold - stricter about predicting survival.
test_predictions = (test_probabilities >= custom_threshold).astype(int)

In [30]:
# Prepare the submission file
submission = pd.read_csv('../data/submission.csv')
submission['Survived'] = test_predictions
submission.head(60)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,0
9,901,0


In [31]:
# Save to CSV
#submission.to_csv('../data/submission.csv', index=False)
#print("Submission file saved as 'submission.csv'.")