In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
%matplotlib inline
import warnings
# NOTE(review): called without a category or module filter, this silences every warning
# raised after this point (e.g. deprecation or convergence warnings), not just a specific one.
warnings.filterwarnings("ignore")
# Settings
import sys
# Add ../src (resolved against the notebook's working directory) to the import path so the
# `preprocessing` import below can be found — presumably that module lives there; confirm.
sys.path.append('../src')
# Import feature engineering functions
from preprocessing import preprocess_data, drop_columns

In [2]:
# DATA INGESTION

# Read the training and test CSVs (in that order) into their own DataFrames
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [3]:
# PREPROCESSING

# The test frame gets a placeholder Survived column filled with the sentinel -1,
# so its rows can be told apart from training rows later on
test_df["Survived"] = -1

In [4]:
# Stack the training rows on top of the test rows and renumber the index from 0
data = pd.concat([train_df, test_df], ignore_index=True)

In [5]:
# Peek at the end of the combined frame (the end is where test_df was appended)
data.tail(n=10)

In [6]:
# FEATURE ENGINEERING
# Run the imported preprocess_data helper over the combined train+test frame.
# Its transformations are defined in the `preprocessing` module, not in this notebook.
# NOTE(review): `data` is rebound to the result, so re-running this cell passes the
# already-processed frame back into preprocess_data.
data = preprocess_data(data)

In [7]:
# Display the frame returned by preprocess_data.
# NOTE(review): this renders `data` itself rather than a slice; data.head() or
# data.sample(5) would give a more compact preview.
data

In [8]:
# Number of missing values in each column
data.isna().sum()

In [9]:
# Pass the frame through the imported drop_columns helper; which columns it removes is
# decided in the `preprocessing` module, not here.
# NOTE(review): `data` is rebound, so re-running this cell calls drop_columns on its own output.
data = drop_columns(data)

In [None]:

# Drop the columns
#data.drop(['Embarked', 'SibSp', 'Parch'], axis=1, inplace=True) #,'SibSp','Parch'

In [None]:

# Convert categorical variables or the ones that represent such ('Pclass') to dummy/numerical variables (one-hot encoding)
#data = pd.get_dummies(data, columns=['Sex','Pclass'], dtype=int, drop_first=True) # drop_first to avoid collinearity #'Embarked', 'Sex', 'AgeGroup','Pclass'

In [10]:
# Separate the combined frame again: rows whose Survived equals the -1 sentinel form
# the test set, all other rows form the training set. Both get a fresh 0-based index.
train_df = data.loc[data["Survived"].ne(-1)].reset_index(drop=True)
test_df = data.loc[data["Survived"].eq(-1)].reset_index(drop=True)

In [11]:
# First five rows of the test split
test_df.head()

In [12]:
# First five rows of the training split
train_df.head()

In [13]:
# CROSS-VALIDATION
# Add a kfold column (initialised to -1), then shuffle the rows with a fixed seed
# and renumber the index so positions and labels line up again
train_df = (
    train_df
    .assign(kfold=-1)
    .sample(frac=1, random_state=32)
    .reset_index(drop=True)
)

# Target vector used for stratification
y = train_df["Survived"].values

# Stratified 5-fold splitter
kf = model_selection.StratifiedKFold(n_splits=5)

# Record, for every row, the number of the fold in which it is a validation row
for fold_id, (train_idx, valid_idx) in enumerate(kf.split(train_df, y)):
    train_df.loc[valid_idx, 'kfold'] = fold_id

In [14]:
# Preview the shuffled training frame, now carrying the kfold column
train_df.head(n=5)

In [15]:
# Validation accuracy of each fold, in fold order
lst = []

for fold in range(5):
    # Rows tagged with the current fold are held out; every other row is used for fitting
    in_valid_fold = train_df.kfold == fold
    df_train = train_df[~in_valid_fold].reset_index(drop=True)
    df_valid = train_df[in_valid_fold].reset_index(drop=True)

    # Feature matrices exclude the target and the fold bookkeeping column
    x_train = df_train.drop(columns=['Survived', 'kfold']).values
    x_valid = df_valid.drop(columns=['Survived', 'kfold']).values
    y_train = df_train['Survived'].values
    y_valid = df_valid['Survived'].values

    # Standardise using statistics from the training part only, then apply the same
    # transform to the validation part (scaling also addressed a ConvergenceWarning)
    scaler = StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_valid = scaler.transform(x_valid)

    # Build and fit the classifier on the training part
    model = LogisticRegression(max_iter=1000, random_state=32).fit(x_train, y_train)

    # Score the held-out part
    preds = model.predict(x_valid)
    accuracy = metrics.accuracy_score(y_valid, preds)
    print(f"Fold = {fold}, Accuracy = {accuracy}")

    lst.append(accuracy)



In [16]:
# Mean of the per-fold accuracies stored in `lst`
n_folds_scored = len(lst)
Average = sum(lst) / n_folds_scored
print(f"Average accuracy = {Average}")



In [17]:
# Make predictions on the test data
#test_predictions = model.predict(test_df.drop('Survived',axis=1).values)

In [18]:
# Make predictions on the test data using predict_proba to get probabilities.
#
# A final scaler and model are fitted here on the complete training set. The `model`
# left behind by the cross-validation loop was trained on only four of the five folds
# and on StandardScaler-transformed features, so feeding it the raw test matrix would
# put the inputs on a different scale from the one its coefficients were learned on.

# Feature columns = every test column except the placeholder target. Selecting the same
# names from train_df also leaves out its extra `kfold` column and keeps the column order.
feature_cols = [col for col in test_df.columns if col != 'Survived']

final_scaler = StandardScaler()
x_full = final_scaler.fit_transform(train_df[feature_cols].values)
y_full = train_df.Survived.values

# Same hyper-parameters as the model evaluated in cross-validation
final_model = LogisticRegression(max_iter=1000, random_state=32)
final_model.fit(x_full, y_full)

# Scale the test features with the training-set statistics, then keep the probability of
# the second class in final_model.classes_ (Survived == 1 when labels are 0/1)
x_test = final_scaler.transform(test_df[feature_cols].values)
test_probabilities = final_model.predict_proba(x_test)[:, 1]

# Decision threshold on the survival probability. 0.5 reproduces the default behaviour of
# predict(); lower it (e.g. 0.3) to capture more survivors, raise it to be stricter about
# predicting survival.
custom_threshold = 0.5
test_predictions = (test_probabilities >= custom_threshold).astype(int)

In [19]:
# Prepare the submission file: load the existing submission CSV and set its
# Survived column to the thresholded test predictions
submission = pd.read_csv('../data/submission.csv').assign(Survived=test_predictions)
submission.tail(n=60)

In [20]:
# Save to CSV (index=False keeps the DataFrame index out of the file).
# NOTE(review): the target path is the same '../data/submission.csv' that the
# submission frame was read from, so the file that was loaded is overwritten.
submission.to_csv('../data/submission.csv', index=False)
print("Submission file saved as 'submission.csv'.")

