In [365]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [366]:
# DATA INGESTION

# Fetch the data
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

In [367]:
# Check the DataFrame
train_df.tail(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
881,882,0,3,"Markun, Mr. Johann",male,33.0,0,0,349257,7.8958,,S
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5,,S
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.125,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [368]:
# PREPROCESSING

# Create a fake Survived column for test data
test_df.loc[:,"Survived"] = -1

In [369]:
# Concatenate both training and test data
data = pd.concat([train_df,test_df]).reset_index(drop=True)

In [370]:
# FEATURE ENGINEERING

# Drop non interested columns
data.drop(['PassengerId','Name','Cabin','Ticket'], axis=1, inplace=True)
data.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S
5,0,3,male,,0,0,8.4583,Q
6,0,1,male,54.0,0,0,51.8625,S
7,0,3,male,2.0,3,1,21.075,S
8,1,3,female,27.0,0,2,11.1333,S
9,1,2,female,14.0,1,0,30.0708,C


In [371]:
# Impute missing values in the entire 'Age' & 'Fare' columns using median
data['Age']=data['Age'].fillna(data['Age'].median(skipna=True)) #skipna=True to exclude NaN values when computing the result
data['Fare']=data['Fare'].fillna(data['Fare'].mean(skipna=True))
# Fill in missing values with the most common embarkation value - 'S' * Parch
data["Embarked"].fillna(data['Embarked'].value_counts().idxmax(), inplace=True)
data["Parch"].fillna(data['Parch'].value_counts().idxmax(), inplace=True)
# Bin Age groups
bins = [0, 3, 12, 18, 35, 55, 65, float('inf')]
labels = ['Infant/Toddler', 'Child', 'Teenager/Adolescent', 'Young Adult', 'Middle-Aged Adult', 'Older Adult', 'Senior/Elderly']
# Create a new column 'AgeGroup' with the binned age categories
data['AgeGroup'] = pd.cut(data['Age'], bins=bins, labels=labels, include_lowest=True)
# Drop the columns
data.drop(['Age'], axis=1, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Embarked"].fillna(data['Embarked'].value_counts().idxmax(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Parch"].fillna(data['Parch'].value_counts().idxmax(), inplace=True)


In [372]:
data.isnull().sum()

Survived    0
Pclass      0
Sex         0
SibSp       0
Parch       0
Fare        0
Embarked    0
AgeGroup    0
dtype: int64

In [373]:
#

In [374]:
# Bin SibSp and Parch to group the data 
#data['SibSp'] = pd.cut(data['SibSp'], bins=[-1, 0, 1, 3, 8], labels=['0', '1', '2', '3'])
#data['Parch'] = pd.cut(data['Parch'], bins=[-1, 0, 1, 3, 6], labels=['0', '1', '2', '3'])

In [375]:
data.isnull().sum()

Survived    0
Pclass      0
Sex         0
SibSp       0
Parch       0
Fare        0
Embarked    0
AgeGroup    0
dtype: int64

In [376]:
# Join two features to simplify modelling
data['TravelAlone']=np.where((data["SibSp"]+data["Parch"])>0, 0, 1)
# First, bin the fare into logical groups if you haven't already
fare_bins = [0, 10, 50, 100, 250, 600]
fare_labels = ['Low', 'Medium', 'High', 'Very High', 'Luxury']
data['FareGroup'] = pd.cut(data['Fare'], bins=fare_bins, labels=fare_labels)
data.head()
# Combine Pclass and FareGroup into a single categorical feature
data['Socioeconomic'] = data['Pclass'].astype(str) + '_' + data['FareGroup'].astype(str)
# Drop the columns
data.drop(['SibSp','Parch','Fare','FareGroup','Pclass'], axis=1, inplace=True)

# Convert categorical variables or the ones that represent such ('Pclass') to dummy/numerical variables (one-hot encoding)
data = pd.get_dummies(data, columns=['Embarked', 'Sex', 'AgeGroup', 'Socioeconomic'], dtype=int, drop_first=True) # drop_first to avoid collinearity

In [377]:
# Now handle the male outliers (> 63 years) by filling them with the median age
#data.loc[(data['Sex_male']) & (data['Age'] > 63), 'Age'] = data['Age'].median()

In [378]:
data.head(40)

Unnamed: 0,Survived,TravelAlone,Embarked_Q,Embarked_S,Sex_male,AgeGroup_Child,AgeGroup_Teenager/Adolescent,AgeGroup_Young Adult,AgeGroup_Middle-Aged Adult,AgeGroup_Older Adult,...,Socioeconomic_1_Very High,Socioeconomic_1_nan,Socioeconomic_2_High,Socioeconomic_2_Low,Socioeconomic_2_Medium,Socioeconomic_2_nan,Socioeconomic_3_High,Socioeconomic_3_Low,Socioeconomic_3_Medium,Socioeconomic_3_nan
0,0,0,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
5,0,1,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
6,0,1,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
9,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [379]:
# split the training and test data again
train_df = data[data.Survived != -1].reset_index(drop=True) 
test_df = data[data.Survived == -1].reset_index(drop=True)

In [380]:
# CROSS-VALIDATION
# We create a new column called kfold and fill it with -1
train_df['kfold'] = -1

# The next step is to randomize the rows of the data
train_df = train_df.sample(frac=1,random_state=32).reset_index(drop=True)

# Fetch the targets
y = train_df.Survived.values

# Inititate the kfold class
kf = model_selection.StratifiedKFold(n_splits=5)

# fill the new kfold column
for f, (t_, v_) in enumerate(kf.split(X=train_df,y=y)):
    train_df.loc[v_,'kfold'] = f

In [381]:
train_df.head()

Unnamed: 0,Survived,TravelAlone,Embarked_Q,Embarked_S,Sex_male,AgeGroup_Child,AgeGroup_Teenager/Adolescent,AgeGroup_Young Adult,AgeGroup_Middle-Aged Adult,AgeGroup_Older Adult,...,Socioeconomic_1_nan,Socioeconomic_2_High,Socioeconomic_2_Low,Socioeconomic_2_Medium,Socioeconomic_2_nan,Socioeconomic_3_High,Socioeconomic_3_Low,Socioeconomic_3_Medium,Socioeconomic_3_nan,kfold
0,0,1,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,1,0,1,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [382]:
# Collect accuracies
lst = []

# Loop folds
for fold in range(0,5):
    # Training data is where kfold is not equal to provided fold
    df_train = train_df[train_df.kfold != fold].reset_index(drop=True)
    
    # Validation data is where kfold is equal to provided fold
    df_valid = train_df[train_df.kfold == fold].reset_index(drop=True)

    # Drop the Survived and kfold column from dataframe and convert it to a numpy array
    x_train = df_train.drop(['Survived','kfold'],axis=1).values
    y_train = df_train.Survived.values

    # Similarly, for validation
    x_valid = df_valid.drop(['Survived','kfold'],axis=1).values
    y_valid = df_valid.Survived.values

    scaler = StandardScaler() # Solved ConvergenceWarning
    x_train = scaler.fit_transform(x_train)
    x_valid = scaler.transform(x_valid)

    # INITIALIZE THE MODEL & FINE-TUNING
    model = LogisticRegression(random_state=32)

    # Fit the model on training data
    model.fit(x_train,y_train)

    # Create predictions for validations samples
    preds = model.predict(x_valid)

    # Calculate & print accuracy
    accuracy = metrics.accuracy_score(y_valid,preds)
    print(f"Fold = {fold}, Accuracy = {accuracy}")

    lst.append(accuracy)

Fold = 0, Accuracy = 0.8044692737430168
Fold = 1, Accuracy = 0.8146067415730337
Fold = 2, Accuracy = 0.8146067415730337
Fold = 3, Accuracy = 0.797752808988764
Fold = 4, Accuracy = 0.797752808988764


In [383]:
Average = sum(lst) / len(lst) 
print(f"Average accuracy = {Average}")

Average accuracy = 0.8058376749733224


In [384]:
# Make predictions on the test data
test_predictions = model.predict(test_df.drop('Survived',axis=1).values)

In [385]:
# Prepare the submission file
submission = pd.read_csv('../data/submission.csv')
submission['Survived'] = test_predictions
submission.head(50)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,0
9,901,0


In [386]:
# Save to CSV
submission.to_csv('../data/submission.csv', index=False)
print("Submission file saved as 'submission.csv'.")

Submission file saved as 'submission.csv'.
