In [56]:
# Import libraries
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [57]:
# DATA INGESTION

# Load the training and test CSV files into DataFrames
train_path = '../data/train.csv'
test_path = '../data/test.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [58]:
# Peek at the last 10 rows of the training data
train_df.iloc[-10:]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
881,882,0,3,"Markun, Mr. Johann",male,33.0,0,0,349257,7.8958,,S
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5,,S
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.125,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [59]:
# PREPROCESSING

# The test data gets a placeholder Survived column filled with -1;
# the -1 marker is used later to tell test rows apart from training rows
test_df["Survived"] = -1

In [60]:
# Stack training and test rows into one frame with a fresh 0..n-1 index
frames = [train_df, test_df]
data = pd.concat(frames, ignore_index=True)

In [61]:
# FEATURE ENGINEERING

# Remove the columns that are not used as model features
unused_columns = ['PassengerId','Name','Cabin','Embarked','Ticket','Sex']
data = data.drop(columns=unused_columns)
data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
0,0,3,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,1,3,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,0,3,35.0,0,0,8.05


In [62]:
# Impute missing values in the 'Age' & 'Fare' columns using the median.
# The medians are learned from the training rows only (Survived != -1), so no
# statistics from the test rows leak into preprocessing; the learned medians
# are then applied to every row (training and test alike).
# NOTE(review): imputation still happens before the k-fold split further down,
# so validation folds share these medians with their training folds.
impute_cols = ['Age', 'Fare']
is_train_row = data['Survived'] != -1

imputer = SimpleImputer(strategy='median')
imputer.fit(data.loc[is_train_row, impute_cols])  # list selection keeps a 2-D DataFrame, as the imputer expects
data[impute_cols] = imputer.transform(data[impute_cols])

In [63]:
# Convert 'Sex' to a dummy variable (one-hot encoding) -- currently disabled: 'Sex' and 'Embarked' are dropped from `data` in the feature-engineering cell
#data = pd.get_dummies(data, columns=['Sex'], drop_first=True)

In [64]:
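# Handle the male outliers (> 63 years) by filling them with the median age -- currently disabled: it needs the 'Sex_male' dummy column, which is not created while the one-hot encoding is commented out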
#data.loc[(data['Sex_male']) & (data['Age'] > 63), 'Age'] = data['Age'].median()

In [65]:
# Show the first five rows after preprocessing
data.iloc[:5]

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
0,0,3,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,1,3,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,0,3,35.0,0,0,8.05


In [66]:
# Separate the combined frame back into training and test rows,
# using the -1 placeholder label to identify the test rows
is_test_row = data['Survived'] == -1
train_df = data[~is_test_row].reset_index(drop=True)
test_df = data[is_test_row].reset_index(drop=True)

In [67]:
# CROSS-VALIDATION
# Every row starts out unassigned (-1) in a new 'kfold' column
train_df['kfold'] = -1

# Shuffle the rows with a fixed seed and renumber the index from 0
shuffled = train_df.sample(frac=1, random_state=32)
train_df = shuffled.reset_index(drop=True)

# Target values, used to stratify the folds
y = train_df.Survived.values

# Initiate the stratified k-fold splitter
kf = model_selection.StratifiedKFold(n_splits=5)

# Label each row with the number of the fold in which it is held out.
# The splitter yields positional indices; they can be used with .loc here
# because the index was just reset to 0..n-1.
fold_splits = kf.split(X=train_df, y=y)
for fold_id, (train_idx, valid_idx) in enumerate(fold_splits):
    train_df.loc[valid_idx, 'kfold'] = fold_id

In [68]:
# Show the first five shuffled rows together with their fold labels
train_df.iloc[:5]

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,kfold
0,0,3,28.0,0,0,8.05,0
1,1,1,28.0,1,0,133.65,0
2,1,1,28.0,1,0,89.1042,0
3,0,3,28.0,0,0,7.7375,0
4,0,2,28.0,0,0,0.0,0


In [69]:
# Per-fold validation accuracies
lst = []

# Fit and evaluate one model per fold
for fold in range(5):
    # Rows labelled with the current fold are held out for validation;
    # all remaining rows are used for fitting
    held_out = train_df.kfold == fold
    df_train = train_df[~held_out].reset_index(drop=True)
    df_valid = train_df[held_out].reset_index(drop=True)

    # Split each frame into a feature matrix (numpy array) and a target vector;
    # 'Survived' is the target and 'kfold' is bookkeeping, so neither is a feature
    non_feature_cols = ['Survived', 'kfold']
    x_train = df_train.drop(columns=non_feature_cols).values
    y_train = df_train['Survived'].values
    x_valid = df_valid.drop(columns=non_feature_cols).values
    y_valid = df_valid['Survived'].values

    # Standardize using statistics learned from the fitting rows only
    scaler = StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_valid = scaler.transform(x_valid)

    # INITIALIZE THE MODEL and fit it on the training part of the fold
    model = LogisticRegression(random_state=32).fit(x_train, y_train)

    # Predict the held-out rows and score the predictions
    preds = model.predict(x_valid)
    accuracy = metrics.accuracy_score(y_valid, preds)
    print(f"Fold = {fold}, Accuracy = {accuracy}")

    lst.append(accuracy)

Fold = 0, Accuracy = 0.6927374301675978
Fold = 1, Accuracy = 0.7078651685393258
Fold = 2, Accuracy = 0.6573033707865169
Fold = 3, Accuracy = 0.7191011235955056
Fold = 4, Accuracy = 0.7247191011235955


In [70]:
# Mean of the per-fold accuracies
n_folds = len(lst)
Average = sum(lst) / n_folds
print(f"Average accuracy = {Average}")

Average accuracy = 0.7003452388425083


In [71]:
# Make predictions on the test data
#
# The cross-validation loop standardizes the features before fitting, so the
# test features must go through the same kind of scaling before prediction;
# feeding raw values to a model fit on standardized inputs gives meaningless
# predictions. The loop also only leaves behind the last fold's model, which
# saw 4/5 of the training rows. Therefore refit the scaler and the model on
# the full training set and scale the test features with that scaler.
feature_cols = [col for col in train_df.columns if col not in ('Survived', 'kfold')]

final_scaler = StandardScaler()
x_full = final_scaler.fit_transform(train_df[feature_cols].values)
y_full = train_df['Survived'].values

final_model = LogisticRegression(random_state=32)
final_model.fit(x_full, y_full)

# Same column order as used for fitting
x_test = final_scaler.transform(test_df[feature_cols].values)
test_predictions = final_model.predict(x_test)

In [72]:
# Prepare the submission file: load the existing submission CSV and
# set its 'Survived' column to the test predictions
submission_template = pd.read_csv('../data/submission.csv')
submission = submission_template.assign(Survived=test_predictions)
submission.head(20)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,0
9,901,0


In [73]:
# Save to CSV (without the DataFrame index)
# NOTE(review): this writes to the same path the submission frame was read from
submission_path = '../data/submission.csv'
submission.to_csv(submission_path, index=False)
print("Submission file saved as 'submission.csv'.")

Submission file saved as 'submission.csv'.
