In [150]:
# Import libraries
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

%matplotlib inline

In [151]:
# DATA INGESTION

# Fetch the data
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

In [152]:
# Check the DataFrame
train_df.tail(50)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
841,842,0,2,"Mudd, Mr. Thomas Charles",male,16.0,0,0,S.O./P.P. 3,10.5,,S
842,843,1,1,"Serepeca, Miss. Augusta",female,30.0,0,0,113798,31.0,,C
843,844,0,3,"Lemberopolous, Mr. Peter L",male,34.5,0,0,2683,6.4375,,C
844,845,0,3,"Culumovic, Mr. Jeso",male,17.0,0,0,315090,8.6625,,S
845,846,0,3,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.55,,S
846,847,0,3,"Sage, Mr. Douglas Bullen",male,,8,2,CA. 2343,69.55,,S
847,848,0,3,"Markoff, Mr. Marin",male,35.0,0,0,349213,7.8958,,C
848,849,0,2,"Harper, Rev. John",male,28.0,0,1,248727,33.0,,S
849,850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",female,,1,0,17453,89.1042,C92,C
850,851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4.0,4,2,347082,31.275,,S


In [153]:
# PREPROCESSING

# Create a fake Survived column for test data
test_df.loc[:,"Survived"] = -1

In [154]:
# Concatenate both training and test data
data = pd.concat([train_df,test_df]).reset_index(drop=True)

In [155]:
# FEATURE ENGINEERING

# Drop non interested columns
data.drop(['PassengerId','Name','Cabin','Embarked','Sex','Ticket','Age','Fare'], axis=1, inplace=True)
data.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch
0,0,3,1,0
1,1,1,1,0
2,1,3,0,0
3,1,1,1,0
4,0,3,0,0


In [156]:
# split the training and test data again
train_df = data[data.Survived != -1].reset_index(drop=True) 
test_df = data[data.Survived == -1].reset_index(drop=True)

In [157]:
# CROSS-VALIDATION
# We create a new column called kfold and fill it with -1
train_df['kfold'] = -1

# The next step is to randomize the rows of the data
train_df = train_df.sample(frac=1,random_state=32).reset_index(drop=True)

# Fetch the targets
y = train_df.Survived.values

# Inititate the kfold class
kf = model_selection.StratifiedKFold(n_splits=5)

# fill the new kfold column
for f, (t_, v_) in enumerate(kf.split(X=train_df,y=y)):
    train_df.loc[v_,'kfold'] = f

In [158]:
train_df.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,kfold
0,0,3,0,0,0
1,1,1,1,0,0
2,1,1,1,0,0
3,0,3,0,0,0
4,0,2,0,0,0


In [159]:
# Collect accuracies
lst = []

# Loop folds
for fold in range(0,5):
    # Training data is where kfold is not equal to provided fold
    df_train = train_df[train_df.kfold != fold].reset_index(drop=True)
    
    # Validation data is where kfold is equal to provided fold
    df_valid = train_df[train_df.kfold == fold].reset_index(drop=True)

    # Drop the Survived column from dataframe and convert it to a numpy array
    x_train = df_train.drop('Survived',axis=1).values
    y_train = df_train.Survived.values

    # Similarly, for validation we have
    x_valid = df_valid.drop('Survived',axis=1).values
    y_valid = df_valid.Survived.values

    # INITIALIZE THE MODEL & FINE-TUNNING
    model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear', class_weight='balanced',random_state=32)

    # Fit the model on training data
    model.fit(x_train,y_train)

    # Create predictions for validations samples
    preds = model.predict(x_valid)

    # Calculate & print accuracy
    accuracy = metrics.accuracy_score(y_valid,preds)
    print(f"Fold = {fold}, Accuracy = {accuracy}")

    lst.append(accuracy)

Fold = 0, Accuracy = 0.664804469273743
Fold = 1, Accuracy = 0.6685393258426966
Fold = 2, Accuracy = 0.6460674157303371
Fold = 3, Accuracy = 0.6797752808988764
Fold = 4, Accuracy = 0.6404494382022472


In [160]:
Average = sum(lst) / len(lst) 
print(f"Average accuracy = {Average}")

Average accuracy = 0.6599271859895801


In [161]:
# Make predictions on the test data
test_predictions = model.predict(test_df.values)

In [162]:
# Prepare the submission file
submission = pd.read_csv('../data/submission.csv')
submission['Survived'] = test_predictions
submission.Survived.value_counts()

Survived
1    418
Name: count, dtype: int64

In [163]:
# Save to CSV
submission.to_csv('../data/submission.csv', index=False)
print("Submission file saved as 'submission.csv'.")

Submission file saved as 'submission.csv'.
