In [123]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [54]:
# Read the training CSV into a DataFrame.
train_data = pd.read_csv(r"./data/train.csv")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that only appends a stray "None" line).
train_data.info()
# Preview a few rows; the fixed seed keeps the preview identical across runs.
train_data.sample(3, random_state=0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
377,378,0,1,"Widener, Mr. Harry Elkins",male,27.0,0,2,113503,211.5,C82,C
555,556,0,1,"Wright, Mr. George",male,62.0,0,0,113807,26.55,,S
701,702,1,1,"Silverthorne, Mr. Spencer Victor",male,35.0,0,0,PC 17475,26.2875,E24,S


In [114]:
def preprocess_data(subset, age_fill=None, embarked_categories=("C", "Q", "S")):
    """Turn the raw passenger columns into a numeric feature matrix.

    The result has one row per row of ``subset`` and five columns, in this
    order: Age, Pclass, Sex, SibSp, Embarked.

    Parameters
    ----------
    subset : pandas.DataFrame
        Must contain the columns "Age", "Pclass", "Sex", "SibSp" and
        "Embarked". Other columns are ignored.
    age_fill : float, optional
        Value substituted for missing ages. When omitted, the mean of the
        non-missing ages in ``subset`` is used. Pass the training-set mean
        when transforming held-out data so both sets are imputed identically.
    embarked_categories : sequence of str, optional
        Embarkation codes in the order of their integer encoding
        (index 0, 1, 2, ...). A missing or unlisted value is encoded as
        ``len(embarked_categories)``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(subset), 5)``.
    """
    # Age: mean imputation (or a caller-supplied fill value).
    age_series = subset["Age"].astype(float)
    if age_fill is None:
        age_fill = age_series.mean()
    age = age_series.fillna(age_fill).to_numpy().reshape(-1, 1)

    Pclass = np.asarray(subset["Pclass"]).reshape(-1, 1)

    # Sex: 0 for "male", 1 for anything else.
    sex = (np.asarray(subset["Sex"], dtype=object) != "male").astype(int).reshape(-1, 1)

    SibSp = np.asarray(subset["SibSp"]).reshape(-1, 1)

    # Embarked: encode against a fixed category list so the integer codes do
    # not depend on which values happen to occur in this particular frame
    # (a per-call fitted encoder can number the same port differently for
    # training and test data).
    categories = list(embarked_categories)
    codes = pd.Categorical(subset["Embarked"], categories=categories).codes.astype(int)
    # pandas marks missing/unlisted values with -1; give them their own code.
    embarked = np.where(codes < 0, len(categories), codes).reshape(-1, 1)

    features = np.concatenate((age, Pclass, sex, SibSp, embarked), axis=1).astype(float)
    return features

In [115]:
# Keep only the label and the five raw inputs that preprocess_data reads.
subset = train_data.loc[:, ["Survived", "Age", "Pclass", "Sex", "SibSp", "Embarked"]]

# Label vector (one entry per training row).
y = subset["Survived"].to_numpy(copy=True)

# Numeric feature matrix for the training rows.
features = preprocess_data(subset)
print(features)


[[22.          3.          0.          1.          2.        ]
 [38.          1.          1.          1.          0.        ]
 [26.          3.          1.          0.          2.        ]
 ...
 [29.69911765  3.          1.          1.          2.        ]
 [26.          1.          0.          0.          0.        ]
 [32.          3.          0.          0.          1.        ]]


In [116]:
def logistic_regression_predictions(test_features):
    """Fit logistic regression on the notebook-level training data and predict.

    The model is trained on the global ``features`` / ``y`` arrays.

    Returns a tuple ``(score, predictions)`` where ``score`` is the fitted
    model's ``.score`` on the training data and ``predictions`` is a column
    vector (shape ``(n, 1)``) of predicted labels for ``test_features``.
    """
    model = LogisticRegression(random_state=0).fit(features, y)
    train_score = model.score(features, y)
    predicted = model.predict(test_features)
    return train_score, predicted.reshape(-1, 1)

In [117]:
def svm_prediction(test_features):
    """Fit a linear-kernel SVC on the notebook-level training data and predict.

    The model is trained on the global ``features`` / ``y`` arrays.

    Returns a tuple ``(score, predictions)`` where ``score`` is the fitted
    model's ``.score`` on the training data and ``predictions`` is a column
    vector (shape ``(n, 1)``) of predicted labels for ``test_features``.
    """
    classifier = svm.SVC(kernel='linear')
    classifier.fit(features, y)
    train_score = classifier.score(features, y)
    predicted = classifier.predict(test_features)
    return train_score, predicted.reshape(-1, 1)

In [118]:
def naive_bayes_prediction(test_features):
    """Fit Gaussian naive Bayes on the notebook-level training data and predict.

    The model is trained on the global ``features`` / ``y`` arrays.

    Returns a tuple ``(score, predictions)`` where ``score`` is the fitted
    model's ``.score`` on the training data and ``predictions`` is a column
    vector (shape ``(n, 1)``) of predicted labels for ``test_features``.
    """
    model = GaussianNB().fit(features, y)
    train_score = model.score(features, y)
    predicted = model.predict(test_features)
    return train_score, predicted.reshape(-1, 1)

In [119]:
def KNN_prediction(test_features):
    """Fit a 5-nearest-neighbours classifier on the training data and predict.

    The model is trained on the global ``features`` / ``y`` arrays.

    Returns a tuple ``(score, predictions)`` where ``score`` is the fitted
    model's ``.score`` on the training data and ``predictions`` is a column
    vector (shape ``(n, 1)``) of predicted labels for ``test_features``.
    """
    model = KNeighborsClassifier(n_neighbors=5).fit(features, y)
    train_score = model.score(features, y)
    predicted = model.predict(test_features)
    return train_score, predicted.reshape(-1, 1)

In [129]:
def random_forest_prediction(test_features, random_state=0):
    """Fit a depth-limited random forest on the training data and predict.

    The model is trained on the global ``features`` / ``y`` arrays.

    Parameters
    ----------
    test_features : array-like
        Feature matrix to predict labels for.
    random_state : int or None, optional
        Seed forwarded to ``RandomForestClassifier``. A fixed seed makes the
        reported score and the predictions repeatable from run to run;
        pass ``None`` to get an unseeded forest.

    Returns
    -------
    tuple
        ``(score, predictions)`` where ``score`` is the fitted model's
        ``.score`` on the training data and ``predictions`` is a column
        vector (shape ``(n, 1)``) of predicted labels for ``test_features``.
    """
    # Without a seed the forest is rebuilt differently on every run, so the
    # score and the submitted predictions could not be reproduced.
    forest = RandomForestClassifier(max_depth=4, random_state=random_state)
    forest.fit(features, y)
    score = forest.score(features, y)
    predictions = forest.predict(test_features)
    return score, predictions.reshape(-1, 1)

In [130]:
# Read the test CSV.
test_data = pd.read_csv(r"./data/test.csv")

# Same five raw input columns as for training (there is no label column here).
test_subset = test_data.loc[:, ["Age", "Pclass", "Sex", "SibSp", "Embarked"]]

# Numeric feature matrix for the test rows.
test_features = preprocess_data(test_subset)

In [131]:
# Train the random forest and predict labels for the test feature matrix.
score, predictions = random_forest_prediction(test_features)
print(
    f"Score on training data: {score}",
    f"Number of predictions on test data: {len(predictions)}",
    sep="\n",
)

Score on training data: 0.8338945005611672
Number of predictions on test data: 418


In [132]:
# Passenger ids as a column vector so they can sit next to the predictions.
pids = test_data["PassengerId"].to_numpy(copy=True).reshape(-1, 1)
# Two-column table: id, predicted label.
submission = pd.DataFrame(
    np.concatenate((pids, predictions), axis=1),
    columns=['PassengerId', 'Survived'],
)
submission.to_csv('submission.csv', index=False)