In [1]:
#Load Train dataset
import pandas as pd
train_data = pd.read_csv("https://raw.githubusercontent.com/AtharvaKalsekar/Kaggle/master/titanic/train.csv")
print(train_data.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [0]:
print(train_data.dtypes)

In [2]:
train_data.drop(columns=["PassengerId","Name","Ticket","Cabin","Embarked"],inplace=True)
print(train_data.dtypes)

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
dtype: object


In [3]:
#Check for missing values
train_data.isnull().any()

Survived    False
Pclass      False
Sex         False
Age          True
SibSp       False
Parch       False
Fare        False
dtype: bool

In [0]:
# Get the numpy arrays for independent and target variables
import numpy as np
X = train_data.iloc[:,1:].values
y = train_data.iloc[:,0].values

In [5]:
print(X.shape)
print(y.shape)

(891, 6)
(891,)


In [0]:
# Label encoding the strings
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X[:,1] = le.fit_transform(X[:,1])

In [7]:
#treat missing values for columns obtained from dataframe.isnull().any()
from fancyimpute import KNN
impt=KNN(k=5)
X=impt.fit_transform(X)

Using TensorFlow backend.


Imputing row 1/891 with 0 missing, elapsed time: 0.136
Imputing row 101/891 with 0 missing, elapsed time: 0.138
Imputing row 201/891 with 0 missing, elapsed time: 0.139
Imputing row 301/891 with 1 missing, elapsed time: 0.140
Imputing row 401/891 with 0 missing, elapsed time: 0.141
Imputing row 501/891 with 0 missing, elapsed time: 0.142
Imputing row 601/891 with 0 missing, elapsed time: 0.143
Imputing row 701/891 with 0 missing, elapsed time: 0.144
Imputing row 801/891 with 0 missing, elapsed time: 0.145


In [0]:
#scale the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

In [0]:
#Spliting train and validation sets
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size = 0.25, random_state=0, stratify=y)

In [10]:
print(X_train.shape)
print(y_train.shape)

(623, 6)
(623,)


In [0]:
#PCA
from sklearn.decomposition import KernelPCA
pca = KernelPCA(n_components=4, kernel="rbf")
X_train = pca.fit_transform(X_train)
X_val = pca.transform(X_val)

In [42]:
print(X_train.shape)
print(X_val.shape)

(668, 4)
(223, 4)


In [0]:
#load model KNN
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors = 5, p = 2, metric = "minkowski")

In [0]:
#load model naive bayes
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [0]:
#load model random forest
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators = 20, criterion = 'entropy', random_state = 0)

In [47]:
#fit model
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [50]:
#Evaluate model
from sklearn.model_selection import cross_val_score
acc = cross_val_score(estimator = model, X = X_train, y = y_train, cv=10)
print(acc.mean())

0.7677089153164658


In [0]:
#Processing test data
test_data = pd.read_csv("https://raw.githubusercontent.com/AtharvaKalsekar/Kaggle/master/titanic/test.csv")
print(test_data.dtypes)

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [0]:
testset_ids = test_data[["PassengerId"]]
testset = test_data.drop(columns = ["PassengerId","Name","Ticket","Cabin","Embarked"])

In [0]:
testset.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [0]:
testset.describe

In [0]:
#check for missing values
testset.isnull().any()

Pclass    False
Sex       False
Age        True
SibSp     False
Parch     False
Fare       True
dtype: bool

In [0]:
#Get the numpy arrays for test set
testset = testset.iloc[:,:].values
print(testset.shape)

(418, 6)


In [0]:
#label encode test set
testset[:,1] = le.transform(testset[:,1])

In [0]:
#treat missing values
testset = impt.fit_transform(testset)

In [0]:
#feature scaling for test set
testset = sc.transform(testset)

In [0]:
#PCA for test set
testset = pca.transform(testset)

In [0]:
#predict
testset_pred = model.predict(testset)

In [0]:
#storing the result
result_df=pd.DataFrame(testset_pred)
result_df=pd.concat([testset_ids,result_df], axis=1,ignore_index=True)
print(result_df)

In [0]:
result_df.to_csv('out.csv')