In [1]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

In [2]:
## read the train and test csv to dataframe using pandas.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

### Using Sklearn LabelEncoder() to transform the column

In [3]:
## apply One hot encoding to Sex column. 
le = LabelEncoder()
train_df_le = train_df
train_df_le.Sex = le.fit_transform(train_df_le.Sex)

In [4]:
## dataset describe
train_df_le.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.647587,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,0.47799,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,0.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,1.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,1.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,1.0,80.0,8.0,6.0,512.3292


### Filling NaN data with mean() for the column Age

In [5]:
## fill NaN with mean
train_df_le.Age.fillna(train_df_le.Age.mean(),inplace=True)

### Split the data in to train and test sets with test_size of 0.2

In [6]:
## spliting the dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_df_le.drop(['Survived','Name','Ticket','Fare','Cabin','Embarked'], axis="columns"), train_df_le['Survived'], test_size=0.2, random_state=0)



### Import model for training the dataset
Here the models which are used are imported from the sklearn python package. By using the score of the three model we can suggest which model is best fit.

In [7]:
## model fit and predict function which return the score of the model
def model_score(model, X_train=None, X_test=None, y_train=None, y_test=None):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model.score(X_test,y_test), y_pred

In [8]:
## decision Tree classification model
from sklearn.tree import DecisionTreeClassifier
model_dtc = DecisionTreeClassifier()
model_score_dtc, y_pred_dtc = model_score(model_dtc, X_train, X_test, y_train, y_test)
print("Decision Tree Model Score: " + str(model_score_dtc))
print("Precision: " + str(precision_score(y_test, y_pred_dtc)))
print("Accuracy: " + str(accuracy_score(y_test, y_pred_dtc)))

Decision Tree Model Score: 0.7932960893854749
Precision: 0.7666666666666667
Accuracy: 0.7932960893854749


In [9]:
## support vector machine model
from sklearn.svm import SVC
model_svc = SVC()
model_score_svm, y_pred_svm = model_score(model_svc, X_train, X_test, y_train, y_test)
print("SVM Model Score: " + str(model_score_svm))
print("Precision: " + str(precision_score(y_test, y_pred_svm)))
print("Accuracy: " + str(accuracy_score(y_test, y_pred_svm)))

SVM Model Score: 0.6145251396648045
Precision: 0.0
Accuracy: 0.6145251396648045


  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
## random forest classification model
from sklearn.ensemble import RandomForestClassifier
model_rfc = RandomForestClassifier(n_estimators=150)
model_score_rfc, y_pred_rfc = model_score(model_rfc, X_train, X_test, y_train, y_test)
print("RandomForest Model Score: " + str(model_score_rfc))
print("Precision: " + str(precision_score(y_test, y_pred_rfc)))
print("Accuracy: " + str(accuracy_score(y_test, y_pred_rfc)))

RandomForest Model Score: 0.8324022346368715
Precision: 0.819672131147541
Accuracy: 0.8324022346368715


#### From the above three models the score, precision and accurary, It can be implied that random forest classification has the best fit model for the data given, since it is binary classification problem. Although decision tree classifier performed better, the score of that model is less compared to random forest classification.

## Testing with test dataset without Survived column

In [11]:
## transforming the Sex column using label encoder.
test_df_le = test_df
test_df_le.Sex = le.fit_transform(test_df_le.Sex)
X_test_df = test_df_le.drop(['Name','Ticket','Fare','Cabin','Embarked'], axis="columns")

In [12]:
## removing the NaN data in Age column and fill with mean
X_test_df.isna().any()
X_test_df.Age.fillna(X_test_df.Age.mean(), inplace=True)

In [13]:
## predict the survived using the test dataset
X_test_df['Survived']=y_predict=model_rfc.predict(X_test_df)

In [14]:
## writing the data to CSV for kaggle submission.
X_test_df.drop(['Sex','Pclass','Age','SibSp','Parch'],axis='columns').to_csv('gender_submission.csv', index=False)