In [839]:
import pandas as pd
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV

In [840]:
df_train = pd.read_csv('titanic_train.csv')

In [841]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [842]:
df_train = df_train.drop('Name', axis=1,)
df_train = df_train.drop('Ticket', axis=1,)
df_train = df_train.drop('Fare', axis=1,)
df_train = df_train.drop('Cabin', axis=1,)

In [843]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex             object
Age            float64
SibSp            int64
Parch            int64
Embarked        object
dtype: object

In [844]:
df_train['Family'] = df_train['SibSp'] + df_train['Parch'] + 1

In [845]:
df_train = df_train.drop('SibSp', axis=1,)
df_train = df_train.drop('Parch', axis=1,)

In [846]:
df_train["Age"].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [847]:
def check_nans():
    feat_list = list(df_train.columns.values)

    for feat in feat_list:
        print (feat,": ",sum(pd.isnull(df_train[feat])))

check_nans()

PassengerId :  0
Survived :  0
Pclass :  0
Sex :  0
Age :  177
Embarked :  2
Family :  0


In [848]:
df_train["Age"] = df_train["Age"].fillna(df_train["Age"].median())

In [849]:
df_train["Embarked"].mode()

0    S
Name: Embarked, dtype: object

In [850]:
df_train["Embarked"] = df_train["Embarked"].fillna("S")

In [851]:
check_nans()

PassengerId :  0
Survived :  0
Pclass :  0
Sex :  0
Age :  0
Embarked :  0
Family :  0


In [852]:
df_train["Age"].describe()

count    891.000000
mean      29.361582
std       13.019697
min        0.420000
25%       22.000000
50%       28.000000
75%       35.000000
max       80.000000
Name: Age, dtype: float64

In [853]:
df_train["Adult"] = 0

In [854]:
df_train["Adult"][df_train["Age"] >= 18] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["Adult"][df_train["Age"] >= 18] = 1


In [855]:
print ("Passengers more than 18 yrs old: ",str(len(df_train[df_train["Age"] >= 18])))
print ("Number of Adults: ",str(len(df_train[df_train["Adult"] >= 1])))

Passengers more than 18 yrs old:  778
Number of Adults:  778


In [856]:
df_train = df_train.drop('Age', axis=1,)

In [857]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Embarked,Family,Adult
0,1,0,3,male,S,2,1
1,2,1,1,female,C,2,1
2,3,1,3,female,S,1,1
3,4,1,1,female,S,2,1
4,5,0,3,male,S,1,1


In [858]:
df1 = df_train.filter(['Pclass','Sex','Embarked','Family','Adult'], axis=1)
X = df1

In [859]:
df2 = df_train['Survived']
y = df2

In [860]:
X["Embarked"].unique()

array(['S', 'C', 'Q'], dtype=object)

In [861]:
X["Sex"][df_train["Sex"] == "male"] = 1
X["Sex"][df_train["Sex"] == "female"] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Sex"][df_train["Sex"] == "male"] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Sex"][df_train["Sex"] == "female"] = 2


In [862]:
X = X.drop('Embarked', axis=1,)
X = X.drop('Family', axis=1,)

In [863]:
features_train, features_test, labels_train, labels_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [864]:
features_train.shape

(623, 3)

In [865]:
features_test.shape

(268, 3)

In [866]:
labels_train.shape

(623,)

In [867]:
labels_test.shape

(268,)

In [868]:
X.head()

Unnamed: 0,Pclass,Sex,Adult
0,3,1,1
1,1,2,1
2,3,2,1
3,1,2,1
4,3,1,1


In [869]:
knn = KNeighborsClassifier()
k_range = list(range(1,10))
weights_options = ['uniform','distance']
k_grid = dict(n_neighbors=k_range, weights = weights_options)
grid = GridSearchCV(knn, k_grid, cv=10, scoring = 'precision')
grid.fit(features_train, labels_train)

In [870]:
grid.cv_results_

{'mean_fit_time': array([0.00505633, 0.00279832, 0.00312414, 0.00284111, 0.00294409,
        0.00299919, 0.00341229, 0.00327244, 0.00386822, 0.00417221,
        0.00433264, 0.00418727, 0.00351007, 0.00377986, 0.0033612 ,
        0.00264082, 0.0026484 , 0.00263736]),
 'std_fit_time': array([0.00208304, 0.00011866, 0.00022046, 0.00020152, 0.00028852,
        0.00046006, 0.00073465, 0.0008931 , 0.00049744, 0.00032322,
        0.00023961, 0.00049724, 0.00062357, 0.00062309, 0.00065443,
        0.00017934, 0.00022422, 0.00025518]),
 'mean_score_time': array([0.00961781, 0.00355256, 0.00662117, 0.00366671, 0.0059056 ,
        0.00403533, 0.00735035, 0.00409386, 0.00865421, 0.00546942,
        0.00967765, 0.00523703, 0.00765064, 0.0050215 , 0.00747581,
        0.00343215, 0.00552523, 0.00345006]),
 'std_score_time': array([0.0041459 , 0.00037469, 0.00035043, 0.00023583, 0.00030068,
        0.00070233, 0.00149557, 0.00083462, 0.0012223 , 0.00056242,
        0.00037835, 0.00095495, 0.00134034, 

In [871]:
print ("Best Score: ",str(grid.best_score_))
print ("Best Parameters: ",str(grid.best_params_))
print ("Best Estimators: ",str(grid.best_estimator_))

Best Score:  0.8998137973137974
Best Parameters:  {'n_neighbors': 8, 'weights': 'distance'}
Best Estimators:  KNeighborsClassifier(n_neighbors=8, weights='distance')


In [872]:
label_pred = grid.predict(features_test)

In [873]:
acc_clf = metrics.accuracy_score(labels_test,label_pred)
print ("classifier's accuracy: ",str(acc_clf) )

classifier's accuracy:  0.7910447761194029


In [874]:
scr_clf = precision_recall_fscore_support(labels_test,label_pred, average='weighted')

print ("classifier's precision: ",str(scr_clf[0]))
print ("classifier's recall: ",str(scr_clf[1]))
print ("classifier's fbeta_score: ",str(scr_clf[2]))

classifier's precision:  0.8077389658661381
classifier's recall:  0.7910447761194029
classifier's fbeta_score:  0.7807467111084891


In [875]:
labels_pred = grid.predict(features_test)
labels_pred

array([0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0])

In [876]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(labels_test, labels_pred)
print('Confusion matrix\n\n', cm)

Confusion matrix

 [[148   9]
 [ 47  64]]


In [877]:
from sklearn import metrics
print(metrics.accuracy_score(labels_test, labels_pred))
print(metrics.precision_score(labels_test, labels_pred))
print(metrics.recall_score(labels_test, labels_pred))
print(metrics.f1_score(labels_test, labels_pred))

0.7910447761194029
0.8767123287671232
0.5765765765765766
0.6956521739130436


In [878]:
df_test = pd.read_csv('titanic.csv')
final_y = df_train['Survived']
df_test.dtypes

pclass       float64
survived     float64
name          object
sex           object
age          float64
sibsp        float64
parch        float64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [879]:
df_test.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479,160.809917
std,0.837836,0.486055,14.4135,1.041658,0.86556,51.758668,97.696922
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0


In [880]:
# test_set = df_test.groupby(['Pclass', 'Sex', 'Age'])
test_set = df_test[['pclass', 'sex', 'age']].copy()

In [881]:
# x_test_main.head()

In [882]:
test_set.columns.values

array(['pclass', 'sex', 'age'], dtype=object)

In [883]:
def check_nans_test():
    test_list = list(test_set.columns.values)
    for test_item in test_list:
        print (test_item,": ",sum(pd.isnull(test_set[test_item])))

check_nans_test()

pclass :  1
sex :  1
age :  264


In [884]:
test_set["sex"][test_set["sex"] == "male"] = 1
test_set["sex"][test_set["sex"] == "female"] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set["sex"][test_set["sex"] == "male"] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set["sex"][test_set["sex"] == "female"] = 2


In [885]:
test_set["age"] = test_set["age"].fillna(test_set["age"].median())
test_set["pclass"] = test_set["pclass"].fillna(test_set["pclass"].median())
test_set["sex"] = test_set["sex"].fillna(test_set["sex"].median())

In [886]:
test_set["Adult"] = 0
test_set["Adult"][test_set["age"] >= 18] = 1
test_set = test_set.drop('age', axis=1,)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set["Adult"][test_set["age"] >= 18] = 1


In [887]:
test_set.head()

Unnamed: 0,pclass,sex,Adult
0,1.0,2.0,1
1,1.0,1.0,0
2,1.0,2.0,0
3,1.0,1.0,1
4,1.0,2.0,1


In [888]:
check_nans_test()

pclass :  0
sex :  0
Adult :  0


In [889]:
final_pred = grid.predict(test_set)
final_pred

Feature names unseen at fit time:
- pclass
- sex
Feature names seen at fit time, yet now missing:
- Pclass
- Sex



array([1, 1, 1, ..., 0, 0, 0])

In [890]:
len(final_pred)

1310