In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
%matplotlib notebook

In [5]:
df = pd.read_csv("train.csv")

In [6]:
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [7]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

## Data exploration

In [8]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


We have lots of null values in cabin and a few in age column

In [10]:
df.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [11]:
survived = df[df['Survived'] == 1]

### Sex

In [12]:
#Checking if gender has any role in survival chances 
gender_survived = survived.groupby("Sex")['Sex'].count()
gender_total = df.groupby("Sex")['Sex'].count()

survived_percent = gender_survived/gender_total
survived_percent

Sex
female    0.742038
male      0.188908
Name: Sex, dtype: float64

In [13]:
plt.figure()
sns.countplot(x = "Survived", hue = 'Sex', data = df)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f596bda1790>

74.2% Women survived and only 18.8% men survived

### Embarked

In [14]:
#Embarked column tells us where the passenger boarded the boat. Checking whether it has
#any effect on survival chances
embark_survived = survived.groupby("Embarked")['Embarked'].count()
embark_total = df.groupby('Embarked')["Embarked"].count()

survived_percent_embark = embark_survived/embark_total
survived_percent_embark

Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Embarked, dtype: float64

In [15]:
plt.figure()
sns.countplot(x = 'Survived', hue = 'Embarked', data = df)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f596bdd2970>

So embarked column has a little effect on the survival chances:
        
    C = Cherbourg = 55.3% chance
    Q = Queenstown = 38.9% chance
    S = Southampton = 33.69% chance

### Cabin

In [16]:
#Cabin column has around 687 null values and only 204 values present, it will be practical
#to drop this feature as it will hamper the performance of the model
df.drop('Cabin', axis = 1, inplace = True)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


Source:

(Video)Handling Missing Data by Krish Naik 

(https://www.youtube.com/watch?v=P_iMSYQnqac&t=1260s)

### Pclass

In [17]:
pclass_survived = survived.groupby("Pclass")['Pclass'].count()
pclass_total = df.groupby('Pclass')["Pclass"].count()

survived_percent_pclass = pclass_survived/pclass_total
survived_percent_pclass

Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Pclass, dtype: float64

In [18]:
plt.figure()
sns.countplot(x = 'Survived', hue = 'Pclass', data = df)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f596850daf0>

As we see, Pclass 3 passengers had low chance of survival, while Pclass 1 passengers had high chance.

So wealth status played a huge role in Survival Chances

### SibSp

In [19]:
df["SibSp"].head()

0    1
1    1
2    0
3    1
4    0
Name: SibSp, dtype: int64

In [20]:
#SibSp is a discrete variable which shows the number of siblings or spouses passengers had

In [21]:
plt.figure()
plt.hist(df["SibSp"])
plt.title("SibSp distribution")

<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'SibSp distribution')

Right Skewed data...will require normalization

### Parch

In [22]:
df["Parch"].head(15)

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     1
8     2
9     0
10    1
11    0
12    0
13    5
14    0
Name: Parch, dtype: int64

In [23]:
#Parch is a discrete variable showing no of Parents accompaning the children or 
#children accompanying parents

In [24]:
plt.figure()
plt.hist(df["Parch"])
plt.title("Parch distribution")

<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'Parch distribution')

Highly right skewed data, will require normalization

### Ticket

In [25]:
df["Ticket"].head()

0           A/5 21171
1            PC 17599
2    STON/O2. 3101282
3              113803
4              373450
Name: Ticket, dtype: object

In [26]:
len(df["Ticket"].unique())

681

In [27]:
map_dict = df["Ticket"].value_counts().to_dict()
map_dict

{'CA. 2343': 7,
 '347082': 7,
 '1601': 7,
 'CA 2144': 6,
 '3101295': 6,
 '347088': 6,
 'S.O.C. 14879': 5,
 '382652': 5,
 '347077': 4,
 '17421': 4,
 '4133': 4,
 'LINE': 4,
 '113760': 4,
 'W./C. 6608': 4,
 '349909': 4,
 '113781': 4,
 '19950': 4,
 'PC 17757': 4,
 '2666': 4,
 '35273': 3,
 '29106': 3,
 '371110': 3,
 'SC/Paris 2123': 3,
 '230080': 3,
 'C.A. 34651': 3,
 'C.A. 31921': 3,
 '363291': 3,
 '347742': 3,
 '345773': 3,
 '13502': 3,
 '239853': 3,
 'PC 17582': 3,
 '110152': 3,
 'PC 17760': 3,
 '248727': 3,
 '110413': 3,
 'PC 17572': 3,
 'PC 17755': 3,
 'F.C.C. 13529': 3,
 '24160': 3,
 '244252': 2,
 '28403': 2,
 '244367': 2,
 'C.A. 37671': 2,
 'PC 17608': 2,
 '250644': 2,
 '2651': 2,
 '113572': 2,
 'C.A. 33112': 2,
 '376564': 2,
 '17453': 2,
 '2691': 2,
 'PC 17761': 2,
 '364849': 2,
 '230136': 2,
 '7534': 2,
 '113806': 2,
 'PC 17611': 2,
 '2659': 2,
 '243847': 2,
 '113803': 2,
 '3101278': 2,
 '347054': 2,
 'P/PP 3381': 2,
 '2627': 2,
 '113798': 2,
 '26360': 2,
 'C.A. 2315': 2,
 '31027':

As we have 681 unique values in Ticket column, we will be keeping only top 10 frequent categories, and labelling the others in "other" column.

Source:
Kaggle: How to Handle large number of categorical values
(https://www.kaggle.com/getting-started/37489)

code credits:
(Video)Featuring Engineering- Handle Categorical Features Many Categories by Krish Naik 
(https://www.youtube.com/watch?v=MPnNC6kkNC4)

In [28]:
top_dict = df["Ticket"].value_counts().head(10).to_dict()
top_list = [x for x in top_dict]
top_list

['CA. 2343',
 '347082',
 '1601',
 'CA 2144',
 '3101295',
 '347088',
 'S.O.C. 14879',
 '382652',
 '347077',
 '17421']

In [29]:
for i in top_list:
    df[i] = np.where(df["Ticket"]==i,1,0)
    

In [30]:
def ticket_other(row):
    if row["Ticket"] not in top_list:
        row["other"] = 1
    elif row["Ticket"] in top_list:
        row["other"] = 0
    else:
        print("Not working")
    return row
df = df.apply(ticket_other, axis = 1)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,347082,1601,CA 2144,3101295,347088,S.O.C. 14879,382652,347077,17421,other
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,0,0,0,0,0,0,0,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,0,0,0,0,0,0,0,0,0,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,0,0,0,0,0,0,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,0,0,0,0,0,0,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,0,0,0,0,0,0,0,0,0,1


### Age

In [31]:
df["Age"].head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64

In [32]:
plt.figure().set_size_inches(8, 4)
plt.subplot(1,2,1)
sns.distplot(a = df["Age"].dropna(), kde = True).set(yticklabels=[])
plt.title("Age distribution of all people")
plt.subplot(1,2,2)
sns.distplot(a = survived["Age"].dropna(), kde = True).set(yticklabels=[])
plt.title("Age distribution of survived people")

<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'Age distribution of survived people')

This is a normal distribution. There is no need for normalization.
Scaling may be required

we have multiple missing values in the column, as this follows a normal distribution, we can replace the nan values with the mean or median

Now checking whether Age has correlation with any other variable:

In [33]:
df_age = df[["Age", "Fare", "Pclass", "Sex"]]

In [34]:
plt.figure()
sns.heatmap(df_age.corr(), annot = True)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f596831e700>

unclear

In [35]:
plt.figure()
sns.boxplot(y = 'Age', x = "Pclass", data = df_age)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f59681aefd0>

In [36]:
plt.figure()
sns.boxplot(y = 'Age', x = "Sex", data = df_age)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f59681e6160>

In [37]:
#As we see, we can easily distinguish the median age for Pclass each pclass,
#so replacing age with the medians
pclass_median = df_age.groupby("Pclass")["Age"].agg(np.median)
pclass_median

Pclass
1    37.0
2    29.0
3    24.0
Name: Age, dtype: float64

In [38]:
def age_replace(row):
    if row["Pclass"] == 1:
        row["Age"] = 37
    elif row["Pclass"] == 2:
        row["Age"] = 29
    elif row["Pclass"] == 3:
        row["Age"] = 24
    return row
df = df.apply(age_replace, axis = 1)

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   891 non-null    int64  
 1   Survived      891 non-null    int64  
 2   Pclass        891 non-null    int64  
 3   Name          891 non-null    object 
 4   Sex           891 non-null    object 
 5   Age           891 non-null    int64  
 6   SibSp         891 non-null    int64  
 7   Parch         891 non-null    int64  
 8   Ticket        891 non-null    object 
 9   Fare          891 non-null    float64
 10  Embarked      889 non-null    object 
 11  CA. 2343      891 non-null    int64  
 12  347082        891 non-null    int64  
 13  1601          891 non-null    int64  
 14  CA 2144       891 non-null    int64  
 15  3101295       891 non-null    int64  
 16  347088        891 non-null    int64  
 17  S.O.C. 14879  891 non-null    int64  
 18  382652        891 non-null    

## Data Normalization

## Handling Categorical Data

### Section1: Categorical data

First, we need to drop column "Name" 

In [40]:
df = df.drop("Name", axis =1)

#### Sex : One Hot encoding

In [41]:
df_sex = pd.get_dummies(df["Sex"])
df = pd.concat([df_sex, df], axis = 1)
df.drop("Sex",axis = 1, inplace = True)

#### Pclass: One hot encoding

In [42]:
df_pclass = pd.get_dummies(df["Pclass"])
df = pd.concat([df_pclass, df], axis = 1)
df.drop("Pclass",axis = 1, inplace = True)

### Embarked: one hot ncoding

In [43]:
df_embarked = pd.get_dummies(df["Embarked"])
df = pd.concat([df_embarked, df], axis = 1)
df.drop("Embarked",axis = 1, inplace = True)

#### Tickets : Already done 

### Section 2 : Normalization

#### SibSp

Normalization wasn't possible

#### Parch

Normalization not possible

#### Fare

In [44]:
plt.figure().set_size_inches(8,4)
plt.subplot(1,2,1)
sns.distplot(a = df["Fare"], kde = True)
plt.title("Fare Distribution before normalization")
df["Log_fare"] = np.log(df["Fare"] + 1)
plt.subplot(1,2,2)
sns.distplot(a = df["Log_fare"], kde = True)
plt.title("Fare Distribution after normalization")


<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'Fare Distribution after normalization')

Tried square root transformation, exponential transformation, boxcox, got best results from log transform

## Data Preprocessing

In [45]:
df.columns

Index([           'C',            'Q',            'S',              1,
                    2,              3,       'female',         'male',
        'PassengerId',     'Survived',          'Age',        'SibSp',
              'Parch',       'Ticket',         'Fare',     'CA. 2343',
             '347082',         '1601',      'CA 2144',      '3101295',
             '347088', 'S.O.C. 14879',       '382652',       '347077',
              '17421',        'other',     'Log_fare'],
      dtype='object')

In [46]:
df.head(10)

Unnamed: 0,C,Q,S,1,2,3,female,male,PassengerId,Survived,...,1601,CA 2144,3101295,347088,S.O.C. 14879,382652,347077,17421,other,Log_fare
0,0,0,1,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,1,2.110213
1,1,0,0,1,0,0,1,0,2,1,...,0,0,0,0,0,0,0,0,1,4.280593
2,0,0,1,0,0,1,1,0,3,1,...,0,0,0,0,0,0,0,0,1,2.188856
3,0,0,1,1,0,0,1,0,4,1,...,0,0,0,0,0,0,0,0,1,3.990834
4,0,0,1,0,0,1,0,1,5,0,...,0,0,0,0,0,0,0,0,1,2.202765
5,0,1,0,0,0,1,0,1,6,0,...,0,0,0,0,0,0,0,0,1,2.246893
6,0,0,1,1,0,0,0,1,7,0,...,0,0,0,0,0,0,0,0,1,3.967694
7,0,0,1,0,0,1,0,1,8,0,...,0,0,0,0,0,0,0,0,1,3.094446
8,0,0,1,0,0,1,1,0,9,1,...,0,0,0,0,0,0,0,0,1,2.495954
9,1,0,0,0,1,0,1,0,10,1,...,0,0,0,0,0,0,0,0,1,3.436268


In [47]:
df.drop(['PassengerId','Ticket','Fare'], axis = 1, inplace = True)

In [48]:
y = df["Survived"]
X = df.drop("Survived", axis = 1)

In [49]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.25)

In [50]:
from sklearn.preprocessing import MinMaxScaler
scale = MinMaxScaler()
X_train_scaled = scale.fit_transform(X_train)
X_test_scaled = scale.fit_transform(X_test)

### Model 1: Decision Trees

In [70]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier().fit(X_train,y_train)

In [71]:
from sklearn.metrics import accuracy_score
y_predict = dt_clf.predict(X_test)
accuracy_DT = accuracy_score(y_test, y_predict)
accuracy_DT

0.8385650224215246

In [72]:
cv = cross_val_score(dt_clf,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.78358209 0.76865672 0.7761194  0.76691729 0.81954887]
0.7829648748737515


### Model 2: KNN

In [64]:
from sklearn.neighbors import KNeighborsClassifier
max_acc = 0
k = 0
for i in range(1,50,2):
    knn_clf = KNeighborsClassifier(n_neighbors = i).fit(X_train,y_train)
    y_predict = knn_clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_predict)
    if accuracy>max_acc:
        max_acc = accuracy
        k = i
    print("Accuracy score of k = {} is {}".format(i, accuracy))
print("Max accuracy score is {} for k = {}".format(max_acc, k))
    

Accuracy score of k = 1 is 0.7757847533632287
Accuracy score of k = 3 is 0.7802690582959642
Accuracy score of k = 5 is 0.8161434977578476
Accuracy score of k = 7 is 0.8385650224215246
Accuracy score of k = 9 is 0.8071748878923767
Accuracy score of k = 11 is 0.8071748878923767
Accuracy score of k = 13 is 0.8116591928251121
Accuracy score of k = 15 is 0.8251121076233184
Accuracy score of k = 17 is 0.820627802690583
Accuracy score of k = 19 is 0.8295964125560538
Accuracy score of k = 21 is 0.8340807174887892
Accuracy score of k = 23 is 0.8295964125560538
Accuracy score of k = 25 is 0.8251121076233184
Accuracy score of k = 27 is 0.8251121076233184
Accuracy score of k = 29 is 0.8251121076233184
Accuracy score of k = 31 is 0.8251121076233184
Accuracy score of k = 33 is 0.8071748878923767
Accuracy score of k = 35 is 0.8116591928251121
Accuracy score of k = 37 is 0.8071748878923767
Accuracy score of k = 39 is 0.8026905829596412
Accuracy score of k = 41 is 0.8071748878923767
Accuracy score of k

The highest score is 0.83 for k = 7

In [80]:
knn_clf = KNeighborsClassifier(n_neighbors = 7).fit(X_train,y_train)
cv = cross_val_score(knn_clf,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.7761194  0.76119403 0.82089552 0.7593985  0.78947368]
0.7814162271350018


### Model 3: Logistic Regression

In [56]:
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(max_iter = 1000).fit(X_train,y_train)
y_predict = lr_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.8026905829596412


In [57]:
cv = cross_val_score(lr_clf,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.7761194  0.75373134 0.78358209 0.76691729 0.82706767]
0.7814835596453821


### Model 4: SVC

In [58]:
from sklearn.svm import SVC
svc_clf = SVC(kernel = 'linear').fit(X_train,y_train)
y_predict = svc_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.8071748878923767


In [59]:
cv = cross_val_score(svc_clf,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.79104478 0.75373134 0.79104478 0.81954887 0.83458647]
0.7979912467736505


In [60]:
from sklearn.svm import SVC
svc_clf = SVC(kernel = 'rbf').fit(X_train,y_train)
y_predict = svc_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.7130044843049327


In [61]:
cv = cross_val_score(svc_clf,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.64179104 0.64179104 0.70895522 0.64661654 0.69924812]
0.6676803950173942


### Model 6: Random Forest

In [62]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier().fit(X_train, y_train)
y_predict = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.8295964125560538


In [63]:
cv = cross_val_score(rf_clf,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.76119403 0.7761194  0.79850746 0.7443609  0.79699248]
0.7754348557962069


Therefore the best models for predicting are Decision Trees (0.838), KNN classifier with k = 7(0.838) and Random Forest(0.829)

Second best is Logistic regression(0.802) and SVC with linear kernel(0.807)

In [78]:
from sklearn.model_selection import GridSearchCV 
lr = LogisticRegression()
param_grid = {'max_iter' : [2000],
              'penalty' : ['l1', 'l2'],
              'C' : np.logspace(-4, 4, 20),
              'solver' : ['liblinear', 'sag']}

lr_clf = GridSearchCV(lr, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_lr_clf = lr_clf.fit(X_train_scaled,y_train)
print('Best Score: ' + str(best_lr_clf.best_score_))
print('Best Parameters: ' + str(best_lr_clf.best_params_))


Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 tasks      | elapsed:    0.1s


Best Score: 0.8039277297721916
Best Parameters: {'C': 0.012742749857031334, 'max_iter': 2000, 'penalty': 'l2', 'solver': 'liblinear'}


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.1s finished


In [79]:
y_predict = best_lr_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.35874439461883406


In [75]:
svc = SVC(probability = False)
param_grid = tuned_parameters = [{'kernel': ['rbf'], 'gamma': [.1,.5,1,2,5,10],
                                  'C': [.1, 1, 10, 100, 1000]},
                                 {'kernel': ['linear'], 'C': [.1, 1, 10, 100, 1000]},
                                 ]
svc_clf = GridSearchCV(svc, param_grid = param_grid, cv = 3, verbose = True, n_jobs = -1)
best_svc_clf = svc_clf.fit(X_train_scaled,y_train)
print('Best Score: ' + str(best_svc_clf.best_score_))
print('Best Parameters: ' + str(best_svc_clf.best_params_))

Fitting 3 folds for each of 35 candidates, totalling 105 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  94 out of 105 | elapsed:    0.8s remaining:    0.1s


Best Score: 0.8009736193592696
Best Parameters: {'C': 10, 'kernel': 'linear'}


[Parallel(n_jobs=-1)]: Done 105 out of 105 | elapsed:    5.7s finished


In [76]:
y_predict = best_svc_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.8026905829596412


we will be choosing Random Forest for our model