#### importing necessary libraries

In [121]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer # for impuation = filling nan or missing values
from sklearn.preprocessing import LabelEncoder # for encoding categorical data

from sklearn.model_selection import train_test_split #for splitting data to train and test data

# algorithms to built models 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [122]:
# loading dataset

In [3]:
df=pd.read_csv(r'E:/ML/Datasets/titanic.csv')

In [123]:
# checking top 5 rows

In [4]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [124]:
#checking is there is any missing values

In [5]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [125]:
# getting insight of Dataset

In [126]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int64  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     891 non-null    int32  
 8   Embarked  891 non-null    int32  
dtypes: float64(2), int32(2), int64(5)
memory usage: 55.8 KB


In [127]:
df.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.647587,29.699118,0.523008,0.381594,32.204208,5.946128,1.536476
std,0.486592,0.836071,0.47799,13.002015,1.102743,0.806057,49.693429,2.062347,0.791503
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,22.0,0.0,0.0,7.9104,7.0,1.0
50%,0.0,3.0,1.0,29.699118,0.0,0.0,14.4542,7.0,2.0
75%,1.0,3.0,1.0,35.0,1.0,0.0,31.0,7.0,2.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,8.0,2.0


### Imputation

In [13]:
df['Cabin']=np.where(df['Cabin'].isnull(),'M',df['Cabin'])
df['Cabin']=df['Cabin'].apply(lambda x:x[0])
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,M,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,M,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,M,S


In [14]:
#handling age and emarked column

In [28]:
#simple imputer strategy = 'mean' default

In [25]:
si=SimpleImputer()
df['Age']=si.fit_transform(df[['Age']])

In [29]:
#simple imputer strategy = 'most_frequent'

In [30]:
simf = SimpleImputer(strategy='most_frequent')
df['Embarked']=simf.fit_transform(df[['Embarked']])

In [32]:
encode = LabelEncoder()
df['Sex']=encode.fit_transform(df['Sex'])
df['Cabin']=encode.fit_transform(df['Cabin'])
df['Embarked']=encode.fit_transform(df['Embarked'])

In [33]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,7,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,2,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,7,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,2,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,7,2


In [36]:
df.drop(columns=['PassengerId','Name','Ticket'],inplace=True)

In [37]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,7.25,7,2
1,1,1,0,38.0,1,0,71.2833,2,0
2,1,3,0,26.0,0,0,7.925,7,2
3,1,1,0,35.0,1,0,53.1,2,2
4,0,3,1,35.0,0,0,8.05,7,2


In [None]:
# splitting data to training and test data

In [38]:
xtrain,xtest,ytrain,ytest=train_test_split(df.iloc[:,1:],df.iloc[:,0],test_size=0.3,random_state=42)

In [39]:
xtrain.values

array([[  1.        ,   1.        ,   4.        , ...,  81.8583    ,
          0.        ,   2.        ],
       [  3.        ,   1.        ,  29.69911765, ...,   7.8958    ,
          7.        ,   2.        ],
       [  3.        ,   0.        ,   1.        , ...,  11.1333    ,
          7.        ,   2.        ],
       ...,
       [  3.        ,   1.        ,  41.        , ...,  14.1083    ,
          7.        ,   2.        ],
       [  1.        ,   0.        ,  14.        , ..., 120.        ,
          1.        ,   2.        ],
       [  1.        ,   1.        ,  21.        , ...,  77.2875    ,
          3.        ,   2.        ]])

### Standardization

In [42]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
xtrain_ss=scaler.fit_transform(xtrain)
xtest_ss=scaler.fit_transform(xtest)

### Normalizing Data

In [70]:
from sklearn.preprocessing import MinMaxScaler
MinMax=MinMaxScaler()
xtrain_mm=MinMax.fit_transform(xtrain)
xtest_mm=MinMax.fit_transform(xtest)

### Power Transformer

In [46]:
from sklearn.preprocessing import PowerTransformer
boxcox=PowerTransformer(method='box-cox')
xtrain_boxcox=boxcox.fit_transform(xtrain+0.00001)
xtest_boxcox=boxcox.fit_transform(xtest+0.000001)

##### LOGISTIC REGRESSION

In [79]:
from sklearn.linear_model import LogisticRegression
lor=LogisticRegression()

In [62]:
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,r2_score

In [72]:
# accuracy without scaling

In [74]:
lor.fit(xtrain,ytrain)
ypred = lor.predict(xtest)
accuracy_score(ypred,ytest)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8208955223880597

##### checking Logistic Regression accuracy on boxcox

In [86]:
# accuracy score boxcox
lor.fit(xtrain_boxcox,ytrain)
ypred_boxcox = lor.predict(xtest_boxcox)
accuracy_score(ytest,ypred_boxcox)

0.8171641791044776

##### checking Logistic Regression accuracy on standard scaling

In [85]:
# accuracy score standardScaler
lor.fit(xtrain_ss,ytrain)
ypred_scaler = lor.predict(xtest_ss)
accuracy_score(ytest,ypred_scaler)

0.8134328358208955

##### checking Logistic Regression accuracy on MinMax scaling

In [84]:
# accuracy score MinMax
lor.fit(xtrain_mm,ytrain)
ypred_mm = lor.predict(xtest_mm)
accuracy_score(ytest,ypred_mm)

0.8171641791044776

### Support Vector Machine

In [80]:
from sklearn.svm import SVC
svm=SVC()

In [81]:
# accuracy without scaling

In [83]:
svm.fit(xtrain,ytrain)
ypred = svm.predict(xtest)
accuracy_score(ytest,ypred)

0.6604477611940298

##### checking SVM accuracy on MinMax scaling

In [88]:
svm.fit(xtrain_mm,ytrain)
ypred_mm = svm.predict(xtest_mm)
accuracy_score(ytest,ypred_mm)

0.7835820895522388

##### checking SVM accuracy on Standard scaling

In [89]:
svm.fit(xtrain_ss,ytrain)
ypred_ss = svm.predict(xtest_ss)
accuracy_score(ytest,ypred_ss)

0.8171641791044776

##### checking SVM accuracy on Boxcox

In [90]:
svm.fit(xtrain_boxcox,ytrain)
ypred_boxcox = svm.predict(xtest_boxcox)
accuracy_score(ytest,ypred_boxcox)

0.8246268656716418

### Decision Tree

In [91]:
from sklearn.tree import DecisionTreeClassifier
dtc=DecisionTreeClassifier()

In [93]:
dtc.fit(xtrain,ytrain)
ypred = dtc.predict(xtest)
accuracy_score(ytest,ypred)

0.7574626865671642

##### checking Decision Tree accuracy on Boxcox

In [97]:
dtc.fit(xtrain_boxcox,ytrain)
ypred_boxcox = dtc.predict(xtest_boxcox)
accuracy_score(ytest,ypred_boxcox)

0.7649253731343284

##### checking Decision Tree accuracy on MimMax

In [98]:
dtc.fit(xtrain_mm,ytrain)
ypred_mm = dtc.predict(xtest_mm)
accuracy_score(ytest,ypred_mm)

0.7649253731343284

##### checking Decision Tree accuracy on Standard Scaler

In [99]:
dtc.fit(xtrain_ss,ytrain)
ypred_ss = dtc.predict(xtest_ss)
accuracy_score(ytest,ypred_ss)

0.7723880597014925

### Random Forest

In [100]:
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier()

In [101]:
# Without scaling

In [103]:
rfc.fit(xtrain,ytrain)
ypred= rfc.predict(xtest)
accuracy_score(ytest,ypred)

0.7910447761194029

##### checking Random Forest accuracy on Standard Scaler

In [104]:
rfc.fit(xtrain_ss,ytrain)
ypred_ss= rfc.predict(xtest_ss)
accuracy_score(ytest,ypred_ss)

0.7910447761194029

##### checking Random Forest accuracy on MinMax Scaling

In [105]:
rfc.fit(xtrain_mm,ytrain)
ypred_mm= rfc.predict(xtest_mm)
accuracy_score(ytest,ypred_mm)

0.7761194029850746

##### checking Random Forest accuracy on boxcox

In [106]:
rfc.fit(xtrain_boxcox,ytrain)
ypred_boxcox= rfc.predict(xtest_boxcox)
accuracy_score(ytest,ypred_boxcox)

0.7761194029850746