In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB


def import_data(data):
    _filedir = 'data/titanic/'
    _filename = data
    _file = _filedir + _filename + '.csv'
    return _file


def calc_and_fill(dataframe):
    # Calculate mean age of passengers
    calc_mean_age = dataframe['Age'].mean()
    # print(calc_mean_age)
    # print(get_mean)
    dataframe['Age'] = dataframe['Age'].fillna(value=calc_mean_age)
    return dataframe


def get_missing_data(dataframe):
    get_null_data = dataframe.isnull().sum().sort_values(ascending=False)
    # calculate the percentage
    calc_percent = dataframe.isnull().sum() / dataframe.isnull().count() * 100
    # round values sort not ascending
    calc_percent = (round(calc_percent, 1)).sort_values(ascending=False)
    # get boolean values != 0
    criteria = calc_percent != 0
    # concat get_null_data + calc_percent, show row 'Total' & '&'
    missing_data = pd.concat([get_null_data, calc_percent],
                             axis=1,
                             keys=['Total', '%'])
    # show missing_data where with True boolean
    return missing_data[criteria]


def convert_bool_2_int(dataframe):
    d = {'female': 0, 'male': 1}
    #dataframe = dataframe.copy()
    # fill female & male strings
    dataframe['Sex'] = dataframe['Sex'].map(d)
    return dataframe


# Check dataframe for non Int / Float values
def show_nonint(dataframe):
    obj_df = train_df.select_dtypes(include=['object']).copy()
    return obj_df.head()


def rand__forest(n_est, m_depth):
    m = RandomForestClassifier(n_estimators=n_est,
                               max_depth=m_depth,
                               criterion="gini")
    m.fit(Xtrain, ytrain.values.ravel())
    score = m.score(Xtrain, ytrain)
    print(score)


def log_reg():
    m = LogisticRegression(solver='lbfgs', max_iter=400)
    m.fit(Xtrain, ytrain.values.ravel())
    print("train score:", m.score(Xtrain, ytrain))
    print("test score:", m.score(Xtest, ytest))


def acc_score_train():
    m.fit(Xtrain, ytrain.values.ravel())

    ypred = m.predict(Xtrain)
    #print(ypred)
    print('accuracy score: ', accuracy_score(ytrain, ypred))
    print('precision score:', precision_score(ytrain, ypred))
    print('recall score:   ', recall_score(ytrain, ypred))


def acc_score_test():
    m.fit(Xtest, ytest.values.ravel())
    ypred = m.predict(Xtest)
    print('accuracy score: ', accuracy_score(ytest, ypred))
    print('precision score:', precision_score(ytest, ypred))
    print('recall score:   ', recall_score(ytest, ypred))

#### Import csv file

In [2]:
#import data via pandas read csv
train_df = pd.read_csv(import_data('train'))
#test_df = pd.read_csv(import_data('test'))


#### Show dataframe info

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


#### Show dataframe summary

In [4]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


#### Show missing data in %

In [5]:
get_missing_data(train_df)

Unnamed: 0,Total,%
Cabin,687,77.1
Age,177,19.9
Embarked,2,0.2


#### Check df for non int / float values


In [6]:
show_nonint(train_df)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


#### Extract Salutations / create Column 'Title' & add Salutation

In [7]:
# Kaggle Source- https://www.kaggle.com/ash316/eda-to-prediction-dietanic
train_df['Title'] = 0
train_df['Title'] = train_df.Name.str.extract(
   '([A-Za-z]+)\.')  #lets extract the Salutations

In [8]:
#define dict for female and male (replace str by inr)
d = {'female': 0, 'male': 1}

# Assign dict d to column 'Sex'
train_df['Sex'] = train_df['Sex'].map(d)

#define dict for title
title = {
    'Mr': 0,
    'Mrs': 1,
    'Miss': 2,
    'Master': 3,
    'Don': 4,
    'Rev': 5,
    'Dr': 6,
    'Mme': 7,
    'Ms': 8,
    'Major': 9,
    'Lady': 10,
    'Sir': 11,
    'Mlle': 12,
    'Col': 13,
    'Capt': 14,
    'Countness': 15,
    'Jonkheer': 16
}

# Assign dict title to column 'Title'
train_df['Title'] = train_df['Title'].map(title)

#Get dummydata form column 'Embarked'
dummydata = pd.get_dummies(train_df['Embarked'])

#Concat dataframe train_df with dataframe dummydata
train_df = pd.concat([train_df, dummydata], axis=1).drop(['Embarked'], axis=1)

In [9]:
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Title            1
C                0
Q                0
S                0
dtype: int64

#### Drop Row with Null data

In [10]:
train_df.dropna(subset=['Title'],inplace=True)

#### Show Null data

In [11]:
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Title            0
C                0
Q                0
S                0
dtype: int64

#### Show non Int Data

In [12]:
show_nonint(train_df)

Unnamed: 0,Name,Ticket,Cabin
0,"Braund, Mr. Owen Harris",A/5 21171,
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",PC 17599,C85
2,"Heikkinen, Miss. Laina",STON/O2. 3101282,
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",113803,C123
4,"Allen, Mr. William Henry",373450,


#### Drop Columns  'Name', 'Ticket', 'Cabin'

In [13]:
#get rid of of column ticket and cabin
train_df.drop(['Name', 'Ticket', 'Cabin'], inplace = True, axis=1)
#train_df.head()

In [14]:
train_df.isnull().sum()


PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Title            0
C                0
Q                0
S                0
dtype: int64

In [15]:
mean_age = train_df['Age'].mean()
train_df['Age'] = train_df['Age'].fillna(value=mean_age)



train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Title,C,Q,S
0,1,0,3,1,22.0,1,0,7.25,0.0,0,0,1
1,2,1,1,0,38.0,1,0,71.2833,1.0,1,0,0
2,3,1,3,0,26.0,0,0,7.925,2.0,0,0,1
3,4,1,1,0,35.0,1,0,53.1,1.0,0,0,1
4,5,0,3,1,35.0,0,0,8.05,0.0,0,0,1


#### Show Null data

In [16]:
train_df.isna().sum()


PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Title          0
C              0
Q              0
S              0
dtype: int64

In [17]:
#train_df.head(2)

#### Define Columns for X and y for splitting

In [18]:
#Choose Columns "Pclass","Sex","Age"
X = train_df[["Pclass","Sex","Age"]]
#Choose Column"Survived"
y = train_df[["Survived"]]

len(X) == len(y)

True

### Train Test Split

In [19]:
from sklearn.model_selection import train_test_split
#immer erst split dann feature engeniering
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=40)

m = LogisticRegression(solver='lbfgs')

In [20]:
m.fit(Xtrain, ytrain.values.ravel())
print("train score:", m.score(Xtrain, ytrain))
print("test score:", m.score(Xtest, ytest))

train score: 0.7991004497751124
test score: 0.7982062780269058


### Random Forest Classifier

In [21]:
from sklearn.ensemble import RandomForestClassifier

m = RandomForestClassifier(n_estimators=20, max_depth=20, criterion="gini",verbose=False)

In [22]:
# fit data
m.fit(Xtrain, ytrain. values.ravel())
print("train score:", m.score(Xtrain, ytrain))
print("test  score:", m.score(Xtest, ytest))

train score: 0.8845577211394303
test  score: 0.7982062780269058


#### Random Forest Classifier as function

In [23]:
rand__forest(5,5)

0.8275862068965517


#### Split Data

In [24]:
X = train_df[["Pclass","Sex","Age", "SibSp", "Parch", "Fare", "C", "Q", "S"]]
y = train_df[["Survived"]]

print("len:", len(X) == len(y))
Xtrain, Xtest, ytrain, ytest=train_test_split(X,y, random_state=40)
print("Xtrain", Xtrain.shape, "Xtest", Xtest.shape, "ytrain", ytrain.shape,
      "ytest", ytest.shape)

len: True
Xtrain (667, 9) Xtest (223, 9) ytrain (667, 1) ytest (223, 1)


### Random Forrest

In [25]:
m = RandomForestClassifier(n_estimators=20, max_depth=20, criterion="gini")

In [26]:
m.fit(Xtrain, ytrain.values.ravel())
m.score(Xtrain, ytrain)

0.9700149925037481

#### LogisticRegression


In [27]:
#m = LogisticRegression()
# create and configure model
m = LogisticRegression(solver='lbfgs', max_iter=400)
# create and configure model
#m = LogisticRegression(solver='lbfgs')
m.fit(Xtrain, ytrain.values.ravel())
#m.score(X_train,y_train)

print("train score: ", round(m.score(Xtrain, ytrain) * 100, 2), "%")
print("test score:  ", round(m.score(Xtest, ytest) * 100, 2), "%")
#print("train score: " ,m.score(Xtrain,ytrain))
#print("test score: ", m.score(Xtest, ytest))

train score:  80.51 %
test score:   80.27 %


### accuracy_score, precision_score, recall_score

In [44]:
print('accuracy score: ', accuracy_score(ytrain, ypred))
print('precision score: ', precision_score(ytrain, ypred))
print('recall score: ', recall_score(ytrain, ypred))

accuracy score:  0.8020989505247377
precision score:  0.7741935483870968
recall score:  0.7164179104477612


#### Cross Validation

In [28]:
#cross validation
m = LogisticRegression(solver='lbfgs',max_iter=400)

scores = cross_val_score(m, Xtrain, ytrain.values.ravel(), cv=5)
#scores
print("score mean:", scores.mean())
print("score std:", scores.std())


score mean: 0.7929809258560241
score std: 0.022673568727314727


#### Adding more columns to input data

In [29]:
X = train_df[[
    "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Title", "C", "Q", "S"
]]

y = train_df[["Survived"]]

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)

print("len:", len(X) == len(y))
print("Xtrain", Xtrain.shape, "Xtest", Xtest.shape, "ytrain", ytrain.shape,
      "ytest", ytest.shape)

len: True
Xtrain (667, 10) Xtest (223, 10) ytrain (667, 1) ytest (223, 1)


## Different Methods

#### Stochastic Gradient Descent (SGD)

In [30]:
sgd = linear_model.SGDClassifier(max_iter=400, tol=1e-3)
sgd.fit(Xtrain, ytrain.values.ravel())
Y_pred = sgd.predict(Xtest)

sgd.score(Xtrain, ytrain)

acc_sgd = round(sgd.score(Xtrain, ytrain) * 100, 2)
print("acc_sgd:", acc_sgd, "%")

acc_sgd: 76.16 %


#### Random Forest

In [31]:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(Xtrain, ytrain.values.ravel())

Y_prediction = random_forest.predict(Xtest)

random_forest.score(Xtrain, ytrain)
acc_random_forest = round(random_forest.score(Xtrain, ytrain) * 100, 2)
print("acc_random_forest:",acc_random_forest, "%")


acc_random_forest: 98.5 %


#### Logistic Regression

In [32]:
# example of LogisticRegression that generates a FutureWarning

# create and configure model
# create and configure model
m = LogisticRegression(solver='lbfgs', multi_class='auto',max_iter=400)
#m = LogisticRegression(solver='lbfgs', multi_class='ovr')
m.fit(Xtrain, ytrain.values.ravel())

Y_pred = m.predict(Xtest)

acc_log = round(m.score(Xtrain, ytrain) * 100, 2)
print("acc_log_reg:",acc_log, "%")

acc_log_reg: 80.21 %


#### K Nearest Neighbor:

In [33]:
knn = KNeighborsClassifier(n_neighbors = 3) 
knn.fit(Xtrain, ytrain.values.ravel())  
Y_pred = knn.predict(Xtest)
acc_knn = round(knn.score(Xtrain, ytrain) * 100, 2)
print("acc_knn:",acc_knn, "%")

acc_knn: 85.91 %


#### Gaussian Naive Bayes:

In [34]:
gaussian = GaussianNB() 
gaussian.fit(Xtrain, ytrain.values.ravel())  
Y_pred = gaussian.predict(Xtest)  
acc_gaussian = round(gaussian.score(Xtrain, ytrain) * 100, 2)
print("acc_gaussian:",acc_gaussian, "%")

acc_gaussian: 80.06 %


#### Perceptron

In [35]:
perceptron = Perceptron(max_iter=400,tol=1e-3)
perceptron.fit(Xtrain, ytrain.values.ravel())

Y_pred = perceptron.predict(Xtest)

acc_perceptron = round(perceptron.score(Xtrain, ytrain) * 100, 2)
print("acc_perceptron:",acc_perceptron, "%")

acc_perceptron: 72.71 %


#### Linear Support Vector Machine:

In [36]:
linear_svc = LinearSVC(max_iter=2000, tol=1e-3)
linear_svc.fit(Xtrain, ytrain.values.ravel())

Y_pred = linear_svc.predict(Xtest)

acc_linear_svc = round(linear_svc.score(Xtrain, ytrain) * 100, 2)
print("acc_linear_svc:",acc_linear_svc, "%")

acc_linear_svc: 78.26 %




#### Decision Tree

In [37]:
decision_tree = DecisionTreeClassifier() 
decision_tree.fit(Xtrain, ytrain)  
Y_pred = decision_tree.predict(Xtest)  
acc_decision_tree = round(decision_tree.score(Xtrain, ytrain) * 100, 2)
print("acc_decision_tree:",acc_decision_tree, "%")

acc_decision_tree: 98.5 %


#### Which is the best model?

In [38]:
results = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 
              'Decision Tree'],
    'Score': [acc_linear_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_decision_tree]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(9)

Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
98.5,Random Forest
98.5,Decision Tree
85.91,KNN
80.21,Logistic Regression
80.06,Naive Bayes
78.26,Support Vector Machines
76.16,Stochastic Gradient Decent
72.71,Perceptron


#### LogisticRegression as function

In [39]:
log_reg()

train score: 0.8020989505247377
test score: 0.7713004484304933


#### accuracy_score, precision_score, recall_score as function

In [45]:
ypred = m.predict(Xtrain)
print(ypred)

print('accuracy score: ', accuracy_score(ytrain, ypred))
print('precision score: ', precision_score(ytrain, ypred))
print('recall score: ', recall_score(ytrain, ypred))

[0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 1 0 1
 0 1 1 0 1 0 0 0 0 1 0 0 0 0 1 1 1 0 1 0 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0
 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 1 0 0 0 1
 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 1 1 1 1 0 0 0 0 0 1 1 1 0 0 0 0
 0 0 0 1 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 0 1 1 1 1 0 0 1 0 0 0 0 1 0 0 1 0 0
 0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 1 1 0 0 1 0 0 0 0 0 1 1 1 0 1 1
 0 0 0 0 1 0 0 1 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 1 1 1 0 0 0 0 0 1 0 0 0 1
 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 0 0 1 0 0 1 1 0 0 0 1 0 1 1 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1
 1 0 0 0 0 1 1 0 0 0 1 1 0 0 0 1 1 0 1 0 1 0 0 1 0 1 0 1 0 1 0 0 0 0 1 0 0
 1 0 0 1 0 1 0 0 0 0 0 1 0 1 0 1 1 0 1 0 1 0 1 0 1 0 0 1 1 0 0 0 1 1 0 1 1
 0 1 0 0 1 0 0 1 1 0 1 0 0 0 0 0 1 0 1 0 1 1 1 1 0 0 1 0 1 1 1 1 1 0 0 0 1
 1 1 1 0 1 0 0 1 0 0 0 1 

#### Random Forest as function // different values

In [41]:
#1st value n_estimators || 2nd value max_depth
rand__forest(1,1)
rand__forest(10,20)
rand__forest(20,20)
rand__forest(100,100)

0.7886056971514243
0.967016491754123
0.9775112443778111
0.9850074962518741
