# ML-Projekt Demo
In diesem Notebook werden alle wichtigen Schritte eines ML-Projekts anhand des Titanic-Datensatzes demonstriert.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

pd.options.mode.chained_assignment = None  # avoid slide-copy-warning

### 1. Daten Laden und Analysieren

In [2]:
df = pd.read_csv('titanic.csv')
df.shape

(891, 12)

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
print(df['Survived'].value_counts())
print(df['Pclass'].value_counts())
print(df['Sex'].value_counts())
print(df["Embarked"].value_counts())

0    549
1    342
Name: Survived, dtype: int64
3    491
1    216
2    184
Name: Pclass, dtype: int64
male      577
female    314
Name: Sex, dtype: int64
S    644
C    168
Q     77
Name: Embarked, dtype: int64


### 2. Kategorische Feature umwandeln

In [6]:
one_hot_Pclass = pd.get_dummies(df["Pclass"], prefix="Pclass")
one_hot_gender = pd.get_dummies(df["Sex"], prefix="Sex")
one_hot_embarked = pd.get_dummies(df["Embarked"], prefix="Embarked")

In [7]:
df_all = pd.concat([df, one_hot_Pclass, one_hot_gender, one_hot_embarked], axis=1)
df_all.head().T

Unnamed: 0,0,1,2,3,4
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Pclass,3,1,3,1,3
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,male,female,female,female,male
Age,22,38,26,35,35
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,7.25,71.2833,7.925,53.1,8.05


### 3. Train-Test-Split

In [8]:
drop_cols = ["PassengerId", "Survived", "Pclass", "Name", "Sex", "Cabin", "Ticket", "Embarked"]
df_X = df_all.drop(columns=drop_cols)
df_y = df_all["Survived"]
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=0)
X_train.head().T

Unnamed: 0,140,439,817,378,491
Age,,31.0,31.0,20.0,21.0
SibSp,0.0,0.0,1.0,0.0,0.0
Parch,2.0,0.0,1.0,0.0,0.0
Fare,15.2458,10.5,37.0042,4.0125,7.25
Pclass_1,0.0,0.0,0.0,0.0,0.0
Pclass_2,0.0,1.0,1.0,0.0,0.0
Pclass_3,1.0,0.0,0.0,1.0,1.0
Sex_female,1.0,0.0,0.0,0.0,0.0
Sex_male,0.0,1.0,1.0,1.0,1.0
Embarked_C,1.0,0.0,1.0,1.0,0.0


### 4. N/As auffüllen

In [9]:
X_train.isna().sum()

Age           141
SibSp           0
Parch           0
Fare            0
Pclass_1        0
Pclass_2        0
Pclass_3        0
Sex_female      0
Sex_male        0
Embarked_C      0
Embarked_Q      0
Embarked_S      0
dtype: int64

In [10]:
age_mean = X_train['Age'].mean()
print(age_mean)
X_train['Age'] = X_train['Age'].fillna(age_mean)

29.745183887915935


In [11]:
X_train.isna().sum()

Age           0
SibSp         0
Parch         0
Fare          0
Pclass_1      0
Pclass_2      0
Pclass_3      0
Sex_female    0
Sex_male      0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

### 5. Feature Scaling

In [12]:
scaler = StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)

### 6. Model Training mit CV

In [13]:
parameter_candidates = [{'n_estimators': [60,80,100,120,140], 'max_depth': [2, 3, 4, 5]}]
gbt = GradientBoostingClassifier(random_state=0)
clf_cv = GridSearchCV(estimator=gbt, param_grid=parameter_candidates, n_jobs=-1)
clf_cv.fit(X_scaled, y_train)

GridSearchCV(estimator=GradientBoostingClassifier(random_state=0), n_jobs=-1,
             param_grid=[{'max_depth': [2, 3, 4, 5],
                          'n_estimators': [60, 80, 100, 120, 140]}])

In [14]:
print('Best n_estimators:', clf_cv.best_estimator_.n_estimators) 
print('Best max_depth:', clf_cv.best_estimator_.max_depth)

Best n_estimators: 120
Best max_depth: 3


In [15]:
clf = GradientBoostingClassifier(random_state=0, n_estimators=120, max_depth=3)
clf.fit(X_scaled, y_train)

GradientBoostingClassifier(n_estimators=120, random_state=0)

### 7. Evaluation

In [16]:
X_test = X_test.dropna()
y_test = y_test[X_test.index]

In [17]:
X_test_scaled = scaler.transform(X_test)
predictions = clf.predict(X_test_scaled)

In [18]:
print("Accuracy score is:")
accuracy_score(y_test, predictions)

Accuracy score is:


0.8601398601398601

In [19]:
print("Recall is:")
recall_score(y_test, predictions)

Recall is:


0.7627118644067796

In [20]:
print("Precision is:")
precision_score(y_test, predictions)

Precision is:


0.8823529411764706