In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import linear_model

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn import linear_model, datasets
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import preprocessing

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns


## Classificatie

In [2]:
#Inlezen van de dataset
dataset = pd.read_csv('diabetes.csv')
dataset= dataset[dataset["Insulin"] > 0]
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
6,3,78,50,32,88,31.0,0.248,26,1
8,2,197,70,45,543,30.5,0.158,53,1
13,1,189,60,23,846,30.1,0.398,59,1


In [3]:
features = list(dataset.columns[:8])
X = dataset[features].values 
y= dataset['Outcome'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state =0)
print(X_test.shape)

(119, 8)


In [4]:
# Random Forest Classifier
number_of_trees = 1000
max_number_of_features = 2

RFCmodel = RandomForestClassifier(n_estimators=number_of_trees, max_features=max_number_of_features)


RFCmodel.fit(X_train,y_train)

print(RFCmodel.feature_importances_)

[0.08290801 0.23188339 0.07733118 0.08973489 0.15615375 0.11310049
 0.10610878 0.14277952]


In [5]:
# Test Random Forest Classifier

y_pred = RFCmodel.predict(X_test)

print(classification_report(y_test, y_pred))

cf = confusion_matrix(y_test, y_pred)
print(cf)
print(accuracy_score(y_test, y_pred) * 100) 

              precision    recall  f1-score   support

           0       0.81      0.93      0.87        76
           1       0.84      0.60      0.70        43

   micro avg       0.82      0.82      0.82       119
   macro avg       0.82      0.77      0.78       119
weighted avg       0.82      0.82      0.81       119

[[71  5]
 [17 26]]
81.5126050420168


In [27]:
# Bagging met logistic regression

number_of_estimators = 100
complexity = 10
cart = LogisticRegression(C=complexity,solver='liblinear')


lregbagging = BaggingClassifier(base_estimator=cart, n_estimators=number_of_estimators)

lregbagging.fit(X_train,y_train)


y_pred = lregbagging.predict(X_test)

print(classification_report(y_test, y_pred))

cf = confusion_matrix(y_test, y_pred)
print(cf)
print(accuracy_score(y_test, y_pred) * 100) 


              precision    recall  f1-score   support

           0       0.81      0.96      0.88        76
           1       0.90      0.60      0.72        43

   micro avg       0.83      0.83      0.83       119
   macro avg       0.85      0.78      0.80       119
weighted avg       0.84      0.83      0.82       119

[[73  3]
 [17 26]]
83.19327731092437


In [45]:
# Adaboost

clf_adaboost = AdaBoostClassifier(n_estimators=150,learning_rate=0.9)
clf_adaboost.fit(X_train,y_train)

y_pred = clf_adaboost.predict(X_test)

print(classification_report(y_test, y_pred))

cf = confusion_matrix(y_test, y_pred)
print(cf)
print(accuracy_score(y_test, y_pred) * 100) 


              precision    recall  f1-score   support

           0       0.82      0.93      0.87        76
           1       0.84      0.63      0.72        43

   micro avg       0.82      0.82      0.82       119
   macro avg       0.83      0.78      0.80       119
weighted avg       0.83      0.82      0.82       119

[[71  5]
 [16 27]]
82.35294117647058


In [48]:
# Adaboost met logistic regression classifier

cart = LogisticRegression(C=complexity,solver='liblinear')
logreg_adaboost = AdaBoostClassifier(base_estimator=cart,n_estimators=150,learning_rate=0.9) 
logreg_adaboost.fit(X_train,y_train)

y_pred = logreg_adaboost.predict(X_test)

print(classification_report(y_test, y_pred))

cf = confusion_matrix(y_test, y_pred)
print(cf)
print(accuracy_score(y_test, y_pred) * 100) 



              precision    recall  f1-score   support

           0       0.81      0.97      0.89        76
           1       0.93      0.60      0.73        43

   micro avg       0.84      0.84      0.84       119
   macro avg       0.87      0.79      0.81       119
weighted avg       0.85      0.84      0.83       119

[[74  2]
 [17 26]]
84.03361344537815


In [62]:
# Gradient boosting

clf_gradientboost = GradientBoostingClassifier(n_estimators=150,learning_rate=0.8)

clf_gradientboost.fit(X_train,y_train)


y_pred = clf_gradientboost.predict(X_test)

print(classification_report(y_test, y_pred))

cf = confusion_matrix(y_test, y_pred)
print(cf)
print(accuracy_score(y_test, y_pred) * 100) 



              precision    recall  f1-score   support

           0       0.80      0.95      0.87        76
           1       0.86      0.58      0.69        43

   micro avg       0.82      0.82      0.82       119
   macro avg       0.83      0.76      0.78       119
weighted avg       0.82      0.82      0.80       119

[[72  4]
 [18 25]]
81.5126050420168


In [67]:
#### Classificatie met XGBoost

from xgboost import XGBClassifier

clf_xgb = XGBClassifier(n_estimators=150)
clf_xgb.fit(X_train,y_train)

y_pred = clf_xgb.predict(X_test)

print(classification_report(y_test, y_pred))

cf = confusion_matrix(y_test, y_pred)
print(cf)
print(accuracy_score(y_test, y_pred) * 100) 



              precision    recall  f1-score   support

           0       0.82      0.92      0.87        76
           1       0.82      0.65      0.73        43

   micro avg       0.82      0.82      0.82       119
   macro avg       0.82      0.79      0.80       119
weighted avg       0.82      0.82      0.82       119

[[70  6]
 [15 28]]
82.35294117647058


## Regressie

### Split criterium

In [3]:
dataset = pd.read_csv('Golf.csv')
dataset.head(14)

Unnamed: 0,Day,Outlook,Temp,Humidity,Wind,Players
0,1,Sunny,Hot,High,Weak,25
1,2,Sunny,Hot,High,Strong,30
2,3,Overcast,Hot,High,Weak,46
3,4,Rain,Mild,High,Weak,45
4,5,Rain,Cool,Normal,Weak,52
5,6,Rain,Cool,Normal,Strong,23
6,7,Overcast,Cool,Normal,Strong,43
7,8,Sunny,Mild,High,Weak,35
8,9,Sunny,Cool,Normal,Weak,38
9,10,Rain,Mild,Normal,Weak,46


In [13]:
# Splitsen volgens Outlook

# Variantie van het aantal spelers voor splitsing

var_tot = dataset['Players'].var()
print('De totale variantie bedraagt:', var_tot)

# Splitsen volgens outlook (Sunny, Overcast en Rain)
var_sunny = dataset[dataset['Outlook']=='Sunny'].Players.var()
nr_sunny = dataset[dataset['Outlook']=='Sunny'].Players.count()
print('Sunny:','variantie:',var_sunny,' aantal:',nr_sunny)

var_overcast = dataset[dataset['Outlook']=='Overcast'].Players.var()
nr_overcast = dataset[dataset['Outlook']=='Overcast'].Players.count()
print('Overcast:','variantie:',var_overcast,' aantal:',nr_overcast)

var_rain = dataset[dataset['Outlook']=='Rain'].Players.var()
nr_rain = dataset[dataset['Outlook']=='Rain'].Players.count()
print('Rain:','variantie:',var_sunny,' aantal:',nr_sunny)

var_comb = ((nr_sunny*var_sunny) + (nr_overcast*var_overcast) + (nr_rain*var_rain))/dataset['Players'].count()

print('Gewogen variantie na de split:', var_comb)

# Splitsen volgens wind

var_weak = dataset[dataset['Wind']=='Weak'].Players.var()
nr_weak = dataset[dataset['Wind']=='Weak'].Players.count()
print('weak wind:','variantie:',var_weak,' aantal:',nr_weak)

var_strong = dataset[dataset['Wind']=='Strong'].Players.var()
nr_strong = dataset[dataset['Wind']=='Strong'].Players.count()
print('strong wind:','variantie:',var_strong,' aantal:',nr_strong)

var_comb = ((nr_weak*var_weak) + (nr_strong*var_strong))/dataset['Players'].count()
print('Gewogen variantie na de split:', var_comb)

# Predictie op basis van enkel criterium outlook

print('gemiddelde bij zonnig weer:', dataset[dataset['Outlook']=='Sunny'].Players.mean() )
print('gemiddelde bij bewolkt weer:', dataset[dataset['Outlook']=='Overcast'].Players.mean() )
print('gemiddelde bij regen:', dataset[dataset['Outlook']=='Rain'].Players.mean() )

De totale variantie bedraagt: 93.56593406593407
Sunny: variantie: 75.69999999999999  aantal: 5
Overcast: variantie: 16.25  aantal: 4
Rain: variantie: 75.69999999999999  aantal: 5
Gewogen variantie na de split: 84.42857142857143
weak wind: variantie: 70.83928571428571  aantal: 8
strong wind: variantie: 134.66666666666669  aantal: 6
Gewogen variantie na de split: 98.1938775510204
gemiddelde bij zonnig weer: 35.2
gemiddelde bij bewolkt weer: 46.25
gemiddelde bij regen: 39.2


### Regressie via decision trees op de Bostong housing dataset

In [15]:
dataset = pd.read_csv('boston_housing.csv')

In [16]:
dataset.drop('CHAS', axis=1, inplace=True)
# Dataset opsplitsen in training en test set

from scipy import stats
dataset = dataset[(np.abs(stats.zscore(dataset)) < 3).all(axis=1)]
dataset.describe()

features = list(dataset.columns[:dataset.columns.size-1])
print(features)
X = dataset[features].values 
y= dataset['Price'].values

# Opdelen in training en test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PT', 'B', 'LSTAT']


In [17]:
from sklearn.preprocessing import PolynomialFeatures


# Aanmaken van de hogere orde features
graad = 2

poly = PolynomialFeatures(graad)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)
print('dimensie van X_train_poly: ',X_train_poly.shape)
print('dimensie van X_test_poly: ',X_test_poly.shape)


# met L2 regularisatie via Ridge regression
lregmodel_poly = Ridge(alpha=5,tol=0.0001,fit_intercept=True)
lregmodel_poly.fit(X_train_poly,y_train)

print('R2 score op test set via L2: ',lregmodel_poly.score(X_test_poly,y_test))
# R2 -score via L2 op de trainingset
print('R2 score op training set via L2: ',lregmodel_poly.score(X_train_poly,y_train))



# met L1 regularisatie via Lasso regression
lregmodel_poly = Lasso(alpha=3,tol=0.0001,fit_intercept=True)
lregmodel_poly.fit(X_train_poly,y_train)      
  
      
print('R2 score op test set via L1: ',lregmodel_poly.score(X_test_poly,y_test))
  
# R2 -score via L1 op de trainingset
print('R2 score op training set via L1: ',lregmodel_poly.score(X_train_poly,y_train)) 

dimensie van X_train_poly:  (295, 91)
dimensie van X_test_poly:  (146, 91)
R2 score op test set via L2:  0.8280172944539267
R2 score op training set via L2:  0.896136969947186
R2 score op test set via L1:  0.8592830757971259
R2 score op training set via L1:  0.8761987185666626




In [18]:
# Random Forest Regressor

RFR_model = RandomForestRegressor(n_estimators=150)
RFR_model.fit(X_train_poly,y_train)

RFR_model.score(X_test_poly,y_test)

0.8974988236096475