In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sqlfunctions import get_datapunten_voor_model


## Notities
Datapunten AI:
- omzet
- Beursgenoteerd of niet
- sector (nacebel code)
- aantal personeelsleden
- hoofdsteden provincies

In [2]:
# Load all data points into a pandas DataFrame.
# TODO: add 'site_aanwezig' and 'pdf_aanwezig' to the model later and check
#       whether they improve the accuracy.
column_names = [
    'ondernemingnr',
    'omzet',
    'beursgenoteerd',
    'sector',
    'personeelsleden',
    'hoofdsteden_provincies',
    'site_aanwezig',
    'pdf_aanwezig',
    'score',
]
data = get_datapunten_voor_model()
dataframe = pd.DataFrame(data, columns=column_names)
dataframe

Unnamed: 0,ondernemingnr,omzet,beursgenoteerd,sector,personeelsleden,hoofdsteden_provincies,site_aanwezig,pdf_aanwezig,score
0,0458458325,12299679.0,False,Rust- en verzorgingstehuizen (R.V.T.),157.0,Roeselare,1,1,0.009328
1,0458458325,12299679.0,False,Rust- en verzorgingstehuizen (R.V.T.),157.0,Roeselare,1,1,0.000000
2,0449500473,8490472.0,False,Bouwmarkten en andere doe-het-zelfzaken in bou...,19.0,Antwerpen,0,1,0.005565
3,0449500473,8490472.0,False,Bouwmarkten en andere doe-het-zelfzaken in bou...,19.0,Antwerpen,0,1,0.004333
4,0449500473,8490472.0,False,Bouwmarkten en andere doe-het-zelfzaken in bou...,19.0,Antwerpen,0,1,0.000000
...,...,...,...,...,...,...,...,...,...
39837,0894952979,1857912.0,False,Detailhandel in andere motorvoertuigen (groter...,24.0,Maaseik,1,1,0.005810
39838,0894952979,1857912.0,False,Detailhandel in andere motorvoertuigen (groter...,24.0,Maaseik,1,1,0.000000
39839,0898270478,612383.0,False,"Teelt van groenten, meloenen en wortel- en kno...",14.0,Mechelen,1,1,0.003220
39840,0898270478,612383.0,False,"Teelt van groenten, meloenen en wortel- en kno...",14.0,Mechelen,1,1,0.004327


In [5]:
# Collapse to one row per 'ondernemingnr': the descriptive columns keep their
# first value within the group, the score becomes the sum of all scores.
first_value_columns = [
    'omzet',
    'beursgenoteerd',
    'sector',
    'personeelsleden',
    'hoofdsteden_provincies',
    'site_aanwezig',
    'pdf_aanwezig',
]
aggregations = dict.fromkeys(first_value_columns, 'first')
aggregations['score'] = 'sum'

grouped = dataframe.groupby('ondernemingnr')
dataframe = grouped.agg(aggregations).reset_index()


In [None]:
# Use 'ondernemingnr' as the row index.
# Guarded so the cell can be re-run safely: once the column has been moved to
# the index it no longer exists as a column, and an unconditional set_index
# would raise a KeyError on the second run.
if 'ondernemingnr' in dataframe.columns:
    dataframe = dataframe.set_index('ondernemingnr')


In [10]:
# Distribution of the 'site_aanwezig' and 'pdf_aanwezig' flags.
for flag_column in ("site_aanwezig", "pdf_aanwezig"):
    print(dataframe[flag_column].value_counts())

# Number of rows that have both a site and a pdf.
has_site = dataframe['site_aanwezig'] == 1
has_pdf = dataframe['pdf_aanwezig'] == 1
len(dataframe[has_site & has_pdf])

1    9057
0    4224
Name: site_aanwezig, dtype: int64
1    12517
0      764
Name: pdf_aanwezig, dtype: int64


8498

In [None]:
# Convert the sector column to a categorical dtype.
sector_as_category = dataframe["sector"].astype('category')
dataframe["sector"] = sector_as_category

# The distinct sectors present in the data.
sectors = sector_as_category.cat.categories

# How are the companies distributed over the sectors?
sector_as_category.value_counts()


In [None]:
# Data preprocessing: turn every feature column into a numeric column so the
# frame can be fed to scikit-learn estimators.
from sklearn.preprocessing import OrdinalEncoder

# Encode the sector labels as integer codes. `ordinal_encoder` is fitted on
# 'sector' only, so `ordinal_encoder.categories_[0]` maps codes back to labels.
ordinal_encoder = OrdinalEncoder()
sector_codes = ordinal_encoder.fit_transform(dataframe[["sector"]])
dataframe["sector"] = sector_codes.ravel().astype(int)

# 'hoofdsteden_provincies' contains text as well (e.g. "Roeselare"); without
# encoding it the estimators cannot be fitted on this column.
hoofdsteden_encoder = OrdinalEncoder()
hoofdsteden_codes = hoofdsteden_encoder.fit_transform(dataframe[["hoofdsteden_provincies"]])
dataframe["hoofdsteden_provincies"] = hoofdsteden_codes.ravel().astype(int)

# Boolean flag -> 0/1.
dataframe["beursgenoteerd"] = dataframe["beursgenoteerd"].astype(int)
# Report the number of listed companies (the original author noted only 17).
# Previously this count was computed mid-cell and silently discarded.
n_beursgenoteerd = len(dataframe[dataframe["beursgenoteerd"] == 1])
print(f"Number of listed companies: {n_beursgenoteerd}")

# NOTE(review): astype(int) raises if the column contains missing values —
# confirm 'personeelsleden' is always filled in.
dataframe["personeelsleden"] = dataframe["personeelsleden"].astype(int)

dataframe



In [None]:
# Frequency of each distinct 'omzet' value.
omzet_counts = dataframe["omzet"].value_counts()
omzet_counts

In [None]:
# Correlation matrix and scatter matrix of the main numeric features.
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# Correlate numeric/boolean columns only: since pandas 2.0 DataFrame.corr()
# no longer drops text columns silently and raises on them instead.
corr = dataframe.select_dtypes(include=["number", "bool"]).corr()

# Correlation of every feature with the target, strongest first.
# (Printed explicitly: a bare expression in the middle of a cell is not shown.)
print(corr["score"].sort_values(ascending=False))

attributes = ["score", "omzet", "personeelsleden", "beursgenoteerd"]
scatter_matrix(dataframe[attributes], figsize=(12, 8))
plt.show()

print(corr)



![white box ml modellen](https://drek4537l1klr.cloudfront.net/thampi/HighResolutionFigures/figure_2-2.png)


In [None]:
# Train and evaluate the white-box models.
# ! Can only be run once all the data is available.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.tree import DecisionTreeRegressor

# 'score' is the target; 'ondernemingnr' is an identifier, not a feature.
# errors="ignore" because 'ondernemingnr' may already have been moved to the
# index, in which case it is no longer a column and drop() would raise.
X = dataframe.drop(columns=["score", "ondernemingnr"], errors="ignore")
y = dataframe["score"]

# The estimators need numeric input: integer-code any feature that is still
# text/categorical at this point (no-op when everything is already numeric).
text_columns = X.select_dtypes(exclude=["number", "bool"]).columns
X = X.assign(**{column: X[column].astype("category").cat.codes for column in text_columns})

# Split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# 'score' is a continuous value (a sum of floats), so this is a regression
# problem: accuracy_score and LogisticRegression only accept discrete class
# labels and raise a ValueError on a continuous target. Regression models are
# therefore evaluated with R^2 and mean absolute error instead.

# Decision tree regressor
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(X_train, y_train)
y_pred_tree = regressor.predict(X_test)

# Linear regression (white-box counterpart of logistic regression for a
# continuous target)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred_linear = model.predict(X_test)
# Keep the original name available for any later cell that reads it.
y_pred = y_pred_linear

# Each model is reported with its own predictions (previously both figures
# were computed from the same, last-assigned y_pred).
print(
    f"Decision Tree     - R2: {r2_score(y_test, y_pred_tree):.4f}, "
    f"MAE: {mean_absolute_error(y_test, y_pred_tree):.6f}\n"
    f"Linear Regression - R2: {r2_score(y_test, y_pred_linear):.4f}, "
    f"MAE: {mean_absolute_error(y_test, y_pred_linear):.6f}"
)

