In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
mpl.style.use('seaborn')
from matplotlib import pyplot as plt


from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
train_set = pd.read_csv("preprocessed_train_2.csv", index_col="index")

train_set["change_type"] = train_set["change_type"].map({'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4,
       'Mega Projects': 5})

In [4]:
train_set['change_type'] = train_set['change_type'].apply(lambda x: 4 if x == 5 else x)

In [5]:
train_set['change_type'].value_counts()

2    164120
3     99462
0     29738
1     15020
4      1666
Name: change_type, dtype: int64

In [6]:
X = train_set.drop(columns=["change_type"])
y = train_set["change_type"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

<h1> Models

<h2> LightGBM Based Models

In [6]:
from lightgbm import LGBMClassifier

model = LGBMClassifier().fit(X_train, y_train)

In [9]:
print(classification_report(model.predict(X_test), y_test))

              precision    recall  f1-score   support

           0       0.93      0.76      0.84      9031
           1       0.64      0.77      0.70      3200
           2       0.82      0.80      0.81     41934
           3       0.64      0.69      0.66     23221
           4       0.07      0.25      0.11       116

    accuracy                           0.76     77502
   macro avg       0.62      0.65      0.62     77502
weighted avg       0.77      0.76      0.76     77502



In [8]:
model2 = LGBMClassifier(class_weight="balanced").fit(X_train, y_train)
print(classification_report(model2.predict(X_test), y_test))

              precision    recall  f1-score   support

           0       0.95      0.73      0.83      9690
           1       0.85      0.47      0.61      6818
           2       0.68      0.86      0.76     32566
           3       0.63      0.63      0.63     24894
           4       0.65      0.07      0.13      3534

    accuracy                           0.70     77502
   macro avg       0.75      0.55      0.59     77502
weighted avg       0.71      0.70      0.68     77502



<h3> Model test

In [7]:
from sklearn.metrics import f1_score

def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
    return 'f1', f1_score(y_true, y_hat,average='macro'), True

In [8]:
from sklearn.metrics import recall_score

def lgb_recall_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
    return 'recall', recall_score(y_true, y_hat,average='macro'), True

In [9]:
train_set['change_type'] = train_set['change_type'].apply(lambda x: 0 if x in [0, 1, 2, 3] else 1)

X = train_set.drop(columns=["change_type"])
y = train_set["change_type"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [10]:
y.value_counts()

0    308340
1      1666
Name: change_type, dtype: int64

In [11]:
model3 = LGBMClassifier(metric = "lgb_recall_score", class_weight={0:2, 1:10}).fit(X_train, y_train)
print(classification_report(model3.predict(X_test), y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     77187
           1       0.16      0.23      0.19       315

    accuracy                           0.99     77502
   macro avg       0.58      0.61      0.59     77502
weighted avg       0.99      0.99      0.99     77502



In [24]:
model4 = LGBMClassifier(class_weight={0:100, 1:1}).fit(X_train, y_train)
print(classification_report(model4.predict(X_test), y_test))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     77428
           1       0.01      0.08      0.02        74

    accuracy                           0.99     77502
   macro avg       0.51      0.54      0.51     77502
weighted avg       1.00      0.99      1.00     77502



Bon il faut se démerder pour augmenter le recall de la classe 4-5 de ouf sans chuter la précision...

<h2> Keras Based Model

In [22]:
from keras import backend as K

def f1_weighted(true, pred): #shapes (batch, 4)

    return 1 - f1_score(true, pred) #for metrics, return only 'weighted_f1'

In [30]:
from keras import Sequential
from keras.layers import Dense

model_net = Sequential([
    Dense(512, activation = "sigmoid", input_shape = [np.shape(X_train)[1]]),
    Dense(512, activation = "sigmoid"),
    Dense(256, activation = "sigmoid"),
    Dense(5, activation = "sigmoid")
])

model_net.compile(optimizer = "rmsprop", loss = "sparse_categorical_crossentropy", metrics = ["accuracy"])

In [31]:
history = model_net.fit(X_train, y_train, batch_size = 256, epochs = 10, validation_split = 0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
np.argmax(model_net.predict(X_test), axis = 1)

array([3, 2, 2, ..., 2, 3, 2], dtype=int64)

In [32]:
print(classification_report(np.argmax(model_net.predict(X_test), axis = 1), y_test))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     77502
           1       0.00      0.00      0.00         0

    accuracy                           0.99     77502
   macro avg       0.50      0.50      0.50     77502
weighted avg       1.00      0.99      1.00     77502



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
confusion_matrix(np.argmax(model_net.predict(X_test), axis = 1), y_test)

array([[77061,   441],
       [    0,     0]], dtype=int64)

In [35]:
y_test.value_counts()

0    77061
1      441
Name: change_type, dtype: int64

<h2> New complex model

In [11]:
from lightgbm import LGBMClassifier

In [8]:
y_train.value_counts()

2    123333
3     74508
0     22175
1     11236
4      1252
Name: change_type, dtype: int64

In [1]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

def new_sampling(X, y, strategy):
    strat_under = {}

    for cat, nb in strategy.items():
        if nb < np.unique(y, return_counts=True)[1][cat]:
            strat_under[cat] = nb
        else : 
            strat_under[cat] = np.unique(y, return_counts=True)[1][cat]
    X, y = RandomUnderSampler(sampling_strategy=strat_under).fit_resample(X, y)
    X, y = RandomOverSampler(sampling_strategy=strategy).fit_resample(X, y)

    return X, y

In [9]:
# First we are going to make some Under and Over Sampling

strategy = {0 : 40000, 1 : 40000, 2 : 70000, 3 : 50000, 4 : 30000} # L'adapter pour ne pas le faire à la main !!!

X_train_new, y_train_new = new_sampling(X_train, y_train, strategy)

In [14]:
model_comp = LGBMClassifier(class_weight={0:1, 1:3, 2:2, 3:3, 4:17}).fit(X_train, y_train)
print(classification_report(model_comp.predict(X_test), y_test))

# IDEE : Faire un GridSearch sur les class_weight
# C'est pas trop mal en vraiii

              precision    recall  f1-score   support

           0       0.87      0.78      0.82      8399
           1       0.64      0.77      0.70      3153
           2       0.73      0.84      0.78     35528
           3       0.75      0.63      0.68     29707
           4       0.32      0.19      0.24       715

    accuracy                           0.74     77502
   macro avg       0.66      0.64      0.65     77502
weighted avg       0.75      0.74      0.74     77502



In [16]:
print(confusion_matrix(model_comp.predict(X_test), y_test))

[[ 6551    21  1320   498     9]
 [   24  2427   205   489     8]
 [  321   204 29945  4959    99]
 [  655  1102  9131 18655   164]
 [   12    30   186   353   134]]
