In [1]:
import math
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import operator

In [40]:
# Load the three input tables: the test features, the (truncated) training
# features, and the table holding the accuracy_group labels to predict.
reduced_test, reduced_truncated_data, accuracy_group_to_predict = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../../data/reduced_test.csv",
        "../../../data/reduced_truncated_data.csv",
        "../../../data/accuracy_group_to_predict.csv",
    )
)

In [41]:
# Keep only the columns present in both tables, so that a model fitted on one
# can be applied to the other. The intersection is computed once and used to
# select (and order) the columns of both frames.
common_columns = reduced_truncated_data.columns.intersection(reduced_test.columns)
reduced_truncated_data = reduced_truncated_data[common_columns]
reduced_test = reduced_test[common_columns]

In [42]:
# Build Y: the accuracy_group label for each row of reduced_truncated_data.
#
# A LEFT merge is required here: it keeps exactly the rows of
# reduced_truncated_data, in their original order, so that Y stays positionally
# aligned with the feature table (later cells filter both with the same
# positional mask and fit the model on them). An outer merge sorts the join
# keys and can append label-only rows, which silently breaks that alignment.
# Rows whose installation_id has no label get NaN.
#
# validate='many_to_one' makes the merge fail loudly if accuracy_group_to_predict
# holds several rows for one installation_id (which would duplicate feature rows).
#
# NOTE(review): the '_y' suffix means reduced_truncated_data also carries its own
# 'accuracy_group' column, which stays among the features - confirm that this is
# intended and not target leakage.
labelled_rows = pd.merge(
    reduced_truncated_data,
    accuracy_group_to_predict,
    on='installation_id',
    how='left',
    validate='many_to_one',
)
Y = labelled_rows["accuracy_group_y"]

In [43]:
# Sanity check: print the dimensions of the target and of the three tables,
# then preview the first few target values.
for frame in (Y, reduced_truncated_data, reduced_test, accuracy_group_to_predict):
    print(frame.shape)
Y.head()

(17690,)
(17690, 884)
(1000, 884)
(3302, 3)


0    3.0
1    3.0
2    3.0
3    3.0
4    3.0
Name: accuracy_group_y, dtype: float64

In [44]:
# Inspect missing values, then drop from both Y and reduced_truncated_data the
# rows whose target is missing.
print(reduced_truncated_data.isnull().any(axis=1).any())  # no NA in the features
print(Y.isnull().any())  # at least one NA in the target

# Positional boolean mask (plain list): True where the target is present.
# The same mask is applied to both objects so they stay row-aligned.
rows_to_keep = Y.notna().tolist()
Y = Y[rows_to_keep]
reduced_truncated_data = reduced_truncated_data[rows_to_keep]

print(reduced_truncated_data.shape)

False
True
(16041, 884)


In [45]:
# Non-numeric columns must be removed before fitting the regression.
# Compare against the builtin `object`: the `np.object` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
msk = reduced_truncated_data.dtypes == object
print(reduced_truncated_data.loc[:, msk].columns)

# The only object-typed column reported above is the identifier; drop it
# without mutating the existing frame in place.
reduced_truncated_data = reduced_truncated_data.drop(columns=['installation_id'])

Index(['installation_id'], dtype='object')


In [46]:
# Fit an ordinary least-squares regression of the target on all remaining
# feature columns (fit() returns the fitted estimator itself).
regressor = LinearRegression()
model = regressor.fit(reduced_truncated_data, Y)

In [47]:
# Coefficient of determination (R^2) of the model, evaluated on the same data
# it was fitted on (i.e. a training score, not a validation score).
r_sq = model.score(X=reduced_truncated_data, y=Y)

In [64]:
# Predict on the training features and round to the nearest integer class.
y_pred = np.around(model.predict(reduced_truncated_data))
print(y_pred)

# Are all values in {0, 1, 2, 3}? Rounded predictions that fall outside this
# range are printed, then clamped: values above 3 become 3, values below 0
# become 0. Both masks are built up front; this is safe because clamping the
# high values cannot create or remove negative ones.
for msk, bound in ((y_pred > 3, 3), (y_pred < 0, 0)):
    print(y_pred[msk])
    y_pred[msk] = bound

[3. 2. 3. ... 1. 1. 1.]
[4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.
 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.
 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.
 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.
 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.
 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.
 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
[-1. -1. -1. -1. -1. -2. -1. -1. -2. -1. -1. -1. -1. -1. -2. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1.]
[]


In [67]:
# Compared with the true labels, how many predictions are exactly right?
# (Element-wise comparison of Y with the clamped predictions, then a count of
# True / False. This is measured on the training data.)
Y.eq(y_pred).value_counts()

False    9966
True     6075
Name: accuracy_group_y, dtype: int64