In [1]:
import math
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import operator

In [148]:
#on importe les données
reduced_test = pd.read_csv("../../../data/reduced_test.csv")
reduced_truncated_data = pd.read_csv("../../../data/reduced_truncated_data.csv")
accuracy_group_to_predict = pd.read_csv("../../../data/accuracy_group_to_predict.csv")

In [149]:
#on ne garde que les colonnes communes, afin que le modèle puisse s'appliquer sur l'un puis sur l'autre
reduced_truncated_data = reduced_truncated_data[reduced_truncated_data.columns.intersection(reduced_test.columns)]
reduced_test = reduced_test[reduced_truncated_data.columns.intersection(reduced_test.columns)]

In [150]:
#On obtient Y contenant l'accuracy_group pour chaque ligne de reduced_truncated_data
Y = pd.merge(reduced_truncated_data, accuracy_group_to_predict, on='installation_id', how='outer')
Y = Y[["installation_id","accuracy_group_y"]]
Y.columns = ['installation_id', 'accuracy_group']

In [152]:
#on regarde et enlève dans Y et reduced_truncates_data les lignes ou au moins l'un des deux a un NA
print(pd.isnull(reduced_truncated_data).any(axis=1).any(axis=0)) #aucun NA
print(pd.isnull(Y).any(axis=1).any(axis=0)) #au moins un NA

rows_to_keep = list(map(operator.not_,pd.isnull(Y[Y.columns[1]])))
Y = Y[rows_to_keep]
reduced_truncated_data = reduced_truncated_data[rows_to_keep]

print(np.shape(reduced_truncated_data))

False
True
(16041, 884)


In [153]:
print(np.shape(Y))
print(np.shape(reduced_truncated_data))
print(np.shape(reduced_test))
print(np.shape(accuracy_group_to_predict))
Y.head()

(16041, 2)
(16041, 884)
(1000, 884)
(3302, 3)


Unnamed: 0,installation_id,accuracy_group
0,0006a69f,3.0
1,0006a69f,3.0
2,0006a69f,3.0
3,0006a69f,3.0
4,0006a69f,3.0


In [154]:
#il faut enlever les colonnes non numériques 
msk = reduced_truncated_data.dtypes == np.object
print(reduced_truncated_data.loc[:,msk].columns)

X = reduced_truncated_data.copy()
del X['installation_id']

Y2 = Y.copy()
del Y2['installation_id']

print(np.shape(X))
print(np.shape(reduced_truncated_data))

Index(['installation_id'], dtype='object')
(16041, 883)
(16041, 884)


In [155]:
model = LinearRegression().fit(X, Y2)

In [156]:
y_pred = model.predict(X)
y_pred = np.around(y_pred)
print(y_pred)

#les valeurs sont-elles toutes 0,1,2 ou3 ? Si oui, on les transforme en 0 ou 3
msk = y_pred>3
print(y_pred[msk])
y_pred[msk] = 3

msk = y_pred<0
print(y_pred[msk])
y_pred[msk] = 0

[[3.]
 [2.]
 [3.]
 ...
 [1.]
 [1.]
 [1.]]
[4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.
 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.
 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.
 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.
 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.
 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.
 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
[-1. -1. -1. -1. -1. -2. -1. -1. -2. -1. -1. -1. -1. -1. -2. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1.]


In [157]:
#Par rapport aux vraies valeurs, quelle est notre proportion de bonnes valeurs
compar = (Y2 == y_pred)
compar[compar.columns[0]].value_counts()
#Rq : ici, on a un accuracy_group par ligne (par action) alors qu'on veut seulement l'accuracy_group pour la dernière action

False    9966
True     6075
Name: accuracy_group, dtype: int64

In [158]:
#on récupère les installations id qu'on avait dû enlever pour lancer l'apprentissage
install_ids = pd.DataFrame(reduced_truncated_data['installation_id']).reset_index(drop=True)

#on obtient l'accuracy_group par ligne, avec à chaque fois l'installation_id
y_pred2 = pd.concat([install_ids,pd.DataFrame(y_pred)], axis=1)
y_pred2.columns = ['installation_id', 'accuracy_group']

In [159]:
Y.head()

Unnamed: 0,installation_id,accuracy_group
0,0006a69f,3.0
1,0006a69f,3.0
2,0006a69f,3.0
3,0006a69f,3.0
4,0006a69f,3.0


In [160]:
#On récupère maintenant l'accuracy correspondant à la dernière ligne de chaque installation_id, c'est-à-dire à l'assessment à prédire
def accuracy_by_installation_id(y_pred2):
    unique_id = np.unique(y_pred2['installation_id'])
    
    new_data = pd.DataFrame(columns = ["installation_id","accuracy_group"])
    
    for id in unique_id:
        #last line, so the assessment we want to predict
        last_truncated_id = y_pred2[y_pred2['installation_id'] == id].tail(1)

        #Update new_data
        new_data = pd.concat([new_data, last_truncated_id])
    return(new_data.reset_index(drop=True))


y_pred_final = accuracy_by_installation_id(y_pred2)
Y_final = accuracy_by_installation_id(Y)

In [162]:
#Enfin, on compare l'accruacy_group pour chaque assessment qu'on avait à prédire
compar = (Y_final == y_pred_final)
compar[compar.columns[0]].value_counts()

True    3302
Name: installation_id, dtype: int64

In [None]:
#On a donc que des réussites, ce qui est rassurant car ça veut dire que l'entrainement s'est bien passé
#Toutefois, il y a éventuellement un risque de surapprentissage
#Passons maintenant aux données à prédire (reduced_test)

In [163]:
#il faut enlever les colonnes non numériques 
msk = reduced_test.dtypes == np.object
print(reduced_test.loc[:,msk].columns)

X_test = reduced_test.copy()
del X_test['installation_id']

Index(['installation_id'], dtype='object')


In [164]:
y_pred_test = model.predict(X_test)
y_pred_test = np.around(y_pred_test)

#les valeurs sont-elles toutes 0,1,2 ou3 ? Si oui, on les transforme en 0 ou 3
msk = y_pred_test>3
print(y_pred_test[msk])
y_pred_test[msk] = 3

msk = y_pred_test<0
print(y_pred_test[msk])
y_pred_test[msk] = 0

[[  1.]
 [  1.]
 [  1.]
 [  1.]
 [  2.]
 [  2.]
 [  1.]
 [  2.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  2.]
 [  1.]
 [  1.]
 [  2.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  2.]
 [  2.]
 [  1.]
 [  3.]
 [  1.]
 [  2.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [ -1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  2.]
 [  1.]
 [  2.]
 [  2.]
 [  1.]
 [  1.]
 [  2.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  3.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [ -1.]
 [  1.]
 [  1.]
 [  1.]
 [  2.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [ -5.]
 [  1.]
 [  2.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  2.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  2.]
 [  1.]
 [  2.]
 [  2.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  2.]
 [  1.]
 [  1.]
 [  1.]
 [  2.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  2.]
 [  1.]
 [  2.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  1.]
 [  2.]


In [165]:
#on récupère les installations id qu'on avait dû enlever pour lancer l'apprentissage
install_ids = pd.DataFrame(reduced_test['installation_id']).reset_index(drop=True)

#on obtient l'accuracy_group par ligne, avec à chaque fois l'installation_id
y_pred_test2 = pd.concat([install_ids,pd.DataFrame(y_pred_test)], axis=1)
y_pred_test2.columns = ['installation_id', 'accuracy_group']

In [172]:
y_pred_test_final = accuracy_by_installation_id(y_pred_test2)
y_pred_test_final["accuracy_group"] = y_pred_test_final["accuracy_group"].astype(int)
print(y_pred_test_final.head()

    installation_id  accuracy_group
0          00abaee7               1
1          01242218               1
2          017c5718               1
3          01a44906               1
4          01bc6cb6               2
5          02256298               2
6          0267757a               1
7          027e7ce5               2
8          02a29f99               1
9          0300c576               1
10         03885368               1
11         03ac279b               1
12         03e33699               2
13         048e7427               1
14         04a7bc3f               1
15         04d31500               2
16         0500e23b               1
17         0512bf0e               1
18         0525589b               1
19         05488e26               1
20         05771bba               2
21         05b82cf5               2
22         05e17e19               1
23         0617500d               3
24         068ae11f               1
25         0754f13b               2
26         07749e99         

In [173]:
y_pred_test_final.to_csv(r"../../../data/linear_reg_submission.csv", index = False)