# TP : Data preparation

La préparation des données est une étape fondamentale dans le cycle du machine learning. Selon les experts d’IBM, le data cleaning représentent près de 80% du temps des analystes et autres data scientists ! Des erreurs durant ces étapes peuvent mener à des mauvaux fonctionnement ou de mauvaises performances très difficiles à déboguer.

## Import des librairies

In [22]:
import pandas as pd # Gestion des données
import seaborn as sns # outil de visualisation
import matplotlib.pyplot as plt # Librairie de visualisation
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler # Outil de preprocessing


In [23]:
# Import des librairies de métriques, encoding, preprocessing
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_percentage_error as mape

# Import des différents modèles de machine learning
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, learning_curve
import xgboost as xgb
import lightgbm as lgbm

In [24]:
# Import les donnes parametres
df_params = pd.read_csv('parameters.csv')
df_params


Unnamed: 0,flight,speed,payload,altitude,date,local_time,route
0,1,4.0,0.0,25,2019-04-07,10:13,R5
1,2,4.0,0.0,50,2019-04-07,10:23,R5
2,3,6.0,0.0,25,2019-04-07,10:33,R5
3,4,8.0,0.0,25,2019-04-07,10:48,R5
4,5,4.0,0.0,25,2019-04-07,11:05,R2
...,...,...,...,...,...,...,...
204,275,8.0,500.0,25,2019-10-24,9:05,R1
205,276,10.0,500.0,25,2019-10-24,9:32,R1
206,277,10.0,500.0,25,2019-10-24,9:45,R1
207,278,10.0,500.0,25-50-100-25,2019-10-24,10:00,R7


In [25]:
# Import les donnes flights
df_flight = pd.read_csv('flights.csv')
df_flight


  df_flight = pd.read_csv('flights.csv')


Unnamed: 0,flight,time,wind_speed,wind_angle,battery_voltage,battery_current,position_x,position_y,position_z,orientation_x,...,angular_z,linear_acceleration_x,linear_acceleration_y,linear_acceleration_z,speed,payload,altitude,date,time_day,route
0,1,0.00,0.1,12.0,24.222174,0.087470,-79.782396,40.458047,269.332402,0.001772,...,0.006815,0.004258,-0.120405,-9.811137,4.0,0.0,25,2019-04-07,10:13,R5
1,1,0.20,0.1,3.0,24.227180,0.095421,-79.782396,40.458047,269.332056,0.001768,...,0.002034,0.006175,-0.116397,-9.810392,4.0,0.0,25,2019-04-07,10:13,R5
2,1,0.30,0.1,352.0,24.225929,0.095421,-79.782396,40.458047,269.333081,0.001768,...,-0.000874,0.002696,-0.128592,-9.809440,4.0,0.0,25,2019-04-07,10:13,R5
3,1,0.50,0.1,354.0,24.224678,0.095421,-79.782396,40.458047,269.334648,0.001775,...,0.002443,0.002024,-0.128271,-9.810159,4.0,0.0,25,2019-04-07,10:13,R5
4,1,0.60,0.1,359.0,24.210905,0.079518,-79.782396,40.458047,269.336178,0.001775,...,-0.006425,0.008271,-0.119890,-9.812125,4.0,0.0,25,2019-04-07,10:13,R5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257891,279,152.10,1.1,198.0,22.857437,0.095421,-79.782802,40.459018,271.560190,0.021382,...,0.009449,0.444553,-0.274965,-9.796700,10.0,0.0,25-50-100-25,2019-10-24,10:10,R7
257892,279,152.20,1.1,196.0,22.847422,0.095421,-79.782802,40.459018,271.571983,0.021383,...,-0.001755,0.451230,-0.240619,-9.793810,10.0,0.0,25-50-100-25,2019-10-24,10:10,R7
257893,279,152.41,1.2,189.0,22.856186,0.111325,-79.782802,40.459018,271.584533,0.021385,...,0.008545,0.443839,-0.274903,-9.796004,10.0,0.0,25-50-100-25,2019-10-24,10:10,R7
257894,279,152.60,1.1,187.0,22.854933,0.127228,-79.782802,40.459018,271.588050,0.021393,...,-0.001379,0.443880,-0.248434,-9.794703,10.0,0.0,25-50-100-25,2019-10-24,10:10,R7


In [None]:
df

In [26]:
df_flight['route'].unique()

array(['R5', 'R2', 'R3', 'R4', 'R1', 'A1', 'A2', 'A3', 'H', 'R6', 'R7'],
      dtype=object)

In [27]:
df_flight['altitude'].unique()

array([25, 50, 75, 100, 0, 3, '25', '50', '75', '100', '25-50-100-25'],
      dtype=object)

In [28]:
list(df_flight)

['flight',
 'time',
 'wind_speed',
 'wind_angle',
 'battery_voltage',
 'battery_current',
 'position_x',
 'position_y',
 'position_z',
 'orientation_x',
 'orientation_y',
 'orientation_z',
 'orientation_w',
 'velocity_x',
 'velocity_y',
 'velocity_z',
 'angular_x',
 'angular_y',
 'angular_z',
 'linear_acceleration_x',
 'linear_acceleration_y',
 'linear_acceleration_z',
 'speed',
 'payload',
 'altitude',
 'date',
 'time_day',
 'route']

In [29]:
len(list(df_flight))

28

In [31]:
df_test = df_flight

In [32]:
corr_list = df_test.corr(method='pearson', numeric_only=True)['power']
corr_list

flight                  -0.042970
time                    -0.190578
wind_speed               0.479005
wind_angle               0.083061
battery_voltage         -0.639479
battery_current          0.997022
position_x              -0.257301
position_y               0.257280
position_z               0.496799
orientation_x           -0.181636
orientation_y           -0.203067
orientation_z            0.159264
orientation_w            0.032400
velocity_x              -0.105972
velocity_y               0.337205
velocity_z               0.225852
angular_x               -0.005847
angular_y               -0.018338
angular_z                0.061403
linear_acceleration_x   -0.228275
linear_acceleration_y    0.072014
linear_acceleration_z   -0.100247
speed                    0.071127
payload                  0.171903
power                    1.000000
Name: power, dtype: float64

In [None]:
# Hypothèse : il n'y a pas d'influence de la position angulaire et de la vitesse angulaire sur la consommation électrique

Correlacao com power
Indice de voos(clusterizar?) = 'flight',
tempo em segundos = 'time', tem correlacao,
'wind_speed', tem correlacao,
'wind_angle', nao tem correlacao
'battery_voltage' tem correlacao,
'battery_current' tem correlacao,
'position_x' tem correlacao,
'position_y' tem correlacao,
'position_z' tem correlacao,
'orientation_x' nao tem correlacao,
'orientation_y' nao tem correlacao,
'orientation_z' nao tem correlacao,
'orientation_w' nao tem correlacao,
'velocity_x' tem correlacao,
'velocity_y' tem correlacao,
'velocity_z' tem correlacao,
'angular_x' nao tem correlacao,
'angular_y' nao tem correlacao,
'angular_z' nao tem correlacao,
'linear_acceleration_x' tem correlacao,
'linear_acceleration_y' tem correlacao,
'linear_acceleration_z' tem correlacao,
'speed' tem correlacao,
'payload' tem correlacao,
'altitude' nao tem correlacao,
'date'  tem correlacao,
'time_day' ,
'route' utile correlacao.

In [55]:
columns_2_rm = ['wind_angle', 'orientation_x','orientation_y','time_day','orientation_z','orientation_w','angular_x','angular_y' ,'angular_z', 'altitude']
df = df_flight.drop(columns=columns_2_rm, axis=1)
list(df)

['flight',
 'time',
 'wind_speed',
 'battery_voltage',
 'battery_current',
 'position_x',
 'position_y',
 'position_z',
 'velocity_x',
 'velocity_y',
 'velocity_z',
 'linear_acceleration_x',
 'linear_acceleration_y',
 'linear_acceleration_z',
 'speed',
 'payload',
 'date',
 'route',
 'power']

In [59]:
X = df
y = df['power']

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)


In [61]:
X_train

Unnamed: 0,flight,time,wind_speed,battery_voltage,battery_current,position_x,position_y,position_z,velocity_x,velocity_y,velocity_z,linear_acceleration_x,linear_acceleration_y,linear_acceleration_z,speed,payload,date,route,power
236062,258,178.70,0.7,22.697176,22.535328,-79.782743,40.459054,285.069307,0.027567,-0.036216,-1.010734,0.155700,-0.025130,-9.782621,4.0,250.0,2019-08-05,R1,511.488303
133742,170,145.51,0.5,23.285639,0.246505,-79.782803,40.459033,268.371104,-0.050346,0.021282,-0.052412,-0.521039,0.138622,-9.785364,6.0,0.0,2019-07-09,R1,5.740026
170336,199,213.50,0.5,24.018089,-0.103373,-79.782805,40.459033,272.279071,-0.002270,0.012618,0.009100,0.203926,-0.292285,-9.801460,4.0,250.0,2019-07-15,R1,-2.482823
228265,252,68.60,13.8,22.136257,17.629084,-79.781455,40.459517,345.459152,0.153613,9.968023,-0.017441,-2.319635,0.181022,-9.990791,12.0,500.0,2019-08-05,R1,390.241929
34135,85,151.80,9.8,22.330324,14.392711,-79.782794,40.459011,368.130381,-0.009155,-0.075389,-1.469511,-1.254471,0.011219,-8.767081,10.0,0.0,2019-06-11,R1,321.393895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,158,96.80,11.4,21.333693,27.886871,-79.782476,40.458321,347.845865,-0.314320,11.361715,4.198701,-2.363998,0.129121,-9.993191,12.0,250.0,2019-07-03,R1,594.929939
103694,146,21.10,4.0,22.774803,26.224951,-79.782754,40.458956,288.014403,-0.103477,0.036878,2.682021,0.249293,-0.188426,-9.658655,10.0,0.0,2019-07-01,R1,597.268092
131932,169,57.80,7.1,23.742638,19.752207,-79.781780,40.459405,315.937927,-0.316184,5.938248,0.738611,-1.073939,0.011671,-10.014132,6.0,0.0,2019-07-09,R1,468.969489
146867,181,126.10,4.2,23.348242,18.177755,-79.782811,40.458930,309.661785,-0.103618,0.177358,-2.592193,0.172799,-0.084695,-9.875878,6.0,250.0,2019-07-10,R1,424.418628


In [62]:
X_test

Unnamed: 0,flight,time,wind_speed,battery_voltage,battery_current,position_x,position_y,position_z,velocity_x,velocity_y,velocity_z,linear_acceleration_x,linear_acceleration_y,linear_acceleration_z,speed,payload,date,route,power
104695,146,171.2,0.7,23.678783,-0.031807,-79.782770,40.458982,272.091840,0.002206,-0.004930,-0.007619,0.329245,-0.123493,-9.808667,10.0,0.0,2019-07-01,R1,-0.753153
247845,270,75.4,11.8,21.839521,20.563288,-79.781571,40.459265,338.458030,0.005843,10.066872,2.320265,-1.913836,0.900763,-9.158969,10.0,250.0,2019-09-19,R6,449.092363
78438,124,89.2,8.3,21.400051,22.694363,-79.782425,40.458106,291.703944,-0.659465,7.321277,2.789932,-1.310707,0.703210,-10.123527,8.0,0.0,2019-06-25,R1,485.660521
66762,114,124.7,5.4,22.615791,15.338972,-79.782803,40.458910,308.837353,-0.095876,-0.002004,-2.967188,0.367743,0.343481,-9.711419,10.0,0.0,2019-06-24,R1,346.902992
130519,168,26.6,3.6,21.461401,29.914574,-79.782800,40.458983,287.004793,0.069157,0.126321,3.026384,0.479494,-0.380542,-9.673614,10.0,250.0,2019-07-09,R1,642.008661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159367,191,109.1,12.7,20.913004,25.517242,-79.782542,40.458447,365.554930,-0.005163,12.218443,2.762213,-2.456932,0.003729,-9.005128,12.0,250.0,2019-07-15,R1,533.642191
161495,193,76.3,1.8,20.867929,26.312420,-79.781342,40.459578,316.994463,0.147167,-0.229235,0.354144,0.072773,-0.134515,-9.938290,4.0,500.0,2019-07-15,R1,549.085722
166029,196,124.5,1.5,22.245186,20.642805,-79.782772,40.459033,278.853075,-0.157131,-0.000604,-0.979230,0.083970,0.315918,-9.888404,12.0,250.0,2019-07-15,R1,459.203036
166635,196,215.4,0.5,23.663759,0.015904,-79.782759,40.459038,271.730090,0.000556,-0.002574,0.020313,-0.083440,0.289899,-9.804312,12.0,250.0,2019-07-15,R1,0.376338


In [63]:
y_train

236062    511.488303
133742      5.740026
170336     -2.482823
228265    390.241929
34135     321.393895
             ...    
119879    594.929939
103694    597.268092
131932    468.969489
146867    424.418628
121958    480.353862
Name: power, Length: 172790, dtype: float64

In [64]:
y_test

104695     -0.753153
247845    449.092363
78438     485.660521
66762     346.902992
130519    642.008661
             ...    
159367    533.642191
161495    549.085722
166029    459.203036
166635      0.376338
175809     -4.334873
Name: power, Length: 85106, dtype: float64