In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
sys.path.append("../src/")
import data_preparation as dp
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import TargetEncoder
from sklearn.preprocessing import LabelEncoder


## préparation des données

In [16]:
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

dataprep = dp.DataPreparation(df_train, df_test)
clean_train, clean_test = train_clean, test_clean = dataprep.prepare_data()

  df_train = pd.read_csv("../data/train.csv")
  df_test = pd.read_csv("../data/test.csv")


Valeurs manquantes du train supprimées ✅
Valeurs manquantes du test supprimées ✅
Variables renommées ✅
Valeurs manquantes numériques imputées ✅
Valeurs manquantes catégorielles imputées ✅


In [17]:
clean_train.to_pickle("../data/clean_train.p")
clean_test.to_pickle("../data/clean_test.p")

In [18]:
df_train.to_pickle("../data/train.p")
df_test.to_pickle("../data/test.p")

## Prédictions avec une RF
Métrique à minimiser : MAE \
Variable cible : Ewltp 

In [67]:
clean_train = pd.read_pickle("../data/clean_train.p")
clean_test = pd.read_pickle("../data/clean_test.p")

## Etape 1 : recherche des meilleures variables explicatives

### 1.1 : Encodage des variables catégorielles --> label encoding

In [11]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
cat_col = clean_train.select_dtypes(include=['object'])
for col in cat_col:
    clean_train[col] = encoder.fit_transform(clean_train[col])

In [13]:
clean_train.drop(columns=["date"], inplace=True)

In [15]:
clean_train.head()

Unnamed: 0,ID,Country,VFN,Mh,Man,Tan,T,Va,Ve,Mk,...,Ewltp (g/km),W (mm),At1 (mm),At2 (mm),Ft,Fm,ec (cm3),ep (KW),Fuel consumption,Electric range (km)
0,0,5,5007,33,35,4266,445,4099,1702,168,...,401.053306,2700.0,1571.0,1576.0,8,3,999.0,92.0,5.6,0.0
1,1,10,7077,92,100,3401,310,1126,10280,639,...,394.684459,2552.0,1500.0,1483.0,8,4,999.0,70.0,5.5,0.0
2,2,10,7066,92,100,3401,310,1126,8292,639,...,398.561951,2552.0,1500.0,1483.0,8,4,999.0,70.0,5.6,0.0
3,3,5,2324,47,49,5295,398,1629,6397,334,...,479.470056,2650.0,1555.0,1563.0,8,4,1591.0,150.0,6.8,0.0
4,4,10,7100,92,100,3401,310,1124,8299,639,...,421.85482,2552.0,1500.0,1483.0,8,4,999.0,81.0,5.9,0.0


### 1.2 : corrélation des variables

On va garder uniquement les variables avec une corrélation ≥ 0

In [37]:
correlations = clean_train.corr()["Ewltp (g/km)"]
positive_correlations = correlations[abs(correlations) > 0.2].index.tolist()
clean_train_filtered = clean_train[positive_correlations]

In [38]:
clean_train_filtered = clean_train_filtered.drop("")

Unnamed: 0,m (kg),Mt,Ewltp (g/km),Fm,Fuel consumption,Electric range (km)
0,1387.0,1534.000000,401.053306,3,5.600000,0.0
1,1172.0,1337.000000,394.684459,4,5.500000,0.0
2,1204.0,1335.000000,398.561951,4,5.600000,0.0
3,1438.0,1588.000000,479.470056,4,6.800000,0.0
4,1207.0,1350.000000,421.854820,4,5.900000,0.0
...,...,...,...,...,...,...
7571644,1815.0,2006.000000,481.847421,3,5.900000,0.0
7571645,1815.0,2028.000000,486.397483,3,5.900000,0.0
7571646,1815.0,2006.000000,487.868524,3,5.900000,0.0
7571647,1240.0,1341.530935,350.795221,0,6.200000,0.0


## Etape 2 : application du modèle

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
X = clean_train_filtered.drop("Ewltp (g/km)", axis=1)
y = clean_train_filtered["Ewltp (g/km)"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
print("split fait")
model = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=0)
model.fit(X_train, y_train)
print("fit fait")

split fait
fit fait


In [45]:
y_pred = model.predict(X_test)
print("predict fait")
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

predict fait
Mean Absolute Error: 4.207929104611683


In [20]:
clean_train.columns

Index(['ID', 'Country', 'VFN', 'Mh', 'Man', 'Tan', 'T', 'Va', 'Ve', 'Mk', 'Cn',
       'Ct', 'Cr', 'm_(kg)', 'Mt', 'Ewltp_(g/km)', 'W_(mm)', 'At1_(mm)',
       'At2_(mm)', 'Ft', 'Fm', 'ec_(cm3)', 'ep_(KW)', 'Fuel_consumption_',
       'Electric_range_(km)'],
      dtype='object')

## encodage avec target encoding

In [21]:
from sklearn.preprocessing import TargetEncoder

encoder = TargetEncoder()
# clean_train.drop("Country", axis=1,inplace = True)
cat_col = clean_train.select_dtypes(include=['object'])
for col in cat_col:
    clean_train[col] = encoder.fit_transform(clean_train[col].values.reshape(-1,1), clean_train["Ewltp_(g/km)"])

In [23]:
correlations = clean_train.corr()["Ewltp_(g/km)"]
correlations

ID                    -0.005225
Country                0.322167
VFN                    0.981224
Mh                     0.416739
Man                    0.412072
Tan                    0.807614
T                      0.779785
Va                     0.967404
Ve                     0.969045
Mk                     0.406393
Cn                     0.886888
Ct                     0.125558
Cr                     0.125341
m_(kg)                -0.235782
Mt                    -0.200032
Ewltp_(g/km)           1.000000
W_(mm)                 0.053375
At1_(mm)               0.062441
At2_(mm)               0.062847
Ft                     0.896471
Fm                     0.895119
ec_(cm3)              -0.037571
ep_(KW)                0.033950
Fuel_consumption_      0.778745
Electric_range_(km)   -0.628966
Name: Ewltp_(g/km), dtype: float64

In [25]:
CORR_THRESHOLD = 0.6
correlations = clean_train.corr()["Ewltp_(g/km)"]
positive_correlations = correlations[abs(correlations) > CORR_THRESHOLD].index.tolist()
clean_train_filtered = clean_train[positive_correlations]

In [26]:
X = clean_train_filtered.drop("Ewltp_(g/km)", axis=1)
y = clean_train_filtered["Ewltp_(g/km)"]

In [27]:
print(X.columns)

Index(['VFN', 'Tan', 'T', 'Va', 'Ve', 'Cn', 'Ft', 'Fm', 'Fuel_consumption_',
       'Electric_range_(km)'],
      dtype='object')


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
print("split fait")
model_rf = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=0)

print("fit en cours... ⏳")
model_rf.fit(X_train, y_train)
print("fit terminé 🎉")


split fait
fit en cours... ⏳


KeyboardInterrupt: 

In [None]:
y_pred = model.predict(X_test)
print("predict fait")
mae = mean_absolute_error(y_test, y_pred)
print("MAE de la RF:", mae)

predict fait
Mean Absolute Error: 3.1391001105850145


---- NE PAS FAIRE .FIT SUR LE TEST MAIS JUSTE .TRANSFORM

## 3. Utilisation de XGBoost

In [36]:
from sklearn.ensemble import GradientBoostingRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

encoder = TargetEncoder()
# clean_train.drop("Country", axis=1,inplace = True)
cat_col = clean_train.select_dtypes(include=['object'])
for col in cat_col:
    X_train[col] = encoder.fit_transform(X_train[col].values.reshape(-1,1), y_train)
    X_test[col] = encoder.transform(X_test[col].values.reshape(-1,1))

N_ESTIMATORS = 100
MAX_DEPTH = 10
N_ITERATION = 100
xgboost = GradientBoostingRegressor(n_estimators=100, max_depth=10,random_state=20)
print("fit en cours... ⏳")
xgboost.fit(X_train,y_train)
print("fit terminé 🎉")
print("predict en cours... ⏳")
y_pred = xgboost.predict(X_test)
print("predict effectué 🎉")
mae_xgb = mean_absolute_error(y_test, y_pred)
print(f"MAE de XBG : {mae_xgb}")

fit en cours... ⏳
fit terminé 🎉
predict en cours... ⏳
predict effectué 🎉
MAE de XBG : 3.671670211444859


In [43]:
N_ESTIMATORS = 100
MAX_DEPTH = 10
N_ITERATION = 100
parameters_dictionnary = {"Model" : "XGBoost", 
                          "N_ESTIMATORS" : N_ESTIMATORS, 
                          'MAX_DEPTH' : MAX_DEPTH,
                          "N_ITERATION" : N_ITERATION,
                          "MAE" : mae_xgb}

In [70]:
import os
import json
from joblib import dump

if not os.path.exists("../artefacts/"):
    os.makedirs("../artefacts/")

if not os.path.exists(f"../artefacts/XGB_{round(mae_xgb,2)}"):
    os.makedirs(f"../artefacts/XGB_{round(mae_xgb,2)}")

with open(f"../artefacts/XGB_{round(mae_xgb,2)}/parameters", "w") as json_file:
    json.dump(parameters_dictionnary, json_file)

dump(xgboost, f"../artefacts/XGB_{round(mae_xgb,2)}/RF.joblib")

['../artefacts/XGB_3.67/RF.joblib']

In [45]:
clean_test

Unnamed: 0,ID,Country,VFN,Mh,Man,Tan,T,Va,Ve,Mk,...,Mt,W_(mm),At1_(mm),At2_(mm),Ft,Fm,ec_(cm3),ep_(KW),Fuel_consumption_,Electric_range_(km)
0,8000000,FR,IP-DGY____EAT82552-VR3-0,PSA,PSA AUTOMOBILES SA,e9*2018/858*11066*03,N,D,DGYP-A1C000,CITROEN,...,1888.0,2785.0,1600.0,1605.0,PETROL/ELECTRIC,P,1598.000000,132.0,1.300000,59.0
1,8000001,IE,IP-050669-NLH-1,HYUNDAI ASSAN,HYUNDAI ASSAN OTOMOTIV SANAYI VE TICARET AS,e5*2007/46*0121*01,BC3,B5P51,D71CZ2,HYUNDAI,...,1297.0,2580.0,1531.0,1536.0,PETROL,M,998.000000,73.0,4.700000,0.0
2,8000002,IE,IP-050669-NLH-1,HYUNDAI ASSAN,HYUNDAI ASSAN OTOMOTIV SANAYI VE TICARET AS,e5*2007/46*0121*01,BC3,B5P51,D71CZ2,HYUNDAI,...,1297.0,2580.0,1531.0,1536.0,PETROL,M,998.000000,73.0,4.700000,0.0
3,8000003,IE,IP-050669-NLH-1,HYUNDAI ASSAN,HYUNDAI ASSAN OTOMOTIV SANAYI VE TICARET AS,e5*2007/46*0121*01,BC3,B5P51,D71CZ2,HYUNDAI,...,1297.0,2580.0,1531.0,1536.0,PETROL,M,998.000000,73.0,4.700000,0.0
4,8000004,IE,IP-050669-NLH-1,HYUNDAI ASSAN,HYUNDAI ASSAN OTOMOTIV SANAYI VE TICARET AS,e5*2007/46*0121*01,BC3,B5P51,D71CZ2,HYUNDAI,...,1297.0,2580.0,1531.0,1536.0,PETROL,M,998.000000,73.0,4.700000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1892908,9892908,BE,IP-JBA1MMP010A_001-VF1-1,RENAULT,RENAULT SAS,e2*2007/46*0684*14,RJB,HH2,MM4UA0EA5010,RENAULT,...,1698.0,2639.0,1554.0,1541.0,PETROL/ELECTRIC,P,1598.000000,68.0,1.300000,49.0
1892909,9892909,BE,IP-2021_4038-W1K-1,MERCEDES-BENZ AG,MERCEDES-BENZ AG,e1*2007/46*1909*14,F2B,OJEQZ2,ZZAAA53A,MERCEDES-BENZ,...,2121.0,2729.0,1608.0,1611.0,ELECTRIC,E,2086.174556,140.0,0.000000,421.0
1892910,9892910,BE,IP-MEB31AZ_A0_0996-WVW-1,VOLKSWAGEN,VOLKSWAGEN AG,e1*2018/858*00004*06,E2,4ACEBJAL1FX2,OPE1MH0020M1ASA,VOLKSWAGEN VW,...,2260.0,2771.0,1587.0,1562.0,ELECTRIC,E,2155.314657,150.0,0.000000,511.0
1892911,9892911,BE,IP-0153-JT1-1,TOYOTA MOTOR CORPORATION,TOYOTA MOTOR CORPORATION,e6*2018/858*00013*01,XPB1F(M),MXPJ10(H),MXPJ10L-BHXGBW(3B),TOYOTA,...,1412.0,2560.0,1520.0,1520.0,PETROL,H,1490.000000,68.0,5.000000,0.0


In [58]:
X_test

Unnamed: 0,VFN,Tan,T,Va,Ve,Cn,Ft,Fm,Fuel_consumption_,Electric_range_(km)
3431455,541.418122,504.126596,454.180085,526.589783,548.031617,453.198827,456.840995,439.290394,6.700000,0.0
3298816,398.443471,340.752725,400.613635,380.349851,426.018143,425.750828,456.840995,439.248888,5.059801,0.0
4845920,374.302058,402.097724,399.685609,374.240243,372.990767,390.985805,420.691181,439.290394,4.500000,0.0
4882133,332.150478,359.062440,361.950500,338.934282,331.249879,352.154618,330.770257,439.227978,5.300000,0.0
1106237,333.151271,343.637431,345.905785,333.512191,325.445938,350.409229,420.672344,408.508428,4.900000,0.0
...,...,...,...,...,...,...,...,...,...,...
4182108,154.136568,237.174374,340.501093,141.765253,387.740080,148.721818,108.955562,109.046594,2.000000,50.0
6213360,343.453133,345.150458,352.539151,350.777765,342.157056,356.929643,420.672344,408.508428,4.800000,0.0
4640688,0.070540,0.087334,0.005997,0.064568,0.036339,0.171816,0.002392,0.003680,0.000000,402.0
871141,769.810976,586.000099,582.582361,758.556071,757.903577,510.877271,420.626934,408.508428,10.700000,0.0


In [69]:
from sklearn.preprocessing import TargetEncoder
var_explicatives = ['VFN', 'Tan', 'T', 'Va', 'Ve', 'Cn', 'Ft', 'Fm', 'Fuel_consumption_', 'Electric_range_(km)']
test_rf = clean_test[var_explicatives]


encoder = TargetEncoder()
cat_col = clean_train[var_explicatives].select_dtypes(include="object")
for col in cat_col:
    X_train[col] = encoder.fit_transform(X_train[col].values.reshape(-1,1),y_train)
    X_test[col] = encoder.transform(X_test[col].values.reshape(-1,1))
    test_rf[col] = encoder.transform(test_rf[col].values.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_rf[col] = encoder.transform(test_rf[col].values.reshape(-1,1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_rf[col] = encoder.transform(test_rf[col].values.reshape(-1,1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_rf[col] = encoder.transform(test_rf[col].values.reshape(-1,1))
A

0
1
2
3
4
...
7571644
7571645
7571646
7571647
7571648


In [74]:
prediction = xgboost.predict(test_rf)

submission = clean_test[["ID"]].copy()  
submission["Ewltp (g/km)"] = prediction
submission.to_csv("../data/sample_submission5.csv", index=False)

In [73]:
submission

Unnamed: 0,ID,Ewltp (g/km)
0,8000000,341.023465
1,8000001,341.103276
2,8000002,341.103276
3,8000003,341.103276
4,8000004,341.103276
...,...,...
1892908,9892908,339.940500
1892909,9892909,346.022273
1892910,9892910,346.022273
1892911,9892911,339.396284
