In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append("../src/")
import data_preparation as dp
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.linear_model import LinearRegression
import warnings
from sklearn.metrics import mean_absolute_error
warnings.filterwarnings("ignore", category=FutureWarning)
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Import des données

In [2]:
train = pd.read_csv("../data/train_clean.csv")
test = pd.read_csv("../data/test_clean.csv")

In [3]:
preprocess = dp.DataPreparation(train, test)

In [4]:
train_clean, test_clean = preprocess.prepare_data()

Valeurs manquantes du train supprimées ✅
Valeurs manquantes du test supprimées ✅
Variables renommées ✅
Valeurs manquantes numériques imputées ✅
Valeurs manquantes catégorielles imputées ✅


In [5]:
X_train = train_clean.drop(columns = ["Ewltp_(g/km)", "ID"])
y_train = train_clean["Ewltp_(g/km)"]

KeyError: "['Ewltp (g/km)'] not found in axis"

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

In [None]:
numericals = train_clean.select_dtypes(exclude='object').columns.to_list()
categoricals = train_clean.select_dtypes(include='object').columns.to_list()

numericals.remove('ID')
numericals.remove('date')

# Choix des variables à inclure

In [None]:
y_corr = train[numericals].corr()
sns.heatmap(y_corr, annot=True)

In [None]:
correlation_vector = train[numericals].corr()["Ewltp (g/km)"][:]
correlation_vector = np.abs(correlation_vector)
correlation_vector = correlation_vector.sort_values(ascending=False)[1:]

In [None]:
var_explicatives = correlation_vector.index[:4].to_list()

# Modélisation

In [None]:
reg = LinearRegression().fit(X_train[var_explicatives], y_train)

In [None]:
pred = reg.predict(X_val[var_explicatives])

In [None]:
mae = mean_absolute_error(y_val, pred)

In [None]:
mae

# Test

In [None]:
test_pred = reg.predict(test_clean[var_explicatives])

In [None]:
sub = test[["ID", 'ec (cm3)']]

In [None]:
sub["Ewltp (g/km)"] = test_pred

In [None]:
sub.drop(columns=['ec (cm3)'], inplace = True)

In [None]:
sub.to_csv("../data/sample_submission2.csv", index=False)

# Correlation catégorielle

In [4]:
X_train = train.drop(columns=["Ewltp_(g/km)", "ID"])
y_train = train["Ewltp_(g/km)"]

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

In [6]:
train = train.rename(columns={"Ewltp_(g/km)": "Ewltp_g_km"})

In [9]:
col_categoricals = train.select_dtypes(include=["object"]).columns.to_list()

In [16]:
formula = 'Ewltp_g_km ~ ' + ' + '.join(['C(' + var + ')' for var in col_categoricals])

model = ols(formula, data=train.sample(10000)).fit()
aov_table = sm.stats.anova_lm(model, typ=2) 

print('--------------------------- Table Anova ---------------------------')
print(aov_table)

print('--------------------------- Eta Carré ---------------------------')
eta = [aov_table['sum_sq'][i] / sum(aov_table['sum_sq']) for i in range(len(col_categoricals))]
for i in range(len(col_categoricals)):
    print(f'{col_categoricals[i]} : {round(eta[i], 2)*100}')

--------------------------- Table Anova ---------------------------
                  sum_sq      df             F         PR(>F)
C(Country)  7.095331e+03    28.0      2.860691   7.519735e-07
C(VFN)      2.245715e+07  1771.0    143.150348   0.000000e+00
C(Mh)       4.673946e+06    64.0    824.441758   0.000000e+00
C(Man)      4.113907e+06    63.0    737.174193   0.000000e+00
C(Tan)      1.729563e+07  1586.0    123.108896   0.000000e+00
C(T)        1.183919e+07   380.0    351.717925   0.000000e+00
C(Va)       2.165030e+07  1343.0    181.988609   0.000000e+00
C(Ve)       2.563572e+07  3084.0     93.839867   0.000000e+00
C(Mk)       7.892996e+06    89.0   1001.170592   0.000000e+00
C(Cn)       2.149978e+07  1188.0    204.302598   0.000000e+00
C(Ct)       8.494822e+04     2.0    479.491058  1.524793e-190
C(Cr)       1.511453e+04     1.0    170.628254   2.422057e-38
C(Ft)       3.346387e+06     9.0   4197.491964   0.000000e+00
C(Fm)       8.207048e+06     5.0  18529.904644   0.000000e+00
Re

In [20]:
contrib = {}
for i in range(len(col_categoricals)):
    contrib[col_categoricals[i]] = round(eta[i], 2)*100

In [26]:
dict(sorted(contrib.items(), key=lambda item: item[1]))

{'Country': 0.0,
 'Ct': 0.0,
 'Cr': 0.0,
 'Ft': 2.0,
 'Mh': 3.0,
 'Man': 3.0,
 'Mk': 5.0,
 'Fm': 6.0,
 'T': 8.0,
 'Tan': 12.0,
 'Cn': 14.000000000000002,
 'VFN': 15.0,
 'Va': 15.0,
 'Ve': 17.0}

In [30]:
train["VFN"]

0          IP-C519_2022_00008-WF0-1
1          IP-MQB27ZZ_A2_0529-WVW-1
2          IP-MQB27ZZ_A1_0533-WVW-1
3               IP-04-U5Y-2018-1137
4          IP-MQB27ZZ_B2_0534-WVW-1
                     ...           
7571644    IP-MLB42AZ_B0_0685-WAU-1
7571645    IP-MLB42AZ_B0_0685-WAU-1
7571646    IP-MLB42AZ_B0_0685-WAU-1
7571647    IP-K14A1RTP6DAF_00-JN1-1
7571648        IP-20_GR6_0023-JHM-1
Name: VFN, Length: 7571649, dtype: object

# Feature Engineering

In [None]:
vehicles = pd.read_csv("../data/all-vehicles-model.csv", sep = ';')

In [None]:
vehicles

In [None]:
modeles = list(vehicles["Model"].unique())

In [None]:
import Levenshtein
from tqdm import tqdm
modeles2 = list(train_clean["Cn"].unique())
commun = {}
for element in tqdm(modeles2) : 
    commun[element]=[]
    similarity_score = {element2 : Levenshtein.ratio(element, element2) for element2 in modeles}
    for keys, value in similarity_score.items():
        if value > 0.6 :
            commun[element].append(vehicles[vehicles["Model"]==keys]["Vehicle Size Class"].to_list()[0])


In [None]:
commun

In [None]:
from statistics import mode

commun_red = {keys : mode(value)for keys,value in commun.items() if len(value) > 0}

In [None]:
len(commun_red)

In [None]:
train_clean["Cn"].nunique()

In [31]:
a = pd.read_csv("../data/train.csv")

  a = pd.read_csv("../data/train.csv")


In [42]:
a.loc[a["ec (cm3)"].isna()]["Ft"].value_counts()

Ft
ELECTRIC           1021754
HYDROGEN               981
NG                      12
PETROL/ELECTRIC          9
PETROL                   8
DIESEL                   1
Name: count, dtype: int64

In [41]:
a.loc[1185643]

ID                                  1185643
Country                                  ES
VFN                                     NaN
Mp                                      NaN
Mh                             OUT OF SCOPE
Man                     AUTOMOBILES CITROEN
MMS                                     NaN
Tan                     E49*2007/46*0024*12
T                                      PLA9
Va                                    2B2MS
Ve                                 3221CE3R
Mk                               GIOTTILINE
Cn                                      322
Ct                                       M1
Cr                                       M1
r                                         1
m (kg)                               2778.0
Mt                                      NaN
Enedc (g/km)                            NaN
Ewltp (g/km)                     847.146825
W (mm)                               3450.0
At1 (mm)                             1810.0
At2 (mm)                        