In [3]:
import pandas as pd
data = pd.read_csv(r"C:\Users\quent\Documents\GitHub\green_ia\machine_learning\ecoscore.csv", encoding='utf-8', sep='\t', low_memory=False)

In [4]:
# on supprime les colonnes dont le type n'est pas numérique
data = data.select_dtypes(include=['int64', 'float64'])

# on remplace les valeurs manquantes par la moyenne de la colonne
data = data.fillna(data.mean())

# on normalise les données
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

In [5]:
# on entraine plusieur modèles sur la prédiction de la valeur ecoscore_score et on print leur accuracy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.metrics import mean_squared_error

X = data.drop(columns=['ecoscore_score'])
y = data['ecoscore_score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("linearRegression RMSE:", rmse)

from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RandomForestRegressor RMSE:", rmse)


from sklearn.svm import SVR
model = SVR()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("SVR RMSE:", rmse)

from sklearn.neural_network import MLPRegressor
model = MLPRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("MLP RMSE:", rmse)

linearRegression RMSE: 20.683719817350315
RandomForestRegressor RMSE: 16.2030153962689
SVR RMSE: 23.97179529279924
MLP RMSE: 9431949653.957603


In [6]:
# on prédit les eco_score des 100 premiers produits avec le random forest et on fais la moyenne des écart avec la valeur réelle
model = RandomForestRegressor()
model.fit(X_train, y_train)
for i in range(100):
    print("ecoscore_score:", model.predict([X_test.iloc[i]])[0], "real:", y_test.iloc[i])



ecoscore_score: 74.81 real: 72.0
ecoscore_score: 74.76 real: 66.0
ecoscore_score: 73.75 real: 79.0
ecoscore_score: 50.79 real: 9.0
ecoscore_score: 55.58 real: 114.0
ecoscore_score: 72.55 real: 79.0
ecoscore_score: 20.43 real: 20.0
ecoscore_score: 51.21 real: 56.0
ecoscore_score: 76.52 real: 77.0
ecoscore_score: 83.12 real: 103.0
ecoscore_score: 24.69 real: 50.0
ecoscore_score: 27.64 real: 11.0
ecoscore_score: 61.15 real: 47.0
ecoscore_score: 49.32 real: 78.0
ecoscore_score: 42.98 real: 62.0
ecoscore_score: 70.19 real: 64.0
ecoscore_score: 81.56 real: 80.0
ecoscore_score: 84.93 real: 75.0
ecoscore_score: 66.81 real: 74.0
ecoscore_score: 60.03 real: 79.0
ecoscore_score: 65.92 real: 66.0
ecoscore_score: 55.38 real: 55.0
ecoscore_score: 34.4 real: 9.0
ecoscore_score: 77.9 real: 79.0
ecoscore_score: 72.72 real: 67.0
ecoscore_score: 42.31 real: 52.0
ecoscore_score: 61.83 real: 54.0
ecoscore_score: 57.78 real: 66.0
ecoscore_score: 62.55 real: 66.0
ecoscore_score: 68.56 real: 66.0
ecoscore_sco



ecoscore_score: 82.3 real: 79.0
ecoscore_score: 32.81 real: 20.0
ecoscore_score: 52.58 real: 61.0
ecoscore_score: 81.8 real: 77.0
ecoscore_score: 37.51 real: 41.0
ecoscore_score: 64.53 real: 58.0
ecoscore_score: 59.58 real: 74.0
ecoscore_score: 81.85 real: 77.0
ecoscore_score: 73.92 real: 73.0
ecoscore_score: 29.02 real: 18.0
ecoscore_score: 66.38 real: 56.0
ecoscore_score: 49.3 real: 63.0
ecoscore_score: 65.6 real: 105.0
ecoscore_score: 66.64 real: 66.0
ecoscore_score: 58.45 real: 29.0
ecoscore_score: 66.77 real: 52.0
ecoscore_score: 59.07 real: 64.0
ecoscore_score: 63.26 real: 69.0
ecoscore_score: 69.97 real: 42.0
ecoscore_score: 46.95 real: 44.0
ecoscore_score: 81.06 real: 79.0
ecoscore_score: 55.74 real: 66.0
ecoscore_score: 52.43 real: 35.0
ecoscore_score: 50.78 real: 55.0
ecoscore_score: 27.7 real: 36.0
ecoscore_score: 73.34 real: 72.0
ecoscore_score: 59.34 real: 62.0
ecoscore_score: 53.06 real: 47.0
ecoscore_score: 52.28 real: 55.0
ecoscore_score: 53.94 real: 66.0
ecoscore_score



In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the random forest regressor
rf = RandomForestRegressor()

# Perform grid search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", -grid_search.best_score_)

In [None]:
# Retrain the model with the best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

In [None]:
data_test = pd.read_csv(r"C:\Users\quent\Documents\GitHub\green_ia\machine_learning\ecoscore_test.csv", encoding='utf-8', sep='\t', low_memory=False)

NameError: name 'pd' is not defined

In [None]:
data_test_without_ecoscore = data_test.select_dtypes(include=['int64', 'float64'])
# on remplace les valeurs manquantes par la moyenne de la colonne
data_test_without_ecoscore = data.fillna(data.mean())

In [None]:
data.info()

In [None]:
data_test_without_ecoscore.info()

NameError: name 'data_test_without_ecoscore' is not defined

In [None]:
for i in range(30):
    print("ecoscore_score:", best_model.predict([data_test_without_ecoscore.iloc[i]])[0], "real:", data_test['ecoscore_score'].iloc[i])



ValueError: X has 22 features, but RandomForestRegressor is expecting 21 features as input.