In [2]:
from src.modules.bear_cleaner import *
from src.modules.models import *
from sklearn.model_selection import train_test_split

# Importation des données

In [3]:
df = data_model("./data/vins.json")
df = df[0:100]

# Choix des variables expliqués et explicatives 

In [4]:
X = df[["capacity", "keyword_1", "keyword_2", "keyword_3", "millesime", "cepage", "par_gouts", "service",
        "avg_temp","conservation_date", "bio", "customer_fav", "is_new", "top_100", "destock", "sulphite_free",
        "alcohol_volume", "type","country", "bubbles", "wine_note", "nb_reviews"]]
y = df["unit_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Modèles

In [5]:
model = train_model(X_train, y_train)



In [6]:
model_result(knn=model["model_knn"],svm=model["model_svm"],mlp=model["model_mlp"],
             rf=model["model_rf"],boost=model["model_boost"],ridge=model["model_ridge"])

+--------+--------+-------+
| Modèle | Score  |   SD  |
+--------+--------+-------+
|  knn   | 0.004  | 0.285 |
|  svm   | 0.116  | 0.104 |
|  mlp   | -0.211 | 0.999 |
|   rf   | -0.052 | 0.455 |
| boost  | -0.038 | 0.287 |
| ridge  | -0.506 | 1.118 |
+--------+--------+-------+


Il faut bien mettre "entrainement__nom_du_parametre_optimisé"

In [7]:
model_param(model["model_mlp"], 
            "imputation__strategy",
            "entrainement__hidden_layer_sizes",
            "entrainement__solver")

+----------------------------------+----------+
|            Parameter             |  Value   |
+----------------------------------+----------+
|       imputation__strategy       |  median  |
| entrainement__hidden_layer_sizes | (60, 60) |
|       entrainement__solver       |   adam   |
+----------------------------------+----------+


Remember (ne noter que si on trouve mieux) : 
Meilleurs modèles :
- MLP - 0.419 imputation__strategy  : median  | entrainement__hidden_layer_sizes | (60, 60) | entrainement__solver : adam 
- RF - 0.422 - imputation__strategy : mean | entrainement__n_estimators :  20  |  entrainement__max_depth :  9 

# Création DF pour résultat

In [8]:
def score(model):
    indice_meilleur =model.cv_results_["rank_test_score"].argmin()
    return round(model.cv_results_["mean_test_score"][indice_meilleur], 3)

In [9]:
def ecart_type(model):
    indice_meilleur =model.cv_results_["rank_test_score"].argmin()
    return round(model.cv_results_["std_test_score"][indice_meilleur], 3)

In [10]:
def parametre(model) :
    indice_meilleur = model.cv_results_["rank_test_score"].argmin()
    return str(model.cv_results_["params"][indice_meilleur])

In [11]:
def stockage_result_csv(model):
    ml = {"Modèle": ["Random Forest", "K Neighbors",
                    "Réseaux de neurones", "Boosting",
                    "Ridge", "Support Vector"],
        "Score": [score(model["model_rf"]), score(model["model_knn"]),
                    score(model["model_mlp"]), score(model["model_boost"]),
                    score(model["model_ridge"]), score(model["model_svm"])],
        "Ecart-Type" : [ecart_type(model["model_rf"]), ecart_type(model["model_knn"]),
                    ecart_type(model["model_mlp"]), ecart_type(model["model_boost"]),
                    ecart_type(model["model_ridge"]), ecart_type(model["model_svm"])],
        "Paramètres" : [parametre(model["model_rf"]), parametre(model["model_knn"]),
                    parametre(model["model_mlp"]), parametre(model["model_boost"]),
                    parametre(model["model_ridge"]), parametre(model["model_svm"])]}
    ml = pl.DataFrame(ml)
    ml.write_csv("./data/result_ml.csv", separator=",")
    return print("C'est bon ça a marché")

In [12]:
stockage_result_csv(model)

C'est bon ça a marché


In [13]:
pl.read_csv('./data/result_ml.csv')

Modèle,Score,Ecart-Type,Paramètres
str,f64,f64,str
"""Random Forest""",-0.052,0.455,"""{'entrainement…"
"""K Neighbors""",0.004,0.285,"""{'entrainement…"
"""Réseaux de neu…",-0.211,0.999,"""{'entrainement…"
"""Boosting""",-0.038,0.287,"""{'entrainement…"
"""Ridge""",-0.506,1.118,"""{'entrainement…"
"""Support Vector…",0.116,0.104,"""{'entrainement…"


# AUTRES

In [104]:
df = pl.read_json("./data/vins.json")
df = super_pipe(df)
df = df.filter(pl.col("unit_price").is_not_null())

In [84]:
variables = ["capacity", "unit_price","millesime", "avg_temp", "conservation_date",
           "bio","customer_fav", "is_new","top_100","destock","sulphite_free", "alcohol_volume",
           "bubbles"]

In [85]:
df_2 = df[variables].drop_nulls()

In [103]:
import plotly.figure_factory as ff
import numpy as np

fig = ff.create_annotated_heatmap(z=np.array(df_2.corr()), x=variables, y=variables,
                                  annotation_text = np.around(np.array(df_2.corr()), decimals=2),
                                  colorscale='Cividis')
fig.update_layout(title_text='Matrice de corrélation')

# Affichage du graphique
fig.show()