In [127]:
from src.modules.bear_cleaner import *
from src.modules.models import *
from sklearn.model_selection import train_test_split

# Importation des données

In [128]:
# Load the wine dataset through data_model (not defined in this notebook;
# presumably provided by the star imports above — confirm).
df = data_model("./data/vins.json")
# Keep only the first 100 rows: every model below is trained and scored on this subset.
# NOTE(review): presumably done to keep the grid searches fast — confirm before reporting results.
df = df[0:100]

# Choix de la variable expliquée et des variables explicatives

In [129]:
# Explanatory variables (features) used by the regression models.
X = df[["capacity", "keyword_1", "keyword_2", "keyword_3", "millesime", "cepage", "par_gouts", "service",
        "avg_temp","conservation_date", "bio", "customer_fav", "is_new", "top_100", "destock", "sulphite_free",
        "alcohol_volume", "type","country", "bubbles", "wine_note", "nb_reviews"]]
# Explained variable (target): the `unit_price` column.
y = df["unit_price"]
# Hold out 20 % of the rows. The fixed seed makes the split, and therefore every
# score computed below, reproducible across "Restart Kernel -> Run All".
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Modèles

In [130]:
# Fit all candidate models on the training split. The returned object is indexed by
# model name in the cells below ("model_knn", "model_svm", "model_mlp", "model_rf",
# "model_boost", "model_ridge").
# NOTE(review): the warning printed under this cell says a stochastic optimizer reached
# its 1000-iteration cap without converging — presumably the MLP; confirm and adjust.
model = train_model(X_train, y_train)


Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.



In [131]:
# Print the comparison table (score and standard deviation) for the six fitted models.
model_result(knn=model["model_knn"],svm=model["model_svm"],mlp=model["model_mlp"],
             rf=model["model_rf"],boost=model["model_boost"],ridge=model["model_ridge"])

+--------+-------+-------+
| Modèle | Score |   SD  |
+--------+-------+-------+
|  knn   | 0.178 | 0.203 |
|  svm   | 0.273 | 0.165 |
|  mlp   | 0.102 | 0.272 |
|   rf   | 0.153 | 0.116 |
| boost  |  0.1  | 0.223 |
| ridge  | 0.105 | 0.144 |
+--------+-------+-------+


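Pour afficher un hyperparamètre, il faut le préfixer par le nom de l'étape du pipeline : "entrainement__nom_du_parametre_optimisé" (par exemple "entrainement__solver"), ou "imputation__…" pour l'étape d'imputation.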

In [132]:
# Display the values of the listed parameters for the MLP model (see the table below).
# Parameter names use the "<pipeline step>__<parameter>" form, e.g. "entrainement__solver".
model_param(model["model_mlp"], 
            "imputation__strategy",
            "entrainement__hidden_layer_sizes",
            "entrainement__solver")

+----------------------------------+--------+
|            Parameter             | Value  |
+----------------------------------+--------+
|       imputation__strategy       |  mean  |
| entrainement__hidden_layer_sizes | (100,) |
|       entrainement__solver       |  adam  |
+----------------------------------+--------+


Remember (ne noter que si on trouve mieux) : 
Meilleurs modèles :
- MLP - 0.419 - imputation__strategy : median | entrainement__hidden_layer_sizes : (60, 60) | entrainement__solver : adam
- RF - 0.422 - imputation__strategy : mean | entrainement__n_estimators : 20 | entrainement__max_depth : 9

# Création DF pour résultat

In [133]:
def score(model):
    """Return the best mean cross-validation score, rounded to 3 decimals.

    Args:
        model: fitted search object exposing ``cv_results_`` with the
            ``"rank_test_score"`` and ``"mean_test_score"`` entries.

    Returns:
        The mean test score of the best-ranked parameter combination
        (the first one when several share the lowest rank).
    """
    results = model.cv_results_
    # The lowest rank marks the best candidate.
    best_index = results["rank_test_score"].argmin()
    best_mean = results["mean_test_score"][best_index]
    return round(best_mean, 3)

In [134]:
def ecart_type(model):
    """Return the standard deviation of the best candidate's test score, rounded to 3 decimals.

    Args:
        model: fitted search object exposing ``cv_results_`` with the
            ``"rank_test_score"`` and ``"std_test_score"`` entries.

    Returns:
        The test-score standard deviation of the best-ranked parameter
        combination (the first one when several share the lowest rank).
    """
    results = model.cv_results_
    # The lowest rank marks the best candidate.
    best_index = results["rank_test_score"].argmin()
    best_std = results["std_test_score"][best_index]
    return round(best_std, 3)

In [135]:
def parametre(model):
    """Return the best candidate's parameter set as a string.

    Args:
        model: fitted search object exposing ``cv_results_`` with the
            ``"rank_test_score"`` and ``"params"`` entries.

    Returns:
        ``str()`` of the parameters of the best-ranked combination
        (the first one when several share the lowest rank).
    """
    results = model.cv_results_
    # The lowest rank marks the best candidate.
    best_index = results["rank_test_score"].argmin()
    best_params = results["params"][best_index]
    return str(best_params)

In [136]:
def stockage_result_csv(model, path="./data/result_ml.csv"):
    """Summarise the fitted models in a table and write it to a CSV file.

    One row is written per model with its best score, the standard deviation
    of that score and the corresponding parameters (as a string).

    Args:
        model: mapping holding the fitted searches under the keys "model_rf",
            "model_knn", "model_mlp", "model_boost", "model_ridge" and
            "model_svm".
        path: destination CSV file. Defaults to "./data/result_ml.csv", the
            location that was previously hard-coded.

    Returns:
        None. A confirmation message is printed once the file is written.
    """
    # Display label -> key in `model`. Insertion order defines the row order,
    # and keeping the pairs together prevents the columns from drifting apart.
    labels = {
        "Random Forest": "model_rf",
        "K Neighbors": "model_knn",
        "Réseaux de neurones": "model_mlp",
        "Boosting": "model_boost",
        "Ridge": "model_ridge",
        "Support Vector": "model_svm",
    }
    searches = [model[key] for key in labels.values()]

    ml = pl.DataFrame(
        {
            "Modèle": list(labels),
            "Score": [score(search) for search in searches],
            "Ecart-Type": [ecart_type(search) for search in searches],
            "Paramètres": [parametre(search) for search in searches],
        }
    )
    ml.write_csv(path, separator=",")
    print("C'est bon ça a marché")

In [137]:
# Write the model summary to ./data/result_ml.csv; the function prints a confirmation message.
stockage_result_csv(model)

C'est bon ça a marché


In [138]:
# Read the CSV back to check what was written.
pl.read_csv('./data/result_ml.csv')

Modèle,Score,Ecart-Type,Paramètres
str,f64,f64,str
"""Random Forest""",0.153,0.116,"""{'entrainement…"
"""K Neighbors""",0.178,0.203,"""{'entrainement…"
"""Réseaux de neu…",0.102,0.272,"""{'entrainement…"
"""Boosting""",0.1,0.223,"""{'entrainement…"
"""Ridge""",0.105,0.144,"""{'entrainement…"
"""Support Vector…",0.273,0.165,"""{'entrainement…"


# AUTRES

In [139]:
# Reload the raw JSON with polars; this rebinds `df`, replacing the frame used above.
df = pl.read_json("./data/vins.json")
# super_pipe is not defined in this notebook — presumably the project's cleaning
# pipeline brought in by the star imports; confirm.
df = super_pipe(df)
# Keep only the rows whose unit_price is not null.
df = df.filter(pl.col("unit_price").is_not_null())

In [140]:
# Column names used to subset `df` in the next cell.
variables = ["capacity", "unit_price","millesime", "avg_temp", "conservation_date",
           "bio","customer_fav", "is_new","top_100","destock","sulphite_free", "alcohol_volume",
           "bubbles"]

In [141]:
# Keep only the columns listed in `variables`, then drop every row containing a null.
# NOTE(review): df_2 is not used in the visible remainder of the notebook — confirm it is still needed.
df_2 = df[variables].drop_nulls()

# CLASSIFICATION

Je dois transformer mes variables qualitatives en variables quantitatives avec LabelEncoder(), qui attribue un nombre à chaque modalité.

EDIT : => CHANGEMENT 

Je fais un MinMaxScaler() (ce qui entraîne normalement une augmentation de mon score), mais il modifie également mes variables qualitatives : je les passe donc en binaire pour éviter que le MinMaxScaler() ne les touche.

In [267]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import polars as pl

In [268]:
# Start again from the raw JSON for the classification task; this rebinds `df`.
df = pl.read_json("./data/vins.json")
# super_pipe is not defined in this notebook — presumably the project's cleaning
# pipeline brought in by the star imports; confirm.
df = super_pipe(df)
# The wine type is the classification target, so rows without it are dropped.
df = df.filter(pl.col("type").is_not_null())

In [269]:
# Keep the feature columns plus the target column "type" (listed last).
df = df.select("capacity", "unit_price","millesime", "cepage", "par_gouts",
          "service", "avg_temp", "conservation_date", "bio", "customer_fav", 
          "is_new", "top_100", "destock", "sulphite_free", "alcohol_volume",
          "country", "bubbles", "wine_note", "nb_reviews", "conservation_time",
          "type")

In [270]:
def prep_str(df, categorical_cols=("cepage", "par_gouts", "service", "country")):
    """One-hot encode the text columns of a frame and return a pandas DataFrame.

    The input is converted to pandas, every column in ``categorical_cols`` is
    replaced by one binary column per distinct value (named by
    ``OneHotEncoder.get_feature_names_out``), and all other columns are kept.
    Per the original author's note, the data holds 32 countries, so "country"
    alone produces 32 binary columns.

    Args:
        df: object exposing ``to_pandas()`` (a polars DataFrame in this notebook).
        categorical_cols: names of the columns to encode. Defaults to the four
            text columns that were previously hard-coded.

    Returns:
        pandas.DataFrame holding the untouched columns followed by the binary
        indicator columns.
    """
    categorical_cols = list(categorical_cols)
    df_prep = df.to_pandas()

    encoder = OneHotEncoder()
    # The encoder returns a sparse matrix by default; densify it for pandas.
    encoded = encoder.fit_transform(df_prep[categorical_cols]).toarray()

    df_encoded = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(categorical_cols),
        # concat(axis=1) aligns rows on index labels, so reuse the source index
        # explicitly rather than relying on both frames having a default one.
        index=df_prep.index,
    )

    return pd.concat(
        [df_prep.drop(columns=categorical_cols), df_encoded], axis=1
    )


In [271]:
# Rebinds `df` from a polars frame to the one-hot-encoded pandas frame.
# Not idempotent: prep_str calls df.to_pandas() and drops the text columns,
# so re-running this cell alone fails — re-run from the reload cell instead.
df = prep_str(df)

In [272]:
# Features: every column except the class label.
X = df.drop(columns=['type'])
# Target: the "type" column.
y = df["type"]
# Hold out 20 % of the rows. The fixed seed makes the split, and therefore the
# score computed below, reproducible across "Restart Kernel -> Run All".
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [273]:
def model_knn(x_train, y_train):
    """Grid-search a k-nearest-neighbours classification pipeline.

    The pipeline chains three steps: "imputation" (SimpleImputer),
    "echelle" (MinMaxScaler) and "entrainement" (KNeighborsClassifier).

    Tuned parameters:
        - imputation__strategy: "mean", "median" or "most_frequent"
        - entrainement__n_neighbors: 2 to 14 inclusive

    Args:
        x_train: training features.
        y_train: training labels.

    Returns:
        The fitted GridSearchCV object.
    """
    steps = [
        ("imputation", SimpleImputer()),
        ("echelle", MinMaxScaler()),
        ("entrainement", KNeighborsClassifier()),
    ]
    grid = {
        "imputation__strategy": ["mean", "median", "most_frequent"],
        "entrainement__n_neighbors": range(2, 15),
    }
    # n_jobs=-1 runs the candidate fits on all available cores.
    search = GridSearchCV(estimator=Pipeline(steps), param_grid=grid, n_jobs=-1)
    search.fit(x_train, y_train)
    return search

In [274]:
# Best mean cross-validation score of the KNN grid search on the training split
# (GridSearchCV is given no `scoring`, so the estimator's default scorer applies).
# NOTE(review): X_test / y_test are not used here, so this is not a held-out test score.
score(model_knn(X_train, y_train))

0.959