In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from flask import Flask, jsonify, request

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer, FunctionTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, balanced_accuracy_score, ConfusionMatrixDisplay

from lightgbm import LGBMClassifier

In [2]:
df = pd.read_csv("./data/wines_dataset.csv", sep = "|")

In [3]:
train, test = train_test_split(df, test_size= 0.1, random_state= 42)

In [4]:
target = "class"
train[target] = (train[target] == "white").astype(int)
X_train = train.drop(columns = target)
y_train = train[target].copy()

In [52]:
rf_selection = RandomForestClassifier(max_depth= 10, class_weight= "balanced")
rfe = RFE(rf_selection, n_features_to_select=4)
X_train_s = pd.DataFrame(rfe.fit_transform(X_train, y_train), columns= [f"Column_{i}" for i in range(4)])

In [10]:
X_train_array = rfe.fit_transform(X_train, y_train)
X_train_array

array([[1.20e-01, 3.40e-02, 9.90e+01, 5.70e-01],
       [5.50e-01, 7.60e-02, 4.00e+01, 5.90e-01],
       [1.70e-01, 9.30e-02, 1.36e+02, 4.90e-01],
       ...,
       [2.00e-01, 5.20e-02, 1.45e+02, 5.60e-01],
       [4.10e-01, 3.20e-02, 7.40e+01, 3.50e-01],
       [1.80e-01, 3.90e-02, 1.39e+02, 4.40e-01]], shape=(5847, 4))

In [56]:

model = LGBMClassifier(max_depth= 10, class_weight= "balanced", random_state= 42, verbose = 0)

pipeline = Pipeline(
    [("Imputer", SimpleImputer()),
     ("Model", model)]
)

pipeline.fit(X_train_s, y_train)

In [57]:
with open("wine_model.pkl", "wb") as model:
    pickle.dump(pipeline, model)

In [19]:
X_test = test.drop(columns=target).copy()
y_test = (test[target] == "white").astype(int)

In [20]:
X_test

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
3103,7.4,0.32,0.27,1.4,0.049,38.0,173.0,0.99335,3.03,0.52,9.3,5
1419,6.6,0.34,0.24,3.3,0.034,29.0,99.0,0.99031,3.10,0.40,12.3,7
4761,6.4,0.32,0.35,4.8,0.030,34.0,101.0,0.99120,3.36,0.60,12.5,8
4690,6.8,0.23,0.32,1.6,0.026,43.0,147.0,0.99040,3.29,0.54,12.5,6
4032,6.7,0.34,0.26,1.9,0.038,58.0,138.0,0.98930,3.00,0.47,12.2,7
...,...,...,...,...,...,...,...,...,...,...,...,...
4331,8.0,0.27,0.33,1.2,0.050,41.0,103.0,0.99002,3.00,0.45,12.4,6
259,5.2,0.44,0.04,1.4,0.036,43.0,119.0,0.98940,3.36,0.33,12.1,8
2364,7.0,0.23,0.33,5.8,0.040,25.0,136.0,0.99500,3.19,0.58,9.5,6
1760,7.1,0.38,0.29,13.6,0.041,30.0,137.0,0.99461,3.02,0.96,12.1,6


In [13]:
X_test.columns = [col.replace(" ", "_") for col in X_test.columns]

In [27]:
X_test = pd.DataFrame(rfe.transform(X_test), columns = [f"Column_{i}" for i in range(4)])

In [25]:
model = LGBMClassifier(max_depth= 10, class_weight= "balanced", random_state= 42, verbose = 0)
model.fit(X_train_array, y_train.values)

In [28]:
model.predict(X_test)

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,

In [23]:
y_pred = pipeline.predict(X_test_array)



In [60]:
def predict(X):
    with open("wine_model_2.pkl", "rb") as f:
        model = pickle.load(f)

    def prediction(X):
        pred = model.predict(X)
        return dict({f"prediction_{i}": pred[i].astype(np.int8) for i in range(len(X))})

    acidity = X[0][0]
    chlorides = X[0][1]
    so2 = X[0][2]
    sulphates = X[0][3]

    X = [acidity, chlorides, so2, sulphates]
    X = pd.DataFrame([X], columns = [f"Column_{i}" for i in range(4)])
    print(X)
    def type_processing(X):
        for col in X:
            X[col] = pd.to_numeric(X[col])
        return X
    
    X = type_processing(X)
    print(X.dtypes)

    # if len(X[X == None]) >= 2:
    #     cont = input("La mitad o más de los valores son faltantes, si quieres continuar pulsa 's', si no 'n'").lower()
    #     if cont == "s":
    #         return prediction(X)
    #     else:
    #         return "Te faltaban muchos datos y has decidido no hacer una predicción"
        
    return prediction(X)

In [61]:
predict([["0.12", "0.034", "99", "0.57"]])

  Column_0 Column_1 Column_2 Column_3
0     0.12    0.034       99     0.57
Column_0    float64
Column_1    float64
Column_2      int64
Column_3    float64
dtype: object


{'prediction_0': np.int8(1)}

In [37]:
def prediction(X):
    pred = model.predict(X)
    return dict({f"prediction_{i}": pred[i].astype(np.int8) for i in range(len(X))})


In [38]:
prediction(X_test)

{'prediction_0': np.int8(1),
 'prediction_1': np.int8(1),
 'prediction_2': np.int8(1),
 'prediction_3': np.int8(1),
 'prediction_4': np.int8(1),
 'prediction_5': np.int8(1),
 'prediction_6': np.int8(1),
 'prediction_7': np.int8(0),
 'prediction_8': np.int8(1),
 'prediction_9': np.int8(1),
 'prediction_10': np.int8(1),
 'prediction_11': np.int8(1),
 'prediction_12': np.int8(1),
 'prediction_13': np.int8(1),
 'prediction_14': np.int8(1),
 'prediction_15': np.int8(1),
 'prediction_16': np.int8(1),
 'prediction_17': np.int8(1),
 'prediction_18': np.int8(1),
 'prediction_19': np.int8(1),
 'prediction_20': np.int8(1),
 'prediction_21': np.int8(0),
 'prediction_22': np.int8(1),
 'prediction_23': np.int8(1),
 'prediction_24': np.int8(0),
 'prediction_25': np.int8(1),
 'prediction_26': np.int8(1),
 'prediction_27': np.int8(1),
 'prediction_28': np.int8(1),
 'prediction_29': np.int8(1),
 'prediction_30': np.int8(1),
 'prediction_31': np.int8(1),
 'prediction_32': np.int8(1),
 'prediction_33': np