<a href="https://colab.research.google.com/github/Fondaaaa/Templates/blob/main/Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer,OneHotEncoder,StandardScaler,FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


# Source data: penguins dataset, fetched over HTTP as a CSV file.
path = "https://frenzy86.s3.eu-west-2.amazonaws.com/python/penguins.csv"

# Load the raw table; the cells below split it into features and target.
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Optional: list the columns to remove and drop them before modelling, e.g.
# to_drop = ["column_name1", "column_name2", ...]
# df = df.drop(columns=to_drop)


In [None]:
# Target: the 'species' column. Features: every other column of df.
y = df.loc[:, 'species']
X = df.drop("species", axis="columns")

In [None]:
# Split the feature columns by dtype so each group can be routed to its own
# preprocessing branch.
# dtype.kind codes: 'i' = signed integer, 'u' = unsigned integer, 'f' = float.
# Unsigned integers are included so that e.g. uint8/uint32 columns are not
# mistaken for categorical data and one-hot encoded value by value.
# Every other kind (object/string, bool, datetime, ...) is treated as categorical.
numerical_features = [col for col, dtype in X.dtypes.items() if dtype.kind in 'iuf']
categorical_features = [col for col, dtype in X.dtypes.items() if dtype.kind not in 'iuf']

In [None]:
# Preprocess the features and fill in missing values.
#   * numerical columns: impute with the median, then discretise into 3 bins;
#   * categorical columns: impute with the constant 'missing', then one-hot
#     encode (first level dropped, categories unseen during fit are ignored).
preprocessor = make_column_transformer(
    (
        make_pipeline(
            SimpleImputer(strategy='median'),
            KBinsDiscretizer(n_bins=3),
        ),
        numerical_features,
    ),
    (
        make_pipeline(
            # Optional: map a placeholder such as 'n.d.' to NaN before imputing.
            # FunctionTransformer(lambda x: np.where(x == 'n.d.', np.nan, x)),
            SimpleImputer(strategy='constant', fill_value='missing'),
            OneHotEncoder(categories='auto', handle_unknown='ignore', drop='first'),
        ),
        categorical_features,
    ),
    # Always return a dense array. With the default threshold (0.3) the stacked
    # output silently becomes a sparse matrix whenever the encoded features are
    # sparse enough (e.g. a high-cardinality categorical column), and the
    # default StandardScaler that follows in the pipeline cannot centre sparse
    # input, so fitting would fail depending on the data.
    sparse_threshold=0,
)

In [None]:
# Standardises the preprocessed feature matrix before it reaches the estimator.
scaler = StandardScaler()

# Use the best model and the hyperparameters found by grid search here.
# The same template applies to both classification and regression tasks.
classifier = RandomForestClassifier(bootstrap=True,
                                    max_depth=80,
                                    max_features=2,
                                    min_samples_leaf=3,
                                    min_samples_split=8,
                                    n_estimators=100,
                                    # Fixed seed: bootstrap sampling and feature
                                    # sub-sampling are random, so without it the
                                    # scores and the saved model change from one
                                    # run to the next.
                                    random_state=42
                                    )

In [None]:
# Chain the three stages into a single estimator: column-wise preprocessing,
# standardisation, and the classifier. The step names are the keys used to
# address the steps later (e.g. via named_steps), so they are kept verbatim.
model_pipe = Pipeline(
    steps=[
        ('Preprocessing features', preprocessor),
        ('Scaling and standadize data', scaler),
        ('Classifier', classifier),
    ]
)
# Display the pipeline as the cell output.
model_pipe

In [None]:
# 5-fold cross-validation of the whole pipeline on the full dataset.
cross_val_scores = cross_val_score(model_pipe, X, y, cv=5)
print(cross_val_scores)

# Compute the mean and the standard deviation of the fold scores.
mean_score = cross_val_scores.mean()
std_score = cross_val_scores.std()

print(f"Mean cross-validation score:{mean_score:.2f}")
print(f"Standard deviation of cross-validation scores:{std_score:.2f}")

[0.97014925 0.98507463 0.98507463 1.         0.98484848]
Mean cross-validation score:0.99
Standard deviation of cross-validation scores:0.01


In [None]:
# Fit the final pipeline on the complete dataset (all rows of X and y).
model_pipe.fit(X, y=y)

In [None]:
# Predict on the full feature table X, i.e. the same rows passed to fit,
# so these are in-sample predictions.
y_pred_tot = model_pipe.predict(X=X)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of y_pred_tot against the true labels.
# The report is built once and printed; a bare call that is not the last
# expression of the cell is never displayed and only repeats the computation.
# NOTE(review): y_pred_tot is produced by predicting on the same X the pipeline
# was fitted on, so these are training-set metrics; the cross-validation scores
# are the better estimate of performance on unseen data.
classification_report_result = classification_report(y, y_pred_tot)
print(classification_report_result)



              precision    recall  f1-score   support

      Adelie       0.98      1.00      0.99       146
   Chinstrap       1.00      0.96      0.98        68
      Gentoo       1.00      1.00      1.00       119

    accuracy                           0.99       333
   macro avg       0.99      0.99      0.99       333
weighted avg       0.99      0.99      0.99       333



In [None]:
import joblib

# Persist the fitted pipeline (preprocessing + scaler + classifier) to disk.
# joblib.dump returns the list of written file names, shown as the cell output.
joblib.dump(model_pipe, filename='penguins_pipe.pkl')


['penguins_pipe.pkl']

In [None]:
# Reload the pipeline saved by joblib.dump, replacing the in-memory object.
# joblib files are pickle-based: only load files from a trusted source.
model_pipe = joblib.load(filename="penguins_pipe.pkl")