In [None]:
from pathlib import Path

import joblib
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
# Train, evaluate, persist and smoke-test a random-forest classifier that
# predicts the 'cut' of a diamond from the seaborn 'diamonds' dataset.

# --- Data -------------------------------------------------------------------
df = sns.load_dataset('diamonds')
# 'price' stays in X but is in neither column list below, so the column
# transformer (default remainder='drop') discards it before the model sees it.
X = df.drop('cut', axis=1)
y = df['cut']

# Encode the target as integer codes; the codes index into la_encoder.classes_.
la_encoder = LabelEncoder()
y_encoded = la_encoder.fit_transform(y)
label_ids = list(range(len(la_encoder.classes_)))

# --- Preprocessing ----------------------------------------------------------
categorical_cols = ['color', 'clarity']
numerical_cols = ['carat', 'depth', 'table', 'x', 'y', 'z']

# The imputers guard against missing values in training or inference data.
numerical_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    MinMaxScaler(),
)
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    # handle_unknown='ignore': a category never seen during fit is encoded as
    # all zeros instead of making predict() raise on new data.
    OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
)
column_transformer = make_column_transformer(
    (numerical_pipeline, numerical_cols),
    (categorical_pipeline, categorical_cols),
)

# --- Model ------------------------------------------------------------------
pipeline = make_pipeline(
    column_transformer,
    RandomForestClassifier(random_state=42),
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)
pipeline.fit(X_train, y_train)

# --- Evaluation -------------------------------------------------------------
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Reporte de clasificación:")
# Passing labels explicitly keeps target_names aligned with the integer codes
# even if some class happens to be absent from the test split.
print(
    classification_report(
        y_test,
        y_pred,
        labels=label_ids,
        target_names=la_encoder.classes_,
    )
)
print("Matriz de confusión:")
print(confusion_matrix(y_test, y_pred, labels=label_ids))

# --- Persistence ------------------------------------------------------------
model_path = Path('../models/pipeline_clasificacion.joblib')
# The relative target directory may not exist yet; create it so dump() does
# not fail with FileNotFoundError.
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, model_path)
# The pipeline predicts integer codes, so the fitted encoder is stored next to
# it; without it a consumer of the saved model cannot recover the cut names.
joblib.dump(la_encoder, model_path.with_name('label_encoder_clasificacion.joblib'))

# --- Single-sample prediction -----------------------------------------------
X_new = pd.DataFrame({
    'carat': [0.23],
    'color': ['E'],
    'clarity': ['SI2'],
    'depth': [61.5],
    'table': [55],
    'price': [326],
    'x': [3.95],
    'y': [3.98],
    'z': [2.43],
})
y_pred_new = pipeline.predict(X_new)
# Map the predicted integer code back to its original cut label.
y_pred_label = la_encoder.inverse_transform(y_pred_new)
print("Predicción de clasificación para el nuevo dato:", y_pred_label)

Accuracy: 0.7560
Reporte de clasificación:
              precision    recall  f1-score   support

        Fair       0.91      0.87      0.89       335
        Good       0.74      0.65      0.69      1004
       Ideal       0.82      0.91      0.86      4292
     Premium       0.73      0.80      0.76      2775
   Very Good       0.62      0.46      0.52      2382

    accuracy                           0.76     10788
   macro avg       0.76      0.74      0.75     10788
weighted avg       0.75      0.76      0.75     10788

Matriz de confusión:
[[ 292   29    3    7    4]
 [  24  655   27   56  242]
 [   2   11 3918  179  182]
 [   0   15  308 2207  245]
 [   4  175  540  579 1084]]
Predicción de clasificación para el nuevo dato: ['Ideal']
