### **Dureza de aleacion de metal**
```
Autor       : Alex Harvey Pfoccori Quispe
Lugar       : Cusco, Peru, 2023
Proposito   : Predicción de calidad de vino según sus propiedades
```


# **0. PASOS PREVIOS**

## **Librerias**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Algoritmos de clasificacion: K-NN, AD, SVM, MLP, NB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

# Modulo para la separacion de datos para entrenamiento y test
from sklearn.model_selection import train_test_split

# Ensambles
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Modulo que implementa busqueda aleatoria en cuadricula
from sklearn.model_selection import RandomizedSearchCV

# Validacion cruzada
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

%matplotlib inline

## **Importacion de datos**

In [2]:
datos = pd.read_csv('./winequality.csv')
datos.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,12.2,0.45,0.49,1.4,0.075,3.0,6.0,0.9969,3.13,0.63,10.4,5
1,12.2,0.45,0.49,1.4,0.075,3.0,6.0,0.9969,3.13,0.63,10.4,5
2,8.6,0.315,0.4,2.2,0.079,3.0,6.0,0.99512,3.27,0.67,11.9,6
3,9.8,0.34,0.39,1.4,0.066,3.0,7.0,0.9947,3.19,0.55,11.4,7
4,8.0,0.6,0.08,2.6,0.056,3.0,7.0,0.99286,3.22,0.37,13.0,5


In [3]:
## Informacion del dataset
datos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1593 entries, 0 to 1592
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1593 non-null   float64
 1   volatile acidity      1593 non-null   float64
 2   citric acid           1593 non-null   float64
 3   residual sugar        1593 non-null   float64
 4   chlorides             1593 non-null   float64
 5   free sulfur dioxide   1593 non-null   float64
 6   total sulfur dioxide  1593 non-null   float64
 7   density               1593 non-null   float64
 8   pH                    1593 non-null   float64
 9   sulphates             1593 non-null   float64
 10  alcohol               1593 non-null   float64
 11  quality               1593 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 149.5 KB


## **Separacion de datos para Training y Testing**

In [4]:
# Separacion de datos en entradas
x = datos.drop(['quality'], axis=1)
y = pd.DataFrame(datos['quality'])

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=123, train_size=0.75, shuffle=True)

## **Distribucion de datos**

In [5]:
datos.groupby('quality').size()

quality
4     63
5    681
6    633
7    216
dtype: int64

# **1. MODELO VOTING: AD + K-NN + SVC + MLP**

### Busqueda aleatoria de hiper-parametro en cuadricula, para **ARBOL DE DECISION**

In [6]:
# Creacion de cuadricula de busqueda
grid_aleatorio_ad = {'criterion': ['gini', 'entropy'], 'max_depth': [4, 8, 12, 20, 50, 120, 150]}
# Definicion del modelo 
arbol_decision = DecisionTreeClassifier()
# Busqueda aleatoria en cuadricula, con validacion cruzada
model_1 = RandomizedSearchCV(estimator=arbol_decision, param_distributions=grid_aleatorio_ad, n_iter=14, cv=5, verbose=2, random_state=42, n_jobs=-1)

model_1.fit(x_train, y_train)
print('Score de train: ', model_1.score(x_test, y_test))
print('Mejor modelo :', model_1.best_params_)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


Score de train:  0.6190476190476191
Mejor modelo : {'max_depth': 50, 'criterion': 'entropy'}


### Busqueda aleatorio de hiper-parametro en cuadricula, para **K-VECINOS MAS CERCANOS**

In [7]:
grid_aleatorio_knn = {'n_neighbors': [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]}
knn = KNeighborsClassifier()
model_2 = RandomizedSearchCV(estimator=knn, param_distributions=grid_aleatorio_knn, n_iter=14, cv=5, verbose=2, random_state=42, n_jobs=-1)
model_2.fit(x_train, y_train)
print('Score de train: ', model_2.score(x_test, y_test))
print('Mejor modelo: ', model_2.best_params_)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


Score de train:  0.506265664160401
Mejor modelo:  {'n_neighbors': 20}


  return self._fit(X, y)


### Busqueda aleatoria de hiper-parametro en cuadricula, para **PERCEPTRON MULTICAPA**

In [8]:
grid_aleatorio_mlp = {'hidden_layer_sizes': [(50, 20, 10, 5), (80, 40, 15, 5), (100, 50, 20, 8), (20, 10, 5)], 'activation': ['logistic', 'tanh'], 'alpha': [0.01, 0.05, 0.1, 0.25]}
mlp = MLPClassifier(max_iter=100)
model_3 = RandomizedSearchCV(estimator=mlp, param_distributions=grid_aleatorio_mlp, n_iter=14, cv=5, verbose=2, random_state=42, n_jobs=-1)
model_3.fit(x_train, y_train)
print('Score de train: ', model_3.score(x_test, y_test))
print('Mejor modelo: ', model_3.best_params_)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


  y = column_or_1d(y, warn=True)


Score de train:  0.5864661654135338
Mejor modelo:  {'hidden_layer_sizes': (80, 40, 15, 5), 'alpha': 0.25, 'activation': 'tanh'}




In [9]:
print(model_1.best_params_)
print(model_2.best_params_)
print(model_3.best_params_)

{'max_depth': 50, 'criterion': 'entropy'}
{'n_neighbors': 20}
{'hidden_layer_sizes': (80, 40, 15, 5), 'alpha': 0.25, 'activation': 'tanh'}


## **Voting = AD + K-NN + SVC + MLP**

In [11]:
model_11 = DecisionTreeClassifier(criterion='entropy', max_depth=50)
model_22 = KNeighborsClassifier(n_neighbors=20)
model_33 = MLPClassifier(activation='tanh', alpha=0.25, hidden_layer_sizes=(80, 40, 15, 5), max_iter=100)

In [14]:
# Definir el ensamble Voting de clasificacion
EnsambleVoting = VotingClassifier(estimators=[('dt', model_11), ('knn', model_22), ('MLP', model_33)], voting='hard', weights=[1,3,10], n_jobs=-1)

# Entrenar ensamble
EnsambleVoting.fit(x_train, y_train)

print('Score de train: ', EnsambleVoting.score(x_test, y_test))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Score de train:  0.5789473684210527


### Validacion cruzada

In [15]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
# Evaluar un modelo dado usando cross-validation

def evaluate_model(model, x, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, x, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

import statistics
acu_vc_10_test_voting = statistics.mean(evaluate_model(EnsambleVoting, x_test, y_test))

print('acu cv 10 test: ', acu_vc_10_test_voting)

acu cv 10 test:  0.5263675213675214


# **MODELO BAGGING: RF, AD, K-NN, SVC, MLP**

In [16]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier

### Bagging con **BOSQUE ALEATORIO**

In [17]:
EnsambleBaggingRF = RandomForestClassifier(n_estimators=300, criterion='gini', max_depth=80)
EnsambleBaggingRF.fit(x_train, y_train)

print('Score de testing: ', EnsambleBaggingRF.score(x_test, y_test))

  EnsambleBaggingRF.fit(x_train, y_train)


Score de testing:  0.6516290726817042


### Baggin con **MÁQUINA DE VECTOR DE SOPORTE**

In [18]:
BaggingSVC = BaggingClassifier(base_estimator=SVC(), n_estimators=200, random_state=0 )
BaggingSVC.fit(x_train, y_train)
print('Score de testing: ', BaggingSVC.score(x_test, y_test))

  y = column_or_1d(y, warn=True)


Score de testing:  0.45363408521303256


### Bagging con **ÁRBOL DE DECISIÓN**

In [19]:
BaggingDTC = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=30, random_state=0)
BaggingDTC.fit(x_train, y_train)
print('Score de testing: ', BaggingDTC.score(x_test, y_test))

  y = column_or_1d(y, warn=True)


Score de testing:  0.6666666666666666


### Bagging con **N-VECINOS MAS CERCANOS**

In [20]:
BaggingKNN = BaggingClassifier(base_estimator=KNeighborsClassifier(), n_estimators=30, random_state=0)
BaggingKNN.fit(x_train, y_train)
print('Score de testing: ', BaggingKNN.score(x_test, y_test))

  y = column_or_1d(y, warn=True)


Score de testing:  0.5037593984962406


### Bagging con **PERCEPTRÓN MULTICAPA - MLP**

In [22]:
BaggingMLP = BaggingClassifier(base_estimator=MLPClassifier(hidden_layer_sizes=(80, 40, 15, 5), alpha=0.25, activation='tanh'), n_estimators=20, random_state=0)
BaggingMLP.fit(x_train, y_train)
print('Score de testing: ', BaggingMLP.score(x_test, y_test))

  y = column_or_1d(y, warn=True)


Score de testing:  0.5889724310776943




# **3. BOOSTING**

In [23]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import ExtraTreesClassifier

### Boosting Gradiente

In [24]:
EnsambleBoosting = GradientBoostingClassifier(n_estimators=30)
EnsambleBoosting.fit(x_train, y_train)

print('Score de testing: ', EnsambleBoosting.score(x_test, y_test))

  y = column_or_1d(y, warn=True)


Score de testing:  0.6090225563909775


### AdaBoost

In [25]:
EnsambleABC = AdaBoostClassifier()
EnsambleABC.fit(x_train, y_train)

print('Score de testing: ', EnsambleABC.score(x_test, y_test))

  y = column_or_1d(y, warn=True)


Score de testing:  0.49874686716791977


# **4. STACKING: (KNN+AD+SCV+MLP) -> RF**

### Nivel 1: de clasificadores KNN, AD, SVC, MLP

In [26]:
estimadores = [('KNN', KNeighborsClassifier(n_neighbors=20, metric='euclidean')),
               ('AD', DecisionTreeClassifier(max_depth=50, criterion='entropy')),
               ('SVC', SVC(gamma=5, C=300, kernel='rbf')),
               ('MLP', MLPClassifier(hidden_layer_sizes=(80, 40, 15, 5), alpha=0.25, activation='tanh'))]

### Nivel2: predictor RF

In [27]:
estimador_final = RandomForestClassifier(n_estimators=300)

### Modelo Stacking

In [28]:
EnsambleStacking = StackingClassifier(
    estimators=estimadores,
    final_estimator=estimador_final
)
EnsambleStacking.fit(x_train, y_train)

print('Score de testing: ', EnsambleStacking.score(x_test, y_test))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Score de testing:  0.6541353383458647


In [29]:
import statistics

acu_vc_10_test_stacking = statistics.mean(evaluate_model(EnsambleStacking, x_test, y_test))
print('acu cv 10 test: ', acu_vc_10_test_stacking)

acu cv 10 test:  0.5523290598290598
