In [31]:
## CARGA DE LAS LIBRERIAS PRINCIPALES
import matplotlib.pyplot as plt #Libreria para graficar
import pandas as pd # Liberia para el manejo de datasets
import numpy as np # Libreria para el manejo de numeros

import sklearn.metrics as metrics 
from sklearn.preprocessing import MinMaxScaler 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.utils import resample, shuffle
from scipy import arange

## Carga de la librería del Clasificador
from sklearn.tree import DecisionTreeClassifier #Libreria para el Clasificador
from sklearn.ensemble import AdaBoostClassifier #Libreria para el AdaBoost

**Descripción del Conjunto de Datos**
Los dos conjuntos de datos están relacionados con variantes blancas del vino portugués "Vinho Verde". Para más detalles, consulte: [Enlace web] o la referencia [Cortez et al., 2009]. Debido a cuestiones de privacidad y logística, solo están disponibles las variables fisicoquímicas (insumos) y sensoriales (la salida) (por ejemplo, no hay datos sobre los tipos de uva, la marca del vino, el precio de venta del vino, etc.).

Estos conjuntos de datos se pueden ver como tareas de clasificación o regresión. Las clases están ordenadas y no equilibradas (por ejemplo, hay más vinos más normales que excelentes o malos). 

**Definición del Problema**  
Se define el problema como una Clasificación Multiclase, en donde se tiene un conjunto de 11 variables de entrada cuantitativas de diferentes mediciones fisicoquimas de diferentes clases de Vino Blanco, y una variable de salida cualitativa que clasifica el vino en 10 categorías diferentes. 

El problema en implementar un método de clasificación que dado el conjunto de variables cuantitativas, entregue cual es la calificación del vino. 

**Variable de entradan basadas en pruebas fisicoquimicas:**   
1 - Fixed acidity
2 - Volatile acidity
3 - Citric acid
4 - Residual sugar  
5 - Chlorides  
6 - Free sulfur dioxide  
7 - Total sulfur dioxide   
8 - Density  
9 - pH  
10 - Sulphates  
11 - Alcohol  

** Variables de salida basadas en información sensorial: **   
12 - Quality (score between 0 and 10)  

Fuente:  
Paulo Cortez (Univ. Minho), Antonio Cerdeira, Fernando Almeida, Telmo Matos and Jose Reis (CVRVV) @ 2009

In [32]:
# Lectura de los datos
data = pd.read_csv('winequality-red.csv', sep=';')
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [33]:
## DEFINICION DE LAS FUNCIONES PRINCIPALES

# Funcion para balancear el numero de clases de cada categoria
def balanceData(data, target_class):
    Y = data[target_class].astype('category')
    
    balanced_data = pd.DataFrame([])
    classes_quantity = table(Y)
    samples = classes_quantity[max(classes_quantity, key=classes_quantity.get)]
    
    for attribute in set(Y):
        current_attribute = data.query(target_class + ' == ' + str(attribute))
        if current_attribute.shape[0] != samples:
            upsample_class = resample(current_attribute, replace = True, n_samples = samples, random_state = 123) 
        else:
            upsample_class = current_attribute
        balanced_data = balanced_data.append(upsample_class)
    return shuffle(balanced_data, random_state = 0)

#Funcion para generar un diccionario con las clases y su cantidad en el dataset
def table(x):
    if not isinstance(x, list):
        x = x.values.tolist()
    return {a:x.count(a) for a in set(x)}

#Funcion para general los analisis de rendimiento de un clasificador
def classifierMetrix(classifier, X_Test, Y_Test):
    test_predictions = classifier.predict(X_Test)
    test_score = cross_val_score(classifier, X_Test, Y_Test, cv = n_cross_train)

    test_accuracy_score = metrics.accuracy_score(Y_Test, test_predictions)
    print('Test Accuracy Score:', test_accuracy_score)
    print()
    
    test_confusion_matrix = metrics.confusion_matrix(Y_Test, test_predictions)
    print("Matriz de Confusion")
    print(test_confusion_matrix)
    print()

    test_classification_report = metrics.classification_report(Y_Test, test_predictions)
    print("Reporte de Clasificacion")
    print(test_classification_report)
    print()
        
    return test_accuracy_score, test_classification_report
    
#Función para mostrar los resultados de la Optimización de Parámetros
def parametersOptimizationResults(classifier, features):
    print("Mejor conjunto de parametros:", classifier.best_params_)
    print()
    
    params = classifier.cv_results_['params']
    
    means_train = classifier.cv_results_['mean_train_score']
    stds_train  = classifier.cv_results_['std_train_score']
    print('Train Scores')
    printScores(means_train, stds_train, params)
    
    means_test = classifier.cv_results_['mean_test_score']
    stds_test  = classifier.cv_results_['std_test_score']
    print('Test Scores')
    printScores(means_test, stds_test, params)
    
    if hasattr(classifier, 'feature_importances_'):
        importances = classifier.feature_importances_
        features_list = dict(list(zip(features, importances)))
        print(features_list)
    
    return [means_train, means_test]

#Funcion para imprimir los resultados 
def printScores(means, stds, clf_params):
    for mean, std, params in zip(means, stds, clf_params):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

#Función para graficar las importancia de las variables (Tipo Arboles de Decisión)
def plotFeaturesImportance(classifier, X):
    importances = classifier.feature_importances_
    indices = np.argsort(importances)[::-1]
    
    print("Ranking de Variables de Entrada:")
    for f in range(X.shape[1]):
        print("%d. Variable %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
        
    if hasattr(classifier, 'estimators_'):
        std = np.std([tree.feature_importances_ for tree in classifier.estimators_],axis=0)
        plt.figure()
        plt.title("Importancia de Variables de Entrada")
        plt.bar(range(X.shape[1]), importances[indices],color="r", yerr=std[indices], align="center")
        plt.xticks(range(X.shape[1]), indices)
        plt.xlim([-1, X.shape[1]])
        plt.grid()
        plt.show()

In [34]:
# Distribucion de los datos en el conjunto original
table(data.quality)

{3: 10, 4: 53, 5: 681, 6: 638, 7: 199, 8: 18}

In [35]:
# Balanceo de las diferentes clases
X_balanced = balanceData(data, 'quality')
X_balanced.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4086.0,4086.0,4086.0,4086.0,4086.0,4086.0,4086.0,4086.0,4086.0,4086.0,4086.0,4086.0
mean,8.327264,0.575724,0.270649,2.581118,0.089932,14.009545,38.057513,0.996503,3.329075,0.666148,10.712004,5.5
std,1.819496,0.257086,0.218687,1.410504,0.055529,10.253335,30.035723,0.002067,0.169892,0.180457,1.240909,1.708034
min,4.6,0.12,0.0,1.2,0.012,1.0,6.0,0.9902,2.74,0.33,8.4,3.0
25%,7.1,0.38,0.05,1.8,0.068,6.0,16.0,0.99516,3.23,0.55,9.8,4.0
50%,7.9,0.53,0.27,2.2,0.078,11.0,29.0,0.9966,3.32,0.63,10.5,5.5
75%,9.5,0.69,0.45,2.6,0.09,19.0,49.0,0.9978,3.43,0.7575,11.5,7.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.0032,4.01,2.0,14.9,8.0


In [36]:
# Particion entre los Atributos (X) y los Targets (Y) 
Y = X_balanced.quality.astype('category')
X = X_balanced.drop('quality', 1)

In [37]:
# Distribucion de los datos en el conjunto balanceado
table(Y)

{3: 681, 4: 681, 5: 681, 6: 681, 7: 681, 8: 681}

In [38]:
# Normalizacion de los datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [39]:
# Generecion de los valores para conjunto de entrenamiento y prueba
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X_scaled, Y, test_size=0.2, random_state=0, shuffle=True, stratify=Y)

# Reinicio de los indices de Pandas
Y_Test = Y_Test.reset_index(drop=True)
Y_Train = Y_Train.reset_index(drop=True)

In [47]:
n_cross_train = int(np.ceil(0.01*X_Train.shape[0]))

In [69]:
#Estimación de Parámetros Robusta Usando GridSearchCV y Cross Validation
parameters = {'base_estimator':[DecisionTreeClassifier()], 'learning_rate':[0.1,3.0], 'n_estimators':[100,500]}
adaboost_clf = GridSearchCV(AdaBoostClassifier(), parameters, cv = n_cross_train, return_train_score = True)
adaboost_clf = adaboost_clf.fit(X_Train,Y_Train)

In [70]:
#Resultados de la Estimación de Parámetros Robusta
ab_results = parametersOptimizationResults(adaboost_clf, data.columns)

Mejor conjunto de parametros: {'base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), 'learning_rate': 3.0, 'n_estimators': 100}

Train Scores
1.000 (+/-0.000) for {'base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), 'learning_rate': 0.1, 'n_estimators': 100}
1.000 (+/-0.000) for {'base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth

In [71]:
#Reporte del Rendimiento del Clasificador
adaboost_accuracy, adaboost_report = classifierMetrix(adaboost_clf, X_Test, Y_Test)

Test Accuracy Score: 0.9388753056234719

Matriz de Confusion
[[137   0   0   0   0   0]
 [  0 136   0   0   0   0]
 [  2   2 112  14   6   0]
 [  1   2  14 111   5   3]
 [  0   0   1   0 136   0]
 [  0   0   0   0   0 136]]

Reporte de Clasificacion
             precision    recall  f1-score   support

          3       0.98      1.00      0.99       137
          4       0.97      1.00      0.99       136
          5       0.88      0.82      0.85       136
          6       0.89      0.82      0.85       136
          7       0.93      0.99      0.96       137
          8       0.98      1.00      0.99       136

avg / total       0.94      0.94      0.94       818


