# Grupo 1 - Smog prediction
## Modelo AdaBoost

### Análisis y limpieza de datos

In [None]:
#General imports
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from scipy.stats import sem
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import classification_report

#Specific imports
from sklearn import ensemble
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import classification_report, make_scorer, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

#Visualization
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(color_codes=True)

%matplotlib inline

df = pd.read_csv('data/train.csv')

df['Gears'] = df['Transmission'].str.extract('(\d+)')
df['Gears'] = pd.to_numeric(df['Gears'], errors='coerce')
df['Transmission'] = df['Transmission'].str.extract('(\D+)')

In [None]:
# Integer codes for the categorical columns, one table per column.
# Values that do not appear in a table are left untouched.
CATEGORY_CODES = {
    "Fuel Type": {"X": 0, "Z": 1, "D": 2, "E": 3, "N": 4},
    "Transmission": {"A": 0, "AM": 1, "AS": 2, "AV": 3, "M": 4},
    "Vehicle Class": {
        "Compact": 0,
        "Full-size": 1,
        "Mid-size": 2,
        "Minicompact": 3,
        "Minivan": 4,
        # Code 5 is deliberately skipped: it used to be assigned to a second,
        # unreachable "Minicompact" rule. Skipping it keeps every other code
        # identical to the previous encoding.
        "Pickup truck: Small": 6,
        "Pickup truck: Standard": 7,
        "SUV: Small": 8,
        "SUV: Standard": 9,
        "Special purpose vehicle": 10,
        "Station wagon: Mid-size": 11,
        "Station wagon: Small": 12,
        "Subcompact": 13,
        "Two-seater": 14,
    },
}

# Apply each table to its column in a single pass
for column, codes in CATEGORY_CODES.items():
    df[column] = df[column].replace(codes)

# Discard rows for which no gear count is available
df = df.dropna(subset=['Gears'])

In [None]:
# Columns that are not used as model inputs
unused_columns = [
    "Model Year",
    "Make",
    "Model",
    "Comb (mpg)",
    "Fuel Consumption City (L/100 km)",
    "Hwy (L/100 km)",
]
df.drop(columns=unused_columns, inplace=True)

# Preview the remaining columns
df.head()

Con esto, hemos terminado el análisis y la limpieza de los datos del dataframe de entrenamiento.

### Definición y entrenamiento del modelo

In [None]:
# Input columns for the model; the target is the 'Smog' column
features = [
    'Vehicle Class',
    'Engine Size (L)',
    'Cylinders',
    'Transmission',
    'Fuel Type',
    'Comb (L/100 km)',
    'CO2 Emissions (g/km)',
    'Gears',
]

x = df[features].to_numpy()
y = df['Smog'].to_numpy()

# Hold out 25% of the rows for testing (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=33
)

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split
scaler = preprocessing.StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Base learner: a random forest of 70 trees
rfc = RandomForestClassifier(random_state=100, n_estimators=70)

# AdaBoost ensemble of 90 boosting rounds built on top of the random forest
model = AdaBoostClassifier(
    estimator=rfc,
    n_estimators=90,
    learning_rate=0.01,
    random_state=33,
)

# Train on the standardized training split
model.fit(x_train, y_train)

### Comprobación de resultados

In [None]:
# Predict once per split; the test predictions are reused below instead of
# running the ensemble over x_test a second time
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# Names consumed by the classification report cell further down
predicted = y_test_pred
expected = y_test

# Accuracy on the training split
print("Accuracy in training", metrics.accuracy_score(y_train, y_train_pred))

# Accuracy on the held-out test split
print("Accuracy in testing ", metrics.accuracy_score(y_test, y_test_pred))

Obtenemos una exactitud de casi el 77%.

In [None]:
# Number of test samples per class (value_counts sorts by descending count)
s_y_test = Series(y_test)
class_counts = s_y_test.value_counts()

# Null accuracy: share of the test set taken by the most frequent class
class_counts.head(1) / len(y_test)

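Al superar la exactitud nula (la que se obtendría prediciendo siempre la clase mayoritaria), podemos deducir que el modelo mejora esa línea base. El posible sobreajuste se valora comparando la exactitud en entrenamiento con la exactitud en prueba.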

Vamos con el F1-score:

In [None]:
# Per-class precision, recall and F1-score on the test split
test_report = classification_report(expected, predicted)
print(test_report)

In [None]:
# Hyperparameter grid to explore
tuned_hyperparameters = [{'n_estimators':  [10, 30, 50, 70, 90],
                          'learning_rate': [0.01, 0.1, 1.0],
                          'estimator': [rfc],
                          'random_state': [33]}]

scores = ['precision', 'recall']

# Metric function behind each entry of `scores`
metric_functions = {'precision': precision_score, 'recall': recall_score}

# Run one full grid search per metric and report its results
for score in scores:
    print("# Ajuste de hiperparámetros para %s" % score, end="\n\n")

    # Weighted average over classes; classes with no predictions score 0
    scorer = make_scorer(metric_functions[score], average='weighted', zero_division=0)

    gs = GridSearchCV(AdaBoostClassifier(), tuned_hyperparameters, cv=10, scoring=scorer)
    gs.fit(x_train, y_train)

    print("Mejor conjunto de hiperparámetros encontrado en el conjunto de desarrollo:", end="\n\n")
    print(gs.best_params_, end="\n\n")
    print("Puntuaciones en la cuadrícula en el conjunto de desarrollo:", end="\n\n")

    # Mean and spread of the cross-validated score for every candidate
    means = gs.cv_results_['mean_test_score']
    stds = gs.cv_results_['std_test_score']
    candidates = gs.cv_results_['params']
    for cv_mean, cv_std, candidate in zip(means, stds, candidates):
        print("%0.3f (+/-%0.03f) para %r" % (cv_mean, cv_std * 2, candidate))
    print()

    print("Informe de clasificación detallado:", end="\n\n")
    print("El modelo se entrena en el conjunto de desarrollo completo.")
    print("Las puntuaciones se calculan en el conjunto de evaluación completo.", end="\n\n")

    # Evaluate the refitted best estimator on the held-out test split
    y_true = y_test
    y_pred = gs.predict(x_test)
    print(classification_report(y_true, y_pred), end="\n\n")

In [None]:
# Print the mean cross-validated score (+/- two standard deviations) for each
# hyperparameter combination tried by `gs`, i.e. the grid search from the last
# iteration of the loop above
cv_results = gs.cv_results_
for params, mean, std in zip(cv_results['params'],
                             cv_results['mean_test_score'],
                             cv_results['std_test_score']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [None]:
# Build a composite estimator: a standardization step followed by AdaBoost
# configured with the best hyperparameters found by the grid search.
# Pipeline, StandardScaler, KFold, cross_val_score and sem come from the
# notebook's import cell.
model = Pipeline([
        ('scaler', StandardScaler()),
        ('ab', AdaBoostClassifier(**gs.best_params_))
])

# Fit the model (x_train was already standardized in an earlier cell, so the
# pipeline's scaler is applied on top of that here)
model.fit(x_train, y_train)

# 10-fold cross-validation iterator with shuffling and a fixed seed
cv = KFold(10, shuffle=True, random_state=33)

# With no `scoring` argument the estimator's own `score` method is used,
# which for a classifier is accuracy. The unscaled `x` is passed on purpose:
# the pipeline standardizes inside each fold.
scores = cross_val_score(model, x, y, cv=cv)


def mean_score(scores):
    """Format the mean of `scores` and its standard error as a summary line.

    Args:
        scores: sequence of per-fold scores.

    Returns:
        A string with the mean and the standard error of the mean, both
        rounded to three decimals.
    """
    return "Puntuación media: {0:.3f} (+/- {1:.3f})".format(np.mean(scores), sem(scores))


print(mean_score(scores))
