# Grupo 1 - Smog predicition
## Modelo SVM

### Análisis y limpieza de datos

In [3]:
#Imports generales
import numpy as np
import pandas as pd
from pandas import Series
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

#Imports específicos
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.metrics import classification_report, recall_score, precision_score, make_scorer
from sklearn.preprocessing import StandardScaler
from scipy.stats import sem

#Visualización
import seaborn as sns
sns.set(color_codes=True)

%matplotlib inline

df = pd.read_csv('data/train.csv')

df['Gears'] = df['Transmission'].str.extract('(\d+)')
df['Gears'] = pd.to_numeric(df['Gears'], errors='coerce')
df['Transmission'] = df['Transmission'].str.extract('(\D+)')

In [4]:
#Fuel Type
df.loc[df["Fuel Type"] == "X", "Fuel Type"] = 0
df.loc[df["Fuel Type"] == "Z", "Fuel Type"] = 1
df.loc[df["Fuel Type"] == "D", "Fuel Type"] = 2
df.loc[df["Fuel Type"] == "E", "Fuel Type"] = 3
df.loc[df["Fuel Type"] == "N", "Fuel Type"] = 4

#Transmission
df.loc[df["Transmission"] == "A", "Transmission"] = 0
df.loc[df["Transmission"] == "AM", "Transmission"] = 1
df.loc[df["Transmission"] == "AS", "Transmission"] = 2
df.loc[df["Transmission"] == "AV", "Transmission"] = 3
df.loc[df["Transmission"] == "M", "Transmission"] = 4


#Vehicle Class
df.loc[df["Vehicle Class"] == "Compact", "Vehicle Class"] = 0
df.loc[df["Vehicle Class"] == "Full-size", "Vehicle Class"] = 1
df.loc[df["Vehicle Class"] == "Mid-size", "Vehicle Class"] = 2
df.loc[df["Vehicle Class"] == "Minicompact", "Vehicle Class"] = 3
df.loc[df["Vehicle Class"] == "Minivan", "Vehicle Class"] = 4
df.loc[df["Vehicle Class"] == "Minicompact", "Vehicle Class"] = 5
df.loc[df["Vehicle Class"] == "Pickup truck: Small", "Vehicle Class"] = 6
df.loc[df["Vehicle Class"] == "Pickup truck: Standard", "Vehicle Class"] = 7
df.loc[df["Vehicle Class"] == "SUV: Small", "Vehicle Class"] = 8
df.loc[df["Vehicle Class"] == "SUV: Standard", "Vehicle Class"] = 9
df.loc[df["Vehicle Class"] == "Special purpose vehicle", "Vehicle Class"] = 10
df.loc[df["Vehicle Class"] == "Station wagon: Mid-size", "Vehicle Class"] = 11
df.loc[df["Vehicle Class"] == "Station wagon: Small", "Vehicle Class"] = 12
df.loc[df["Vehicle Class"] == "Subcompact", "Vehicle Class"] = 13
df.loc[df["Vehicle Class"] == "Two-seater", "Vehicle Class"] = 14

#Gears
df['Gears'] = df['Gears'].fillna(df['Gears'].mean())

In [5]:
df.drop("Model Year", axis=1, inplace=True)
df.drop("Make", axis=1, inplace=True)
df.drop("Model", axis=1, inplace=True)
df.drop("Comb (mpg)", axis=1, inplace=True)
df.drop("Fuel Consumption City (L/100 km)", axis=1, inplace=True)
df.drop("Hwy (L/100 km)", axis=1, inplace=True)

df.head()

Unnamed: 0,id,Vehicle Class,Engine Size (L),Cylinders,Transmission,Fuel Type,Comb (L/100 km),CO2 Emissions (g/km),Smog,Gears
0,ab44e9bec15,12,2.0,4,1,1,8.7,202,2,7.0
1,45926762371,2,2.0,4,2,0,7.7,181,4,6.0
2,e9be56e153f,1,2.9,6,1,1,11.7,274,2,8.0
3,077092760df,0,2.0,4,2,0,8.1,189,1,6.0
4,c1c2579b795,3,5.2,12,0,1,13.8,324,1,8.0


Con esto, hemos terminado el análisis y la limpieza de los datos de ambos dataframes.

### Definición y entrenamiento del modelo

En este *notebook* realizaremos el entrenamiento de un modelo de SVM, el cual entrenaremos empleando los campos 'Vehicle Class', 'Engine Size (L)', 'Cylinders', 'Transmission', 'Fuel Type', 'CO2 Emissions (g/km)' y 'Gears'.

En primer lugar separaremos el conjunto de datos en subconjuntos de entrenamiento y prueba para realizar una primera aproximación.

In [6]:
features = ['Vehicle Class', 'Engine Size (L)', 'Cylinders', 'Transmission', 'Fuel Type', 'Comb (L/100 km)', 'CO2 Emissions (g/km)', 'Gears']

x = df[features].values
y = df['Smog'].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)

# Normalización
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

Y una vez seleccionados los datos, entrenamos el modelo.

In [7]:
model = SVC(kernel='linear', probability=True, gamma=3.0)

model.fit(x_train, y_train)

SVC(gamma=3.0, kernel='linear', probability=True)

Inicialmente se usan estos hiperparámetros para el entrenamiento. Posteriormente los ajustaremos para obtener una mayor eficiencia.

Tras definir el modelo se va a pasar a la fase de entrenamiento.

### Comprobación de resultados

Para la comprobación de resultados se van a calcular varias métricas, obteniendo la eficacia de nuestro modelo.

In [8]:
# Evaluar la Exactitud en el entrenamiento
predicted = model.predict(x_test)
expected = y_test

y_train_pred = model.predict(x_train)
print("Accuracy in training", metrics.accuracy_score(y_train, y_train_pred))

# También vamos a evaluar el error en las pruebas
y_test_pred = model.predict(x_test)
print("Accuracy in testing ", metrics.accuracy_score(y_test, y_test_pred))

Accuracy in training 0.6173120728929385
Accuracy in testing  0.5374149659863946


Nuestra exactitud es bastante mala... Obteniendo un 54% de exactitud.

In [9]:
s_y_test = Series(y_test)
s_y_test.value_counts()

s_y_test.value_counts().head(1) / len(y_test)

2    0.408163
dtype: float64

Al cumplir la exactitud nula podemos deducir que el modelo no se encuentra desbalanceado en nuestro conjunto de datos.

Vamos con el F1 Score:

In [10]:
print(classification_report(expected, predicted))

              precision    recall  f1-score   support

           0       0.33      0.14      0.20         7
           1       0.53      0.68      0.59        28
           2       0.61      0.47      0.53        60
           3       0.52      0.54      0.53        28
           4       0.48      0.67      0.56        24

    accuracy                           0.54       147
   macro avg       0.49      0.50      0.48       147
weighted avg       0.54      0.54      0.53       147



El resto de métricas calculadas también son muy negativas...

Ahora, vamos a probar a entrenarlo y evaluarlo con K-Fold para ver si puede mejorar los resultados obtenidos.

In [11]:
cv = KFold(n_splits=10, shuffle=True, random_state=33)

scores = cross_val_score(model, x, y, cv=cv)
print("Scores in every iteration", scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Scores in every iteration [0.6440678  0.50847458 0.47457627 0.61016949 0.50847458 0.44067797
 0.55172414 0.60344828 0.70689655 0.53448276]
Accuracy: 0.56 (+/- 0.16)


Ha mejorado ligeramente pero sigue siendo un resultado muy negativo...

### Ajuste del algoritmo

Se va a utilizar Grid Search para realizar una optimización de hiperparámetros.

In [12]:
# Conjunto de hiperparámetros a probar
tuned_hyperparameters = [{'C': [0.1, 1, 10, 100],
                          'kernel': ['linear', 'rbf', 'poly'],
                          'gamma': ['scale', 'auto'],
                          'class_weight': ['balanced', None]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyperparameters for %s" % score)
    print()

    if score == 'precision':
        scorer = make_scorer(precision_score, average='weighted', zero_division=0)
    elif score == 'recall':
        scorer = make_scorer(recall_score, average='weighted', zero_division=0)
    
    gs = GridSearchCV(SVC(), tuned_hyperparameters, cv=10, scoring=scorer)
    gs.fit(x_train, y_train)

    print("Best hyperparameters set found on development set:")
    print()
    print(gs.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = gs.cv_results_['mean_test_score']
    stds = gs.cv_results_['std_test_score']

    for mean_score, std_score, params in zip(means, stds, gs.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean_score, std_score * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, gs.predict(x_test)
    print(classification_report(y_true, y_pred))
    print()


# Tuning hyperparameters for precision

Best hyperparameters set found on development set:

{'C': 100, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'rbf'}

Grid scores on development set:

0.584 (+/-0.219) for {'C': 0.1, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'linear'}
0.580 (+/-0.173) for {'C': 0.1, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'rbf'}
0.648 (+/-0.167) for {'C': 0.1, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'poly'}
0.584 (+/-0.219) for {'C': 0.1, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'linear'}
0.580 (+/-0.171) for {'C': 0.1, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'rbf'}
0.639 (+/-0.170) for {'C': 0.1, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'poly'}
0.559 (+/-0.151) for {'C': 0.1, 'class_weight': None, 'gamma': 'scale', 'kernel': 'linear'}
0.584 (+/-0.125) for {'C': 0.1, 'class_weight': None, 'gamma': 'scale', 'kernel': 'rbf'}
0.618 (+/-0.150) for {'C': 0.1, 'class_weigh

### Comprobación con el algoritmo ajustado

A partir de los resultados anteriores, volvemos a entrenar el modelo mediante validación con K-Fold para comprobar la nueva media de puntuación.

In [13]:
model = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(**gs.best_params_))
])

model.fit(x_train, y_train)

cv = KFold(10, shuffle=True, random_state=33)

scores = cross_val_score(model, x, y, cv=cv)
def mean_score(scores):
    return ("Mean score: {0:.3f} (+/- {1:.3f})").format(np.mean(scores), sem(scores))
print(mean_score(scores))

Mean score: 0.727 (+/- 0.019)


Con el algoritmo ajustado, obtenemos una media del 73%, lo cual mejora notablemente el modelo.