# Grupo 1 - Smog prediction
## Modelo AdaBoost

### Análisis y limpieza de datos

In [677]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import ensemble
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(color_codes=True)

%matplotlib inline

# Load the training set; the path is relative to the notebook's working directory.
df = pd.read_csv('data/train.csv')

# Split the raw 'Transmission' string into two features:
#   * 'Gears'        -> first run of digits, converted to a number (NaN when absent)
#   * 'Transmission' -> first run of non-digit characters
# (presumably values look like "AS8" -> "AS" + 8; confirm against the data).
# Raw string literals keep '\d' / '\D' from being parsed as (invalid) Python
# escape sequences, and expand=False makes str.extract return a Series rather
# than a one-column DataFrame.
# 'Gears' must be derived first, because the next statement overwrites the
# column it is parsed from.
gear_digits = df['Transmission'].str.extract(r'(\d+)', expand=False)
df['Gears'] = pd.to_numeric(gear_digits, errors='coerce')
df['Transmission'] = df['Transmission'].str.extract(r'(\D+)', expand=False)

In [678]:
# Integer codes for each categorical column, kept as lookup tables so the
# encoding can be read (and audited) in one place.
FUEL_TYPE_CODES = {"X": 0, "Z": 1, "D": 2, "E": 3, "N": 4}

TRANSMISSION_CODES = {"A": 0, "AM": 1, "AS": 2, "AV": 3, "M": 4}

# NOTE: code 5 is intentionally absent. The previous version of this cell
# assigned it through a second "Minicompact" line that could never match
# (the label had already been replaced by 3), so no row ever received 5.
# The remaining codes are kept as they were so results stay comparable.
VEHICLE_CLASS_CODES = {
    "Compact": 0,
    "Full-size": 1,
    "Mid-size": 2,
    "Minicompact": 3,
    "Minivan": 4,
    "Pickup truck: Small": 6,
    "Pickup truck: Standard": 7,
    "SUV: Small": 8,
    "SUV: Standard": 9,
    "Special purpose vehicle": 10,
    "Station wagon: Mid-size": 11,
    "Station wagon: Small": 12,
    "Subcompact": 13,
    "Two-seater": 14,
}

CATEGORY_CODES = {
    "Fuel Type": FUEL_TYPE_CODES,
    "Transmission": TRANSMISSION_CODES,
    "Vehicle Class": VEHICLE_CLASS_CODES,
}

# Replace each known label with its code in place; values that are not in the
# table are left untouched, exactly as the one-line-per-label version did.
for column, codes in CATEGORY_CODES.items():
    for label, code in codes.items():
        df.loc[df[column] == label, column] = code

# Discard rows for which no gear count is available ('Gears' is NaN).
df = df.dropna(subset=['Gears'])

# Fail early, with a readable message, if any label was not covered by the
# tables above: a leftover string would otherwise only surface later as a
# string-to-float conversion error when the features are scaled.
for column in CATEGORY_CODES:
    unmapped = sorted({value for value in df[column].dropna() if isinstance(value, str)})
    if unmapped:
        raise ValueError(f"Unmapped labels in column {column!r}: {unmapped}")

In [679]:
# Columns removed before modelling (none of them appears in the feature list
# used further down in the notebook).
discarded_columns = [
    "Model Year",
    "Make",
    "Model",
    "Comb (mpg)",
    "Fuel Consumption City (L/100 km)",
    "Hwy (L/100 km)",
]
df = df.drop(columns=discarded_columns)

# Preview the cleaned frame.
df.head()

Unnamed: 0,id,Vehicle Class,Engine Size (L),Cylinders,Transmission,Fuel Type,Comb (L/100 km),CO2 Emissions (g/km),Smog,Gears
0,ab44e9bec15,12,2.0,4,1,1,8.7,202,2,7.0
1,45926762371,2,2.0,4,2,0,7.7,181,4,6.0
2,e9be56e153f,1,2.9,6,1,1,11.7,274,2,8.0
3,077092760df,0,2.0,4,2,0,8.1,189,1,6.0
4,c1c2579b795,3,5.2,12,0,1,13.8,324,1,8.0


Con esto, hemos terminado el análisis y la limpieza de los datos de ambos dataframes.

# Definición y entrenamiento del modelo

In [680]:
# Predictor columns fed to the classifier; 'Smog' is the target.
features = [
    'Vehicle Class',
    'Engine Size (L)',
    'Cylinders',
    'Transmission',
    'Fuel Type',
    'Comb (L/100 km)',
    'CO2 Emissions (g/km)',
    'Gears',
]

x = df[features].values
y = df['Smog'].values

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=33,
)

# Standardise the features. The scaler statistics are learned from the
# training split only and then applied unchanged to the test split.
scaler = preprocessing.StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [681]:
from sklearn.ensemble import AdaBoostClassifier

# Hyper-parameters gathered in one place so they are easy to spot and tune.
adaboost_params = {
    "n_estimators": 50,
    "learning_rate": 1,
    "random_state": 33,
}
model = AdaBoostClassifier(**adaboost_params)

# Train on the standardised training split; the fitted estimator is the cell output.
model.fit(x_train, y_train)

AdaBoostClassifier(learning_rate=1, random_state=33)

### Comprobación de resultados

In [682]:
# Predict once per split and reuse the results.
# `expected` / `predicted` are the test-set labels and predictions consumed by
# the classification report further down; `y_test_pred` is kept as an alias so
# any later cell that refers to it still works.
# (`metrics` comes from the import cell at the top of the notebook.)
y_train_pred = model.predict(x_train)
predicted = model.predict(x_test)
expected = y_test
y_test_pred = predicted

# Accuracy on the data the model was fitted on.
print("Accuracy in training", metrics.accuracy_score(y_train, y_train_pred))

# Accuracy on the held-out split.
print("Accuracy in testing ", metrics.accuracy_score(y_test, y_test_pred))

Accuracy in training 0.5153664302600472
Accuracy in testing  0.5319148936170213


Obtenemos una exactitud del 53%.

In [683]:
# Null accuracy: share of the test set taken by its most frequent class,
# i.e. the score of a model that always predicts that class.
s_y_test = Series(y_test)
class_counts = s_y_test.value_counts()  # sorted by frequency, most common first

class_counts.head(1) / len(y_test)

2    0.35461
dtype: float64

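Al superar la exactitud nula (35,5 %, la que se obtendría prediciendo siempre la clase más frecuente), podemos deducir que el modelo aporta información real. Además, como la exactitud en entrenamiento (51,5 %) y en test (53,2 %) son similares, no hay indicios de que el modelo se encuentre sobreajustado a nuestro conjunto de datos.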

Vamos con el F1-score:

In [684]:
# Per-class precision, recall and F1 on the held-out split.
report_text = classification_report(y_true=expected, y_pred=predicted)
print(report_text)

              precision    recall  f1-score   support

           0       0.62      0.83      0.71         6
           1       0.54      0.48      0.51        27
           2       0.49      0.40      0.44        50
           3       0.52      0.58      0.55        24
           4       0.56      0.68      0.61        34

    accuracy                           0.53       141
   macro avg       0.55      0.59      0.57       141
weighted avg       0.53      0.53      0.53       141

