<h2>Comparativa entre ÁRBOLES DE DECISIÓN y RANDOM FOREST</h2>

In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error
random_seed = 50

<h3>Árboles de decisión</h3>

In [18]:
df = pd.read_csv('temps.csv')
df.head()

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend
0,2016,1,1,Fri,45,45,45.6,45,43,50,44,29
1,2016,1,2,Sat,44,45,45.7,44,41,50,44,61
2,2016,1,3,Sun,45,44,45.8,41,43,46,47,56
3,2016,1,4,Mon,44,41,45.9,40,44,48,46,53
4,2016,1,5,Tues,41,40,46.0,44,46,46,46,41


<h4>Estadísticas del dataset</h4>

In [19]:
df.describe()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend
count,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0
mean,2016.0,6.477011,15.514368,62.652299,62.701149,59.760632,62.543103,57.238506,62.373563,59.772989,60.034483
std,0.0,3.49838,8.772982,12.165398,12.120542,10.527306,11.794146,10.605746,10.549381,10.705256,15.626179
min,2016.0,1.0,1.0,35.0,35.0,45.1,35.0,41.0,46.0,44.0,28.0
25%,2016.0,3.0,8.0,54.0,54.0,49.975,54.0,48.0,53.0,50.0,47.75
50%,2016.0,6.0,15.0,62.5,62.5,58.2,62.5,56.0,61.0,58.0,60.0
75%,2016.0,10.0,23.0,71.0,71.0,69.025,71.0,66.0,72.0,69.0,71.0
max,2016.0,12.0,31.0,117.0,117.0,77.4,92.0,77.0,82.0,79.0,95.0


<h4>Modificación de las Variables Discretas a Continuas</h4>

In [20]:
df = pd.get_dummies(df)

In [21]:
df.head()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
0,2016,1,1,45,45,45.6,45,43,50,44,29,1,0,0,0,0,0,0
1,2016,1,2,44,45,45.7,44,41,50,44,61,0,0,1,0,0,0,0
2,2016,1,3,45,44,45.8,41,43,46,47,56,0,0,0,1,0,0,0
3,2016,1,4,44,41,45.9,40,44,48,46,53,0,1,0,0,0,0,0
4,2016,1,5,41,40,46.0,44,46,46,46,41,0,0,0,0,0,1,0


<h4>División en atributos y clase</h4>

La clase a predecir en este caso es la que tiene la etiqueta <b>actual</b>, que es la temperatura actual.

In [22]:
# Clase --> Actual
# Atributos --> Resto

# Guardamos las clases en un array
clases = np.array(df['actual'])

# Eliminamos la columna clase del dataset
df = df.drop('actual', axis = 1)

# Lista de atributos
lista_atributos = list(df.columns)

df = np.array(df)

<h4>Division Train/Test</h4>

In [23]:
X_train, X_test, y_train, y_test = train_test_split(df, clases, test_size = 0.25, random_state = random_seed)

<h4>RandomForest </h4>

En este apartado se van a realizar ejecuciones sobre los datos anteriores, intentando ejemplificar cuales son mejores.

In [24]:
rf = RandomForestClassifier(n_estimators = 100)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [25]:
pred = rf.predict(X_test)

In [26]:
errores = abs(pred - y_test)
acierto = 100 - np.mean(100 * (errores / y_test))
print("Porcentaje acierto: ", acierto)

Porcentaje acierto:  92.3335972022384


<h4>Visualización del árbol generado</h4>

In [27]:
from sklearn.tree import export_graphviz
import pydot

tree = rf.estimators_[5]
export_graphviz(tree, out_file = 'tree.dot', feature_names = lista_atributos, rounded = True, precision = 1)

(graph, ) = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree.png')

<h4>Limitando el tamaño del clasificador</h4>

In [31]:
rf_limitado = RandomForestClassifier(n_estimators = 10, max_depth = 2)
rf_limitado.fit(X_train, y_train)

tree_limitado = rf_limitado.estimators_[5]
export_graphviz(tree_limitado, out_file = 'tree_limitado.dot', feature_names = lista_atributos, rounded = True, precision = 1)

(graph, ) = pydot.graph_from_dot_file('tree_limitado.dot')
graph.write_png('tree_limitado.png')


<h4>Importancia de los atributos</h4>

In [49]:
lista_atributos_importantes = list(rf_limitado.feature_importances_)
importancia_atributo = [(atributo, round(importancia, 4)) for atributo, importancia in zip(lista_atributos, lista_atributos_importantes)]

importancia_atributo = sorted(importancia_atributo, key = lambda x: x[1], reverse = True)

#[print('Variable: {:15} Importance: {}'.format(*pair)) ]

for pair in importancia_atributo:
    print("Variable: ",pair[0])
    print("  Importancia: ", pair[1])

Variable:  average
  Importancia:  0.2933
Variable:  temp_1
  Importancia:  0.1725
Variable:  forecast_noaa
  Importancia:  0.1313
Variable:  forecast_acc
  Importancia:  0.122
Variable:  temp_2
  Importancia:  0.1105
Variable:  forecast_under
  Importancia:  0.0894
Variable:  month
  Importancia:  0.0446
Variable:  week_Fri
  Importancia:  0.0363
Variable:  year
  Importancia:  0.0
Variable:  day
  Importancia:  0.0
Variable:  friend
  Importancia:  0.0
Variable:  week_Mon
  Importancia:  0.0
Variable:  week_Sat
  Importancia:  0.0
Variable:  week_Sun
  Importancia:  0.0
Variable:  week_Thurs
  Importancia:  0.0
Variable:  week_Tues
  Importancia:  0.0
Variable:  week_Wed
  Importancia:  0.0
