In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn import metrics
from sklearn.model_selection import cross_val_predict
import seaborn as sb
import random
import graphviz

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size for every plot in the notebook, given as (width, height).
plt.rcParams['figure.figsize'] = (15, 9)
# Apply matplotlib's built-in 'ggplot' style sheet.
plt.style.use('ggplot')

# Importar datos

In [None]:
# Load the train/test tables and the variable catalogue (all ISO-8859-1 encoded).
train_data = pd.read_csv('./data/train.csv', encoding="ISO-8859-1")
test_data = pd.read_csv('./data/test.csv', encoding="ISO-8859-1")
variables = pd.read_csv('./data/variables.txt', encoding="ISO-8859-1")

# Split the catalogue's variable names by their declared classification.
quant_vars = variables.loc[variables['Clasification'] == 'Cuantitativa', 'Variable'].tolist()
# NOTE(review): the first qualitative entry is dropped by the [1:] slice —
# the reason is not visible here; confirm against variables.txt.
quali_vars = variables.loc[variables['Clasification'] == 'Cualitativa', 'Variable'].tolist()[1:]


In [None]:
# Seed every global RNG the notebook can draw from so Restart & Run All is
# reproducible. Python's `random` alone is not enough: scikit-learn helpers
# that are not given an explicit random_state use NumPy's global RNG.
random.seed(123)
np.random.seed(123)

# Categorizar la variable SalePrice

In [None]:
def categorize(row):
    """Map a record's ``SalePrice`` to a price-band label.

    Args:
        row: Any mapping-like object indexable by ``'SalePrice'``
            (e.g. a DataFrame row or a dict).

    Returns:
        ``'Low'`` for prices in (0, 179280], ``'Medium'`` for prices strictly
        between 179280 and 326100, and ``'Expensive'`` for everything else.
        Note that "everything else" also covers non-positive and NaN prices,
        because they fail both range checks.
    """
    if 0 < row['SalePrice'] <= 179280:
        return 'Low'
    if 179280 < row['SalePrice'] < 326100:
        return 'Medium'
    return 'Expensive'
   
# Label every row with its price band (categorize is applied row-wise).
train_data['SalesCategories'] = train_data.apply(categorize, axis=1)

# Show how many rows fall in each band, largest band first.
print(
    train_data
    .groupby('SalesCategories')
    .size()
    .sort_values(ascending=False)
)


In [None]:
# Work on a NaN-free copy so train_data itself is left untouched
# (fillna returns a new frame, so no separate .copy() is needed).
copied_train_data = train_data.fillna(0)

# Response variable: the price band derived from SalePrice.
y = copied_train_data.pop("SalesCategories")

# Predictors: the quantitative variables, minus SalePrice. The label is a
# direct function of SalePrice, so keeping it as a feature would leak the
# answer into the model. This filter is a no-op if quant_vars does not list it.
feature_cols = [col for col in quant_vars if col != 'SalePrice']
X = copied_train_data[feature_cols]

# Entrenamiento y Prueba

In [None]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# split (and therefore every metric below) identical on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, train_size=0.7, random_state=123
)

In [None]:
# Depth-limited classification tree; fit() returns the fitted estimator itself.
arbol = DecisionTreeClassifier(max_depth=4, random_state=42).fit(X_train, y_train)

In [None]:
# Label nodes with the columns the tree was actually fitted on and with the
# real class labels. arbol.classes_ lists the labels in the order the tree
# indexes them, so the names line up with the node values.
tree.plot_tree(arbol, feature_names=list(X_train.columns),
               class_names=list(arbol.classes_), filled=True);

In [None]:
# Predict on the held-out rows and report accuracy plus
# support-weighted precision and recall.
y_pred = arbol.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))


#### Árbol de regresión

In [None]:
regressionTree = DecisionTreeRegressor(max_depth=4, random_state=42) 
regressionTree = arbol.fit(X_train, y_train) 

In [None]:
tree.plot_tree(regressionTree,feature_names=copied_train_data.columns,
               class_names=['0','1','2'],filled=True )

In [None]:
y_pred = regressionTree.predict(X_test)
print ("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print ("Precision:", metrics.precision_score(y_test,y_pred,average='weighted') )
print ("Recall: ", metrics.recall_score(y_test,y_pred,average='weighted'))


#### Bosque Aleatorio

In [None]:
randomForest = RandomForestClassifier(max_depth=4, random_state=42) 
randomForest = arbol.fit(X_train, y_train) 

In [None]:
tree.plot_tree(randomForest,feature_names=copied_train_data.columns,
               class_names=['0','1','2'],filled=True )

In [None]:
# Score the model bound to `randomForest` on the held-out rows:
# accuracy plus support-weighted precision and recall.
y_pred = randomForest.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))
