In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the real-estate dataset from a CSV file located in the working directory.
df = pd.read_csv('Daegu_Real_Estate_data.csv')

In [None]:
# Show the loaded DataFrame as the cell's rich output.
df

# Parte 1: Análisis descriptivo de los datos

In [None]:
# Report the dataset dimensions, then preview the first ten rows.
n_rows, n_cols = df.shape
print("Los datos están conformados por ", n_rows, " filas y ", n_cols, " columnas")
df.head(n=10)

## Descripción de variables: 

*SalePrice*: Precio de venta del apartamento en dólares.

*YearBuilt*: Año de construcción del apartamento.

*YrSold*: Año de venta del apartamento.

*MonthSold*: Mes de venta del apartamento.

*Size(sqf)*: Tamaño del apartamento en pies cuadrados.

*Floor*: Número de piso donde se encuentra el apartamento.

*HallwayType*: Tipo de pasillo en el edificio donde se encuentra el apartamento.

*HeatingType*: Tipo de calefacción en el apartamento.

*AptManageType*: Tipo de administración del apartamento.

*N_Parkinglot(Ground)*: Número de estacionamientos en la planta baja del edificio.

*N_Parkinglot(Basement)*: Número de estacionamientos en el sótano del edificio.

*TimeToBusStop*: Tiempo (en minutos) que toma llegar a la parada de autobús más cercana.

*TimeToSubway*: Tiempo (en minutos) que toma llegar a la estación de metro más cercana.

*N_APT*: Número de apartamentos en el edificio.

*N_manager*: Número de administradores del edificio.

*N_elevators*: Número de elevadores en el edificio.

*N_FacilitiesNearBy(PublicOffice)*: Número de oficinas públicas cercanas al apartamento.

*N_FacilitiesNearBy(Hospital)*: Número de hospitales cercanos al apartamento.

*N_FacilitiesNearBy(Dpartmentstore)*: Número de tiendas departamentales cercanas al apartamento.

*N_FacilitiesNearBy(Mall)*: Número de centros comerciales cercanos al apartamento.

*N_FacilitiesNearBy(ETC)*: Número de otras instalaciones cercanas al apartamento.

*N_FacilitiesNearBy(Park)*: Número de parques cercanos al apartamento.

*N_SchoolNearBy(Elementary)*: Número de escuelas primarias cercanas al apartamento.

*N_SchoolNearBy(Middle)*: Número de escuelas secundarias cercanas al apartamento.

*N_SchoolNearBy(High)*: Número de escuelas preparatorias cercanas al apartamento.

*N_SchoolNearBy(University)*: Número de universidades cercanas al apartamento.

*N_FacilitiesInApt*: Número de instalaciones dentro del apartamento.

*N_FacilitiesNearBy(Total)*: Número total de instalaciones cercanas al apartamento.

*N_SchoolNearBy(Total)*: Número total de escuelas cercanas al apartamento.

## Recuento de valores nulos:

In [None]:
# Print column names, dtypes and non-null counts.
df.info()

In [None]:
# Count missing values per column. Author's note: the data has no nulls (the best case).
df.isnull().sum()

## Histograma de frecuencias

In [None]:
# Draw one histogram with a KDE overlay per column, laid out on a 6x5 grid.
sns.set()
fig = plt.figure(figsize=(35, 45))
cols = [
    "SalePrice", "YearBuilt", "YrSold", "MonthSold", "Size(sqf)", "Floor",
    "HallwayType", "HeatingType", "AptManageType",
    "N_Parkinglot(Ground)", "N_Parkinglot(Basement)",
    "TimeToBusStop", "TimeToSubway",
    "N_APT", "N_manager", "N_elevators",
    "SubwayStation",
    "N_FacilitiesNearBy(PublicOffice)", "N_FacilitiesNearBy(Hospital)",
    "N_FacilitiesNearBy(Dpartmentstore)", "N_FacilitiesNearBy(Mall)",
    "N_FacilitiesNearBy(ETC)", "N_FacilitiesNearBy(Park)",
    "N_SchoolNearBy(Elementary)", "N_SchoolNearBy(Middle)",
    "N_SchoolNearBy(High)", "N_SchoolNearBy(University)",
    "N_FacilitiesInApt", "N_FacilitiesNearBy(Total)", "N_SchoolNearBy(Total)",
]
# Subplot positions are 1-based, hence start=1.
for position, column in enumerate(cols, start=1):
    axis = plt.subplot(6, 5, position)
    sns.histplot(df[column], kde=True, ax=axis)
plt.show()

## Gráfico de variables vs precio de venta

In [None]:
# Bar plot of SalePrice against every other column, laid out on a 15x2 grid.
fig = plt.figure(figsize=(25, 100))
cols = [
    'YearBuilt', 'YrSold', 'MonthSold', 'Size(sqf)', 'Floor',
    'HallwayType', 'HeatingType', 'AptManageType',
    'N_Parkinglot(Ground)', 'N_Parkinglot(Basement)',
    'TimeToBusStop', 'TimeToSubway',
    'N_APT', 'N_manager', 'N_elevators',
    'SubwayStation',
    'N_FacilitiesNearBy(PublicOffice)', 'N_FacilitiesNearBy(Hospital)',
    'N_FacilitiesNearBy(Dpartmentstore)', 'N_FacilitiesNearBy(Mall)',
    'N_FacilitiesNearBy(ETC)', 'N_FacilitiesNearBy(Park)',
    'N_SchoolNearBy(Elementary)', 'N_SchoolNearBy(Middle)',
    'N_SchoolNearBy(High)', 'N_SchoolNearBy(University)',
    'N_FacilitiesInApt', 'N_FacilitiesNearBy(Total)', 'N_SchoolNearBy(Total)',
]
# Subplot positions are 1-based, hence start=1.
for position, column in enumerate(cols, start=1):
    axis = plt.subplot(15, 2, position)
    sns.barplot(data=df, x=column, y='SalePrice', ax=axis)
    axis.set_title(f'SalePrice vs {column}')
plt.show()

## Mapa de calor de correlaciones

In [None]:
# Correlation heatmap over the numeric columns only. At this point the frame
# still holds text columns (e.g. HallwayType); pandas >= 2.0 raises on them
# unless numeric_only=True is passed, while older versions dropped them silently.
fig = plt.figure(figsize = [20,15])
sns.heatmap(df.corr(numeric_only=True), annot = True, cmap = 'Blues', center = 0)
plt.show()

#### 1. Cambios para la columna  HallwayType

In [None]:
# Distinct values of HallwayType and how often each occurs.
df[['HallwayType']].value_counts()

In [None]:
# Encode HallwayType as a numeric id: 1 = terraced, 2 = mixed, 3 = corridor.
hallway_ids = {'terraced': 1, 'mixed': 2, 'corridor': 3}
for hallway_label, hallway_id in hallway_ids.items():
    df.loc[df['HallwayType'] == hallway_label, 'HallwayTypeId'] = hallway_id
# Show how many rows received each id.
df[['HallwayTypeId']].value_counts()

#### 2. Cambios para la columna  HeatingType

In [None]:
# Distinct values of HeatingType and how often each occurs.
df[['HeatingType']].value_counts()

In [None]:
# Encode HeatingType as a numeric id: 1 = individual_heating, 2 = central_heating.
heating_ids = {'individual_heating': 1, 'central_heating': 2}
for heating_label, heating_id in heating_ids.items():
    df.loc[df['HeatingType'] == heating_label, 'HeatingTypeId'] = heating_id
# Show how many rows received each id.
df[['HeatingTypeId']].value_counts()

#### 3. Cambios para la columna AptManageType

In [None]:
# Distinct values of AptManageType and how often each occurs.
df[['AptManageType']].value_counts()

In [None]:
# Encode AptManageType as a numeric id: 1 = management_in_trust, 2 = self_management.
manage_ids = {'management_in_trust': 1, 'self_management': 2}
for manage_label, manage_id in manage_ids.items():
    df.loc[df['AptManageType'] == manage_label, 'AptManageTypeId'] = manage_id
# Show how many rows received each id.
df[['AptManageTypeId']].value_counts()

#### 4. Cambios para la columna TimeToBusStop

In [None]:
# Distinct values of TimeToBusStop and how often each occurs.
df[['TimeToBusStop']].value_counts()

In [None]:
# Encode TimeToBusStop as a numeric id: 1 = 0~5min, 2 = 5min~10min, 3 = 10min~15min.
bus_stop_ids = {'0~5min': 1, '5min~10min': 2, '10min~15min': 3}
for bus_label, bus_id in bus_stop_ids.items():
    df.loc[df['TimeToBusStop'] == bus_label, 'TimeToBusStopId'] = bus_id
# Show how many rows received each id.
df[['TimeToBusStopId']].value_counts()

#### 5. Cambios para la columna TimeToSubway

In [None]:
# Distinct values of TimeToSubway and how often each occurs.
df[['TimeToSubway']].value_counts()

In [None]:
# Encode TimeToSubway as a numeric id:
# 1 = 0-5min, 2 = 5min~10min, 3 = 10min~15min, 4 = 15min~20min, 5 = no_bus_stop_nearby.
# The labels are matched exactly as spelled here, including 'no_bus_stop_nearby'.
subway_time_ids = {
    '0-5min': 1,
    '5min~10min': 2,
    '10min~15min': 3,
    '15min~20min': 4,
    'no_bus_stop_nearby': 5,
}
for subway_time_label, subway_time_id in subway_time_ids.items():
    df.loc[df['TimeToSubway'] == subway_time_label, 'TimeToSubwayId'] = subway_time_id
# Show how many rows received each id.
df[['TimeToSubwayId']].value_counts()

#### 6. Cambios para la columna SubwayStation

In [None]:
# Distinct values of SubwayStation and how often each occurs.
df[['SubwayStation']].value_counts()

In [None]:
# Encode SubwayStation as a numeric id; the number assigned to each station
# is the value stored in this mapping.
station_ids = {
    'Kyungbuk_uni_hospital': 1,
    'Myung-duk': 2,
    'Banwoldang': 3,
    'Bangoge': 4,
    'Sin-nam': 5,
    'no_subway_nearby': 6,
    'Chil-sung-market': 7,
    'Daegu': 8,
}
for station_label, station_id in station_ids.items():
    df.loc[df['SubwayStation'] == station_label, 'SubwayStationId'] = station_id

# Show how many rows received each id.
df[['SubwayStationId']].value_counts()

In [None]:
# Summary statistics for the numeric columns, including the new *Id encodings.
df.describe()

# Parte 2: Árbol de decisión regresor

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import plot_tree

## Creación de variables dummy

In [None]:
# One-hot encode the text columns still present in the frame (this rebinds df),
# then list the missing-value count of every resulting column.
df = pd.get_dummies(data=df)
df.isna().sum()

### Separación de datos a usar en el árbol

In [None]:
# Columns fed to the decision tree: the raw numeric columns plus the integer
# *Id encodings of the categorical ones.
feature_columns = [
    'YearBuilt', 'YrSold', 'MonthSold', 'Size(sqf)', 'Floor',
    'HallwayTypeId', 'HeatingTypeId', 'AptManageTypeId',
    'N_Parkinglot(Ground)', 'N_Parkinglot(Basement)',
    'TimeToBusStopId', 'TimeToSubwayId',
    'N_APT', 'N_manager', 'N_elevators',
    'SubwayStationId',
    'N_FacilitiesNearBy(PublicOffice)', 'N_FacilitiesNearBy(Hospital)',
    'N_FacilitiesNearBy(Dpartmentstore)', 'N_FacilitiesNearBy(Mall)',
    'N_FacilitiesNearBy(ETC)', 'N_FacilitiesNearBy(Park)',
    'N_SchoolNearBy(Elementary)', 'N_SchoolNearBy(Middle)',
    'N_SchoolNearBy(High)', 'N_SchoolNearBy(University)',
    'N_FacilitiesInApt', 'N_FacilitiesNearBy(Total)', 'N_SchoolNearBy(Total)',
]
X = df[feature_columns].to_numpy()

# Target kept as a single-column 2-D array; it is flattened where a 1-D target is needed.
y = df[['SalePrice']].to_numpy()


## División de datos de entrenamiento y prueba

In [None]:
# Hold out 15% of the rows for testing. The split is seeded explicitly: without
# random_state it draws from NumPy's global RNG, which is only seeded in later
# cells, so every fresh run would produce a different split and different metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 310589)

## Creación y entrenamiento de modelo de árbol de decisión

In [None]:
# Fit a decision tree with default hyperparameters on the training split.
# The estimator has no random_state of its own, so NumPy's global RNG is seeded first.
np.random.seed(310589)
regressor = DecisionTreeRegressor()
# Flatten the single-column target to 1-D before fitting.
regressor.fit(X_train, y_train.ravel())

In [None]:
# Predict sale prices for the held-out test rows and display the predictions.
y_pred = regressor.predict(X_test)
y_pred

In [None]:
# Test-set metrics for the baseline tree.
# RMSE is taken as the square root of the MSE: the `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn releases,
# and the linear-regression section of this notebook already uses np.sqrt.
tree_mse = mean_squared_error(y_test, y_pred)
print("Mean Square Error: ", tree_mse)
print("Root of Mean Square Error: ", np.sqrt(tree_mse))
# R^2 on the test split.
print("score: ", regressor.score(X_test, y_test.reshape(-1)))

## Hiperparámetros

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the decision tree: every combination of these values is tried.
grid_parametros = dict(
    max_depth=range(4, 20),
    min_samples_leaf=range(10, 200, 10),
    min_samples_split=range(10, 200, 10),
)

# Number of cross-validation folds used by the search.
n_folds = 5

In [None]:
# Exhaustive search over grid_parametros with n_folds-fold cross-validation.
# The estimator carries its own random_state: with n_jobs=-1 the fits run in
# worker processes, which the global NumPy seed below does not control, and a
# decision tree needs a fixed random_state for deterministic tie-breaking
# between equally good splits.
np.random.seed(310589)  # kept so the global RNG state seen by later cells is unchanged
regressor = DecisionTreeRegressor(random_state=310589)
grid = GridSearchCV(regressor, grid_parametros, cv = n_folds, n_jobs = -1, return_train_score=True)

In [None]:
# Run the search on the training split: every parameter combination is fitted once per fold.
# The global NumPy RNG is re-seeded immediately before fitting.
np.random.seed(310589)
grid.fit(X_train, y_train)

In [None]:
# Collect the cross-validation results in a DataFrame and show its shape.
cv_result = pd.DataFrame(grid.cv_results_)
cv_result.shape

In [None]:
# Preview the first rows of the cross-validation results table.
cv_result.head()

In [None]:
# Parameter combination selected by the grid search.
grid.best_params_

In [None]:
# Keep a handle on the estimator built with the selected parameters and display it.
best_grid = grid.best_estimator_
best_grid

In [None]:
# Fit the selected estimator on the full training split.
# NOTE(review): GridSearchCV is created without a refit argument, and its default
# (refit=True) already refits best_estimator_ on X_train — this second fit looks
# redundant; confirm before removing.
best_grid.fit(X_train, y_train)

## Testeo de los hiperparámetros

In [None]:
# Predict sale prices for the test rows with the tuned tree.
y_predicts_hyperparam = best_grid.predict(X_test)

In [None]:
# Test-set metrics for the tuned tree.
# RMSE is taken as the square root of the MSE: the `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn releases.
tuned_mse = mean_squared_error(y_test, y_predicts_hyperparam)
print("Mean Square Error: ", tuned_mse)
print("Root of Mean Square Error: ", np.sqrt(tuned_mse))
# "score" is R^2 on the test split, the same quantity the baseline tree reports.
# It used to print grid.best_score_, a mean over cross-validation folds of the
# training data, which is not comparable with the other models' test scores.
print("score: ", best_grid.score(X_test, y_test.reshape(-1)))
# The cross-validation figure is still shown, under its own label.
print("cross-validation score: ", grid.best_score_)

# Parte 3: Modelo de regresión lineal

## Estandarizar variables

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [None]:
# Regression inputs: every column except the target and the integer *Id encodings.
# Assumes df has already been one-hot encoded by the pd.get_dummies cell of Part 2,
# so the categorical information lives in the dummy columns.
Xregresion = df.drop(['SalePrice', 'HallwayTypeId', 'HeatingTypeId', 'AptManageTypeId','TimeToBusStopId','TimeToSubwayId','SubwayStationId'], axis=1)
Yregresion = df['SalePrice']

# Standardise only the int64/float64 columns.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics influence the scaling — consider fitting on training rows only.
numVars = Xregresion.select_dtypes(include=['float64', 'int64']).columns.tolist()
scaler = StandardScaler()
scaler.fit(Xregresion[numVars])
scaled_numeric = scaler.transform(Xregresion[numVars])

# XRegresionScaled used to hold only the scaled numeric columns, so the
# categorical columns never reached the linear model. Append the remaining
# columns unscaled; get_dummies encodes any that are still text.
categorical_block = pd.get_dummies(Xregresion.drop(columns=numVars)).to_numpy(dtype='float64')
XRegresionScaled = np.hstack([scaled_numeric, categorical_block])

## Variables Dummy

In [None]:
# One-hot encode any remaining text columns of the regression frame and show its last ten rows.
# NOTE(review): the train/test split below is built from XRegresionScaled, not from
# this re-bound Xregresion, so this cell only affects what is displayed.
Xregresion = pd.get_dummies(Xregresion)
Xregresion.tail(10)

In [None]:
# List the column names of the regression frame.
Xregresion.columns.tolist()

## Conjunto de entrenamiento y prueba

In [None]:
# Seed NumPy's global RNG immediately before splitting, then hold out 15% of the rows.
# NOTE: this rebinds X_train / X_test, replacing the arrays used by the tree section.
np.random.seed(310589)
X_train, X_test, Y_train, Y_test = train_test_split(XRegresionScaled, Yregresion, test_size = 0.15)

## Entrenamiento modelo de regresión lineal

In [None]:
# Fit a linear regression with default settings on the training split.
lr_model = LinearRegression()
lr_model.fit(X_train, Y_train)

In [None]:
# Predict sale prices for the test rows and display the predictions.
Y_pred = lr_model.predict(X_test)
Y_pred

In [None]:
# Test-set metrics for the linear regression.
# scikit-learn metrics take (y_true, y_pred) in that order. MSE is symmetric, but
# R^2 is not: it normalises by the variance of its first argument, so the swapped
# call r2_score(Y_pred, Y_test) reported a different number from the model's R^2.
linear_mse = mean_squared_error(Y_test, Y_pred)
print("Mean Square Error: ", linear_mse)
print("Root of Mean Square Error: ", np.sqrt(linear_mse))
print("score: ", r2_score(Y_test, Y_pred))

# Parte 4: Resultados Modelos

## Resultados árbol de regresión

### Antes de hiperparámetros
Mean Square Error:  404265115.1832344

Root of Mean Square Error:  20106.34514732189

score:  0.965272618969355

### Después de hiperparámetros

Mean Square Error:  365855503.05578005

Root of Mean Square Error:  19127.349608761273

score:  0.9663048134101091

## Resultados regresión lineal

Mean Square Error:  1719106175.174076

Root of Mean Square Error:  41462.105291145985

score:  0.8223594619684382

#### Teniendo en cuenta los resultados obtenidos, podemos concluir que el modelo que mejor se ajusta a los datos es el árbol de regresión con hiperparámetros ajustados, con un score de 0.966 y un Root of Mean Square Error de 19.127.
#### La regresión lineal obtiene un score de 0.822 y un Root of Mean Square Error que es aproximadamente el doble del de los dos modelos de árbol.
