In [382]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [383]:
df = pd.read_csv('Alquileres_to_ML.csv', index_col=0)

scaler = StandardScaler()

# Ajustar y transformar los datos de la columna
column_standardized = scaler.fit_transform(df['Metros cuadrados'].values.reshape(-1, 1))

# Convertir el resultado en un DataFrame
df['Metros cuadrados'] = column_standardized.astype(float)


In [384]:
#Solo quiero las columnas binarias
lista_col_bin = []
for i in df.columns:
    if (str(df[str(i)].dtype) == 'int64') or (str(df[str(i)].dtype) == 'float64'):
        lista_col_bin.append(i)

df_new = df[lista_col_bin].drop('Precio Antes', axis=1)


In [385]:
df_new

Unnamed: 0,Precio,Habitaciones,Baños,Metros cuadrados,Aire acondicionado,Amueblado,Armarios empotrados,Ascensor,Balcón,Calefacción,Terraza,Planta,Interior / Exterior,Sistema de calefacción
0,1485,1,1,-0.628410,0,1,0,1,0,1,1,7,0,5
1,1425,1,1,-0.628410,1,1,0,1,0,1,0,3,1,5
2,2190,2,2,-0.146401,1,1,0,1,0,1,0,8,1,5
3,3200,3,2,0.335608,1,1,1,1,1,1,1,1,1,2
4,3500,3,2,0.740495,1,1,1,1,1,1,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4455,1840,2,1,-0.435606,1,1,1,1,0,1,0,0,1,2
4456,450,4,2,-0.242803,0,1,0,1,0,0,0,0,1,0
4457,900,2,1,-0.879055,0,0,0,0,0,0,0,0,0,0
4458,800,2,1,-0.956176,0,0,0,0,0,0,0,0,0,0


In [386]:
#Splitting data

X = df_new.drop('Precio', axis=1)
y = df_new[['Precio']]
X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   test_size=0.25,
                                                   random_state=40)

----

## Regresión Lineal

In [387]:
model = LinearRegression()
model.fit(X_train, y_train)
print(model.intercept_, model.coef_)

[1382.29169989] [[-206.71831588  413.11797908  905.77230877  277.08382555  570.05650976
   153.1213659   135.29161686   41.26104737  103.72988732  236.3058833
    33.59646726 -136.98936598  -83.93222088]]


In [388]:
print("MSE:", mean_squared_error(y_test, model.predict(X_test)))
print("RMSE:", np.sqrt(mean_squared_error(y_test, model.predict(X_test))))

MSE: 957577.9930070435
RMSE: 978.5591412924634


Mucho error, de todos modos vamos a ver algunas predicciones

In [389]:
df_predict = pd.DataFrame()
df_predict['Precio Real'] = y_test['Precio'].tolist()
df_predict['Precio Predicho'] = model.predict(X_test).tolist()
df_predict['Precio Predicho'] = [int(float(i[0])) for i in df_predict['Precio Predicho']]

In [390]:
df_predict

Unnamed: 0,Precio Real,Precio Predicho
0,1700,1481
1,3300,6145
2,1200,1785
3,1295,1681
4,2100,2007
...,...,...
1110,1750,1851
1111,1850,1706
1112,1480,2102
1113,1600,867


In [391]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test, model.predict(X_test))
print('R2:', r2)

R2: 0.5464261081855759


---

In [393]:
import pandas as pd
from catboost import CatBoostRegressor

In [394]:
# Crear el objeto del modelo CatBoostRegressor
model = CatBoostRegressor(iterations=1000, learning_rate=0.031, depth=5,)

# Entrenar el modelo
model.fit(X_train, y_train, verbose=100)

# Realizar predicciones en el conjunto de prueba
y_pred = model.predict(X_test)


0:	learn: 1524.2228067	total: 1.9ms	remaining: 1.9s
100:	learn: 1041.4375089	total: 127ms	remaining: 1.13s


200:	learn: 977.7837272	total: 253ms	remaining: 1s
300:	learn: 944.5861781	total: 374ms	remaining: 869ms
400:	learn: 919.7183125	total: 497ms	remaining: 743ms
500:	learn: 887.5723656	total: 643ms	remaining: 641ms
600:	learn: 862.4856746	total: 773ms	remaining: 513ms
700:	learn: 840.0736631	total: 901ms	remaining: 384ms
800:	learn: 819.6840792	total: 1.02s	remaining: 255ms
900:	learn: 804.4284335	total: 1.13s	remaining: 125ms
999:	learn: 788.5412076	total: 1.24s	remaining: 0us


In [395]:
# Calcular el error cuadrático medio (MSE)
mse = mean_squared_error(y_test, model.predict(X_test))
print('MSE:', mse)
print("RMSE:", np.sqrt(mean_squared_error(y_test, model.predict(X_test))))


MSE: 810628.2618343846
RMSE: 900.3489666981268


In [396]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test, model.predict(X_test))
print('R2:', r2)

R2: 0.6160314687471318


In [397]:
df_imp = pd.DataFrame()
df_imp['Columnas'] = X_train.columns
df_imp['importances'] = model.feature_importances_
df_imp

Unnamed: 0,Columnas,importances
0,Habitaciones,6.251108
1,Baños,11.488882
2,Metros cuadrados,39.656752
3,Aire acondicionado,3.676403
4,Amueblado,7.550967
5,Armarios empotrados,2.624534
6,Ascensor,2.531615
7,Balcón,2.021545
8,Calefacción,0.988241
9,Terraza,3.205167


----


## XGBoost Regressor

In [398]:
import xgboost
import re

In [399]:
df_pruebas = pd.DataFrame()
lista_para_df = []

In [400]:
model = xgboost.XGBRegressor(n_estimators=150, learning_rate=0.029, max_depth=4)
str(model).split('(')[1].split(',')

lista_pruebas = []
for i in str(model).split('(')[1].split(','):
    pattern = re.compile(r'.+=\d.*')
    matches = pattern.findall(i)

    for match in matches:
        if match != ['']:
            lista_pruebas.append(match)
        else:
            continue

In [401]:
lista_para_df.append(' '.join(lista_pruebas))

In [402]:
lista_para_df

[' learning_rate=0.029  max_depth=4              n_estimators=150']

In [403]:
pd.DataFrame(lista_para_df)

Unnamed: 0,0
0,learning_rate=0.029 max_depth=4 ...


In [404]:
df_pruebas

In [405]:
model = xgboost.XGBRegressor(objective='reg:squarederror', n_estimators=150, learning_rate=0.028, max_depth=4)

# Entrenar el modelo
model.fit(X_train, y_train)

# Realizar predicciones en el conjunto de prueba
y_pred = model.predict(X_test)

# Calcular el error cuadrático medio (MSE)
mse = mean_squared_error(y_test, y_pred)
print('MSE:', mse)

MSE: 866864.0432185398


In [406]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test, model.predict(X_test))
print('R2:', r2)

R2: 0.5893943881041884


In [407]:
df_predict = pd.DataFrame()
df_predict['Precio Real'] = y_test['Precio'].tolist()
df_predict['Precio Predicho'] = model.predict(X_test).tolist()
df_predict['Precio Predicho'] = [int(float(i)) for i in df_predict['Precio Predicho']]

In [408]:
df_predict

Unnamed: 0,Precio Real,Precio Predicho
0,1700,1799
1,3300,5405
2,1200,1593
3,1295,1597
4,2100,1898
...,...,...
1110,1750,1873
1111,1850,2068
1112,1480,2004
1113,1600,1069


In [409]:
df_imp = pd.DataFrame()
df_imp['Columnas'] = X_train.columns
df_imp['importances'] = model.feature_importances_
df_imp

Unnamed: 0,Columnas,importances
0,Habitaciones,0.03169
1,Baños,0.176226
2,Metros cuadrados,0.270188
3,Aire acondicionado,0.083946
4,Amueblado,0.07046
5,Armarios empotrados,0.061368
6,Ascensor,0.053373
7,Balcón,0.027926
8,Calefacción,0.054515
9,Terraza,0.056078


---


## Random Forest Regressor

In [410]:
from sklearn.ensemble import RandomForestRegressor

In [411]:
model = RandomForestRegressor(n_estimators=190, random_state=42, max_depth=8)

# Entrenar el modelo
model.fit(X_train, y_train)

# Realizar predicciones en el conjunto de prueba
y_pred = model.predict(X_test)

# Calcular el error cuadrático medio (MSE)
mse = mean_squared_error(y_test, y_pred)
print('MSE:', mse)

r2 = r2_score(y_test, model.predict(X_test))
print('R2:', r2)

  model.fit(X_train, y_train)


MSE: 872463.2496382421
R2: 0.5867422241390565


In [412]:
df_imp = pd.DataFrame()
df_imp['Columnas'] = X_train.columns
df_imp['importances'] = model.feature_importances_


In [413]:
df_imp

Unnamed: 0,Columnas,importances
0,Habitaciones,0.044679
1,Baños,0.095164
2,Metros cuadrados,0.588274
3,Aire acondicionado,0.01896
4,Amueblado,0.046759
5,Armarios empotrados,0.013306
6,Ascensor,0.008106
7,Balcón,0.013967
8,Calefacción,0.008712
9,Terraza,0.021125
