In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Importamos utilidades y modelos de sklearn
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('mode.chained_assignment', None) # Deshabilita SettingWithCopyWarning. Ojo.

df = pd.read_csv('/kaggle/input/tp2grupo3/train.csv', index_col='id', parse_dates=['fecha'])
# Agrego dos columnas calculadas a partir de la fecha (timestamp y 201602,...)
df['ts'] = df.fecha.astype(int)
df['anio_mes'] = df['fecha'].map(lambda x: 100 * x.year + x.month)


In [None]:
# Métrica de evaluación
def RMSLE(actual, pred):
    return (np.mean((np.log(actual + 1) - np.log(pred + 1)) ** 2)) **.5

In [None]:
#------------------------------DUMMY REGRESSOR------------------------------
#-------------PREPROCESAMIENTO-------------
drop_cols = ['fecha', 'ciudad', 'idzona', 'tipodepropiedad', 'provincia', 'titulo', 'descripcion', 'direccion']
X = df.drop(['precio'] + drop_cols, axis=1)
y = df['precio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

#-------------MODELO-------------
dummy = DummyRegressor(strategy='mean').fit(X_train, y_train)
pred = dummy.predict(X_test)
print(f"Promedio de precios del train set: {y_train.mean()}")
print(f"Primeras 3 predicciones: {pred[:3]}")

dummy_rmsle = RMSLE(y_test, pred)
dummy_rmsle_train = RMSLE(y_train, dummy.predict(X_train))
print(f"RMSLE DummyRegressor (train): {dummy_rmsle_train:.5f}")
print(f"RMSLE DummyRegressor: {dummy_rmsle:.5f}")

In [None]:
#------------------------------REGRESION LINEAL SOBRE METROS CUBIERTOS------------------------------
#-------------PREPROCESAMIENTO-------------
X = df[['metroscubiertos']]
y = df['precio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
print(f"Train shapes: X={X_train.shape} y={y_train.shape}")
print(f"Test  shapes: X={X_test.shape}  y={y_test.shape}")

imp = Imputer()
X_train['metroscubiertos'] = imp.fit_transform(X_train[['metroscubiertos']])
X_test['metroscubiertos'] = imp.transform(X_test[['metroscubiertos']])

#-------------MODELO-------------
linear_model = LinearRegression().fit(X_train, y_train)
pred = linear_model.predict(X_test)

# El modelo de regresion lineal con una variable obtiene RMSLE=0.65
linear_rmsle_train = RMSLE(y_train, linear_model.predict(X_train))
linear_rmsle = RMSLE(y_test, pred)
print(f"RMSLE LinearRegression (train): {linear_rmsle_train:.5f}")
print(f"RMSLE LinearRegression: {linear_rmsle:.5f}")

In [None]:
#------------------------------ARBOL DE DECISION------------------------------
#-------------PREPROCESAMIENTO-------------
drop_cols = ['titulo', 'descripcion', 'direccion', 'lat', 'lng', 'fecha', 'ts', 'idzona']
df2 = df.drop(drop_cols, axis=1).copy()
print(f"Columnas ({len(df2.columns)}): {df2.columns.tolist()}")

display(df2.isnull().sum())
numeric_columns_with_nulls = list(set(df2.columns[df2.isnull().sum() > 0].tolist()) \
                                  - set(['tipodepropiedad', 'ciudad', 'provincia']))

#One hot encoding
df2 = pd.get_dummies(df2, dummy_na=True)

# Para los nulls numéricos, usar un Imputer con strategy mean (reemplazamos los NaN por el promedio)
# Para no leakear, spliteamos el dataset antes
X2 = df2.drop("precio", axis=1)
y2 = df2['precio']
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.25, random_state=1)

for c in numeric_columns_with_nulls:
    imp = Imputer()
    X2_train[c] = imp.fit_transform(X2_train[[c]])
    X2_test[c] = imp.transform(X2_test[[c]])
    
#-------------MODELO-------------
tree = DecisionTreeRegressor().fit(X2_train, y2_train)
tree_pred = tree.predict(X2_test)

tree_rmsle = RMSLE(y2_test, tree_pred)
tree_rmsle_train = RMSLE(y2_train, tree.predict(X2_train))
print(f"RMSLE DecisionTreeRegressor (train): {tree_rmsle_train:.5f}")
print(f"RMSLE DecisionTreeRegressor: {tree_rmsle:.5f}")

print(f"RMSLE DummyRegressor       : {dummy_rmsle:.5f}")
print(f"RMSLE LinearRegressor      : {linear_rmsle:.5f}")
print(f"RMSLE DecisionTreeRegressor: {tree_rmsle:.5f}")


In [50]:
#Sumbit
df_train = pd.read_csv("/kaggle/input/tp2grupo3/train.csv", index_col=0)
df_submit = pd.read_csv("/kaggle/input/tp2grupo3/test.csv", index_col=0)

# Imputamos los NaNs
imp = Imputer()
df_train['metroscubiertos'] = imp.fit_transform(df_train[['metroscubiertos']])
df_submit['metroscubiertos'] = imp.transform(df_submit[['metroscubiertos']])

linear_pred = LinearRegression()\
                    .fit(df_train[['metroscubiertos']], df_train['precio'])\
                    .predict(df_submit[['metroscubiertos']])

res = pd.DataFrame(linear_pred, index=df_submit.index, columns=['precio'])
display(res.head())
#res = res.reset_index(drop=True)
#res.to_csv("/kaggle/working/submission_linear_grupo3.csv", header=True) # RMSLE=0.65487



Unnamed: 0_level_0,precio
id,Unnamed: 1_level_1
4941,4200756.25
51775,1112322.29
115253,1377423.92
299321,1364168.84
173570,1284638.35


In [49]:

res = res.reset_index(drop=False)
res.columns = ['id', 'target']
res.set_index('id', inplace = True)
display(res.head())
res.to_csv("/kaggle/working/submission_linear_grupo3.csv", header=True) # RMSLE=0.65487

ValueError: Length mismatch: Expected axis has 3 elements, new values have 2 elements