In [471]:
import pandas as pd

df = pd.read_csv('./housing.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [472]:
datos = df.dropna()

In [473]:
# Aquí intenté eliminar las columnas 'longitude' y 'latitude' pero no funcionó, 
# entonces deduje que el modelo lo que necesita son más datos para poder predecir

# datos.drop(['longitude', 'latitude'], axis=1, inplace=True)
# datos.drop(['longitude'], axis=1, inplace=True)

# Aquí intenté eliminar las filas que contienen 'INLAND' en la columna 'ocean_proximity',
# pero bajaba bastante la precisión del modelo

# datos = datos[datos['ocean_proximity'] != 'INLAND']

In [474]:
# datos.hist(bins=50, figsize=(20,15), edgecolor='black')

In [475]:
datos['median_income'].value_counts()

median_income
3.1250     49
15.0001    48
2.8750     46
4.1250     44
2.6250     44
           ..
1.8413      1
2.0800      1
1.3375      1
1.5085      1
1.5694      1
Name: count, Length: 12825, dtype: int64

In [476]:
dummies = pd.get_dummies(datos['ocean_proximity'], dtype=int)
datos = pd.concat([datos, dummies], axis=1)

datos.drop('ocean_proximity', axis=1, inplace=True)
datos

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,0,1,0,0,0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,0,1,0,0,0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,0,1,0,0,0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,0,1,0,0,0


In [477]:
datos.corr()['median_house_value'].sort_values(ascending=False)

median_house_value    1.000000
median_income         0.688355
<1H OCEAN             0.257614
NEAR BAY              0.160526
NEAR OCEAN            0.140378
total_rooms           0.133294
housing_median_age    0.106432
households            0.064894
total_bedrooms        0.049686
ISLAND                0.023525
population           -0.025300
longitude            -0.045398
latitude             -0.144638
INLAND               -0.484787
Name: median_house_value, dtype: float64

In [478]:
# Aquí quiero evaluar el impacto de las relaciones entre las columnas, en el modelo
datos['rooms_per_household'] = datos['total_rooms'] / datos['households']
datos['bedrooms_per_household'] = datos['total_bedrooms'] / datos['households']
datos['population_per_household'] = datos['population'] / datos['households']
datos['bedrooms_per_room'] = datos['total_bedrooms'] / datos['total_rooms']
datos['room_ratio'] = datos['total_rooms'] / datos['total_bedrooms']
datos['income_per_person'] = datos['median_income'] / datos['population']
datos['age_per_person'] = datos['housing_median_age'] / datos['population']
datos['income_per_household'] = datos['median_income'] / datos['households']
datos['income_per_room'] = datos['median_income'] / datos['total_rooms']
datos['income_per_bedroom'] = datos['median_income'] / datos['total_bedrooms']
datos['age_per_household'] = datos['housing_median_age'] / datos['households']
datos['age_per_room'] = datos['housing_median_age'] / datos['total_rooms']
datos['age_per_bedroom'] = datos['housing_median_age'] / datos['total_bedrooms']
datos['population_per_room'] = datos['population'] / datos['total_rooms']
datos['population_per_bedroom'] = datos['population'] / datos['total_bedrooms']
datos['population_per_income'] = datos['population'] / datos['median_income']


# De la información que se dio en el ejercicio, agregué estas tres columnas
datos['casa_vieja'] = (datos['housing_median_age'] > 50).astype(int)
datos['casa_cara'] = (datos['median_house_value'] > 500000).astype(int)
datos['ingreso_maximo'] = (datos['median_income'] < 15).astype(int)

datos

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,<1H OCEAN,...,income_per_bedroom,age_per_household,age_per_room,age_per_bedroom,population_per_room,population_per_bedroom,population_per_income,casa_vieja,casa_cara,ingreso_maximo
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,...,0.064536,0.325397,0.046591,0.317829,0.365909,2.496124,38.677749,0,0,1
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,...,0.007506,0.018453,0.002958,0.018987,0.338217,2.170886,289.228323,0,0,1
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,...,0.038197,0.293785,0.035446,0.273684,0.338105,2.610526,68.344035,1,0,1
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,...,0.024013,0.237443,0.040816,0.221277,0.437991,2.374468,98.881820,1,0,1
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,...,0.013736,0.200772,0.031961,0.185714,0.347265,2.017857,146.898237,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,0,...,0.004172,0.075758,0.015015,0.066845,0.507508,2.259358,541.562520,0,0,1
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,0,...,0.017045,0.157895,0.025825,0.120000,0.510760,2.373333,139.236546,0,0,1
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,0,...,0.003505,0.039261,0.007542,0.035052,0.446761,2.076289,592.352941,0,0,1
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,0,...,0.004565,0.051576,0.009677,0.044010,0.398387,1.811736,396.850900,0,0,1


In [479]:
X = datos.drop('median_house_value', axis=1)
y = datos['median_house_value']

In [480]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [481]:
from sklearn.linear_model import LinearRegression

modelo = LinearRegression()

modelo.fit(X_train, y_train)

In [482]:
predicciones = modelo.predict(X_test)

In [483]:
comparativa = {"predicciones":predicciones, "valor real": y_test}
result = pd.DataFrame(comparativa)
result

Unnamed: 0,predicciones,valor real
14416,206615.895596,245800.0
16383,151834.279526,137900.0
7731,199365.663661,218200.0
1410,171178.500737,220800.0
1335,192411.066494,170500.0
...,...,...
8291,400444.255137,500001.0
6274,175204.757005,157900.0
2997,116581.968983,100200.0
13440,130603.742431,127700.0


In [484]:
modelo.score(X_test, y_test)

0.708048226747596

In [485]:
modelo.score(X_train, y_train)

0.7207762865858584

In [None]:
# ¿El resultado fue mejor o peor?
#
# El resultado fue mejor que al de la clase. En clase el modelo dio un 65% de precisión,
# y en este caso y tras agregar nuevas características, el modelo dio un 72% de precisión,
# aún así no se llegó al 80% de precisión que se pide en el ejercicio.


# ¿Por qué crees que es así? Por qué son necesarios los cambios aplicados
# (fundamento del porqué afecta esos cambios)
#
# El principal motivo y por lo que subió mayormente la precisión del modelo, es debido a los
# tres incisos que se especificaron en el ejercicio, ya que al agregar las columnas con la información
# si una casa es vieja, si una casa es cara y si el ingreso es máximo, el modelo pudo predecir de mejor
# manera los datos. Además, al agregar más columnas con las relaciones entre las columnas ya existentes,
# se le añadió un extra de precisión al modelo, ya que se le dio más información para poder predecir.