In [74]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.multivariate.manova import MANOVA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder


In [75]:
df = pd.read_csv('listings_limpio_2.csv', index_col=0)
df.head()

Unnamed: 0,neighbourhood_group,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm
0,Chamartín,Private room,3.021213,1,81,0.57,1,149,3
1,Latina,Private room,2.661819,4,33,0.38,2,131,0
2,Arganzuela,Entire home/apt,2.932182,15,0,1.347131,6,311,0
3,Centro,Entire home/apt,3.249172,5,14,0.16,1,167,4
4,Arganzuela,Private room,2.62248,2,154,1.09,1,344,5


# EJERCICIO 1

Estandarizar las columnas de las variables predictoras

 - Variable respuesta: price
 - Variable predictora: availability_365, minimum_nights, number_of_reviews, reviews_per_month, number_of_reviews_ltm, 'Private room', 'Entire home/apt', 'Shared room', 'Hotel room'

In [76]:
df.dtypes

neighbourhood_group                object
room_type                          object
price                             float64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
number_of_reviews_ltm               int64
dtype: object

 - Hacemos encoding de room_type

In [77]:
habitaciones = df['room_type'].values.reshape(-1,1)

In [78]:
barrios = df['neighbourhood_group'].values.reshape(-1,1)

In [79]:
df['neighbourhood_group'].unique()

array(['Chamartín', 'Latina', 'Arganzuela', 'Centro', 'Salamanca',
       'Fuencarral - El Pardo', 'Ciudad Lineal', 'Chamberí', 'Villaverde',
       'Hortaleza', 'Carabanchel', 'Retiro', 'Tetuán',
       'San Blas - Canillejas', 'Barajas', 'Usera', 'Puente de Vallecas',
       'Villa de Vallecas', 'Moncloa - Aravaca', 'Moratalaz', 'Vicálvaro'],
      dtype=object)

In [80]:
df['room_type'].unique()

array(['Private room', 'Entire home/apt', 'Shared room', 'Hotel room'],
      dtype=object)

In [81]:
enc = OneHotEncoder()
trans = enc.fit_transform(barrios)
df[['Chamartín', 'Latina', 'Arganzuela', 'Centro', 'Salamanca',
       'Fuencarral_El_Pardo', 'Ciudad_Lineal', 'Chamberí', 'Villaverde',
       'Hortaleza', 'Carabanchel', 'Retiro', 'Tetuán',
       'San_Blas_Canillejas', 'Barajas', 'Usera', 'Puente_de_Vallecas',
       'Villa_de_Vallecas', 'Moncloa_Aravaca', 'Moratalaz', 'Vicálvaro']] = pd.DataFrame(trans.toarray())

In [82]:
enc = OneHotEncoder()
trans = enc.fit_transform(habitaciones)
df[['private_room', 'entire_home', 'shared_room', 'hotel_room']] = pd.DataFrame(trans.toarray())

In [83]:
df.head()

Unnamed: 0,neighbourhood_group,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,Chamartín,...,Usera,Puente_de_Vallecas,Villa_de_Vallecas,Moncloa_Aravaca,Moratalaz,Vicálvaro,private_room,entire_home,shared_room,hotel_room
0,Chamartín,Private room,3.021213,1,81,0.57,1,149,3,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,Latina,Private room,2.661819,4,33,0.38,2,131,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,Arganzuela,Entire home/apt,2.932182,15,0,1.347131,6,311,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,Centro,Entire home/apt,3.249172,5,14,0.16,1,167,4,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,Arganzuela,Private room,2.62248,2,154,1.09,1,344,5,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [84]:
df.neighbourhood_group.unique()

array(['Chamartín', 'Latina', 'Arganzuela', 'Centro', 'Salamanca',
       'Fuencarral - El Pardo', 'Ciudad Lineal', 'Chamberí', 'Villaverde',
       'Hortaleza', 'Carabanchel', 'Retiro', 'Tetuán',
       'San Blas - Canillejas', 'Barajas', 'Usera', 'Puente de Vallecas',
       'Villa de Vallecas', 'Moncloa - Aravaca', 'Moratalaz', 'Vicálvaro'],
      dtype=object)

 - Estandarizamos todas las columnas numéricas

In [85]:
numericas = df.select_dtypes(np.number)

In [86]:
scaler = StandardScaler()
col_escaladas2 = scaler.fit_transform(numericas)

df_escalado = pd.DataFrame(col_escaladas2, columns = numericas.columns)
df_escalado.head(2)

Unnamed: 0,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,Chamartín,Latina,Arganzuela,...,Usera,Puente_de_Vallecas,Villa_de_Vallecas,Moncloa_Aravaca,Moratalaz,Vicálvaro,private_room,entire_home,shared_room,hotel_room
0,-0.167769,-0.17692,0.647995,-0.561126,-0.382089,0.006434,-0.269185,-0.241623,-0.087978,-0.189869,...,-0.154565,-0.212571,-0.121743,-0.064408,-0.067086,-0.09755,-1.243109,-0.0916,1.301434,-0.11621
1,-1.004904,-0.09465,-0.047665,-0.698318,-0.346261,-0.122452,-0.493411,-0.241623,-0.087978,-0.189869,...,-0.154565,-0.212571,-0.121743,-0.064408,-0.067086,-0.09755,-1.243109,-0.0916,1.301434,-0.11621


# EJERCICIO 2

Cread un ANOVA, siendo la variable respuesta price y las variables predictoras el resto de las columnas del dataframe

 - Empezamos el ANOVA

In [87]:
lm = ols('price ~ minimum_nights + number_of_reviews + reviews_per_month + calculated_host_listings_count + availability_365 + number_of_reviews_ltm + private_room	+ entire_home + shared_room	+ hotel_room + Chamartín + Latina + Arganzuela + Centro + Salamanca +Fuencarral_El_Pardo + Ciudad_Lineal + Chamberí + Villaverde + Hortaleza + Carabanchel + Retiro + Tetuán + San_Blas_Canillejas + Barajas + Usera + Puente_de_Vallecas + Villa_de_Vallecas + Moncloa_Aravaca + Moratalaz + Vicálvaro', data=numericas).fit()
sm.stats.anova_lm(lm)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
minimum_nights,1.0,11.835021,11.835021,68.478152,1.376111e-16
number_of_reviews,1.0,0.489864,0.489864,2.834381,0.09228453
reviews_per_month,1.0,2.251739,2.251739,13.028698,0.0003076271
calculated_host_listings_count,1.0,54.256361,54.256361,313.930621,1.2679680000000001e-69
availability_365,1.0,121.498391,121.498391,702.997113,7.540687e-152
number_of_reviews_ltm,1.0,0.225954,0.225954,1.307383,0.252885
private_room,1.0,0.332005,0.332005,1.921,0.1657645
entire_home,1.0,1.140051,1.140051,6.596402,0.01022689
shared_room,1.0,0.220566,0.220566,1.27621,0.2586206
hotel_room,1.0,0.033838,0.033838,0.19579,0.6581464
