# EDA

In [79]:
import numpy as np
import pandas as pd
from haversine import haversine, Unit
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt

In [68]:
df = pd.read_csv("/workspaces/Emiliano0041-IntroML/data/raw/AB_NYC_2019.csv")
df

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48890,36484665,Charming one bedroom - newly renovated rowhouse,8232441,Sabrina,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,,,2,9
48891,36485057,Affordable room in Bushwick/East Williamsburg,6570630,Marisol,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,,,2,36
48892,36485431,Sunny Studio at Historical Neighborhood,23492952,Ilgar & Aysel,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,,,1,27
48893,36485609,43rd St. Time Square-cozy single bed,30985759,Taz,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,,,6,2


### Conocecmos el data set con el que vamos a trabajar
- Para eso nos interesan algunas cosas como el tamaño en filas y columnas y las columnas que tenemos.

In [69]:
df.shape
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

###

- Con df.drop verificamos que no haya ningun alojamiento duplicado.

In [70]:
df.drop("id", axis=1).duplicated().sum()

np.int64(0)

- Con esto nos damos cuenta que hay casi 11,5 mil hostes que administran mas de un alojamiento. Dato que quizas nos pueda servir mas adelante.

In [71]:
df.duplicated("host_id").sum()

np.int64(11438)

- En este punto lo que hacemos es eliminar las columnas que no veo relevantes para el estudio, como lo son el nombre del alojamiento y el nombre del host.


In [72]:
df.drop(["name", "host_name"], axis=1, inplace=True)


- Lo que hago a continuacion es eliminar todos los alojamientos que tengan como precio = 0, ya que pueden ser errores o outliers que me pueden entorpecer el analisis. 

In [None]:
df = df[df["price"] > 0]


Unnamed: 0,id,host_id,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,distance_time_square,type_room
0,2539,2787,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365,12.337915,2
1,2595,2845,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,0.508366,1
2,3647,4632,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,,1,365,6.757250,2
3,3831,4869,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194,8.387046,1
4,5022,7192,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0,5.701504,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48890,36484665,8232441,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,,,2,9,9.330768,2
48891,36485057,6570630,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,,,2,36,7.644604,2
48892,36485431,23492952,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,,,1,27,7.030997,1
48893,36485609,30985759,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,,,6,2,0.476485,3


- Ahora cree una nueva columna, determinando la latitud y longitud del Time square, en la que podemos apreciar la distancia de cada alojamiento a esta atraccion turistica, una de las mas famosas de Nueva York.

- Para esto importe haversine, funcion que nos permite calcular distancias geograficas si tenemos latitud y longitud, hacia un punto de interes.

In [82]:
time_square = (40.7580, -73.9855)
df["distance_time_square"] = df.apply(lambda row : 
                                      haversine((row["latitude"], row["longitude"]), 
                                                time_square, unit=Unit.KILOMETERS), axis=1)

- Ahora con .describe() podemos conocer a fondo alguna de las medidas estadisticas que nos serviran para sacar algunas conclusiones. Dude si redondear algunos resultados y convertir los numeros a enteros para no dejarlos en notacion cientifica, pero quiero ser preciso en mis estadisticas, por eso lo dejo asi.

In [83]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,48884.0,19016790.0,10984320.0,2539.0,9470548.0,19675740.0,29152970.0,36487240.0
host_id,48884.0,67622030.0,78616660.0,2438.0,7817310.0,30792570.0,107434400.0,274321300.0
latitude,48884.0,40.72895,0.05453177,40.49979,40.6901,40.72308,40.76312,40.91306
longitude,48884.0,-73.95218,0.04615874,-74.24442,-73.98308,-73.95569,-73.93629,-73.71299
price,48884.0,152.7551,240.1703,10.0,69.0,106.0,175.0,10000.0
minimum_nights,48884.0,7.029887,20.51222,1.0,1.0,3.0,5.0,1250.0
number_of_reviews,48884.0,23.27199,44.55133,0.0,1.0,5.0,24.0,629.0
reviews_per_month,38833.0,1.373128,1.680391,0.01,0.19,0.72,2.02,58.5
calculated_host_listings_count,48884.0,7.144628,32.95619,1.0,1.0,1.0,2.0,327.0
availability_365,48884.0,112.7795,131.6273,0.0,0.0,45.0,227.0,365.0


- Ahora con .unique(), verificamos cuantos tipos de alojamiento tenemos, para luego convertirlos en datos numericos.

- Luego, con .value_counts() tambien podemos verificar que no hayan errores de tipeo por ejemplo en ninguno de las filas.

In [76]:
df["room_type"].unique()
df["room_type"].value_counts()

room_type
Entire home/apt    25407
Private room       22319
Shared room         1158
Name: count, dtype: int64

- Aqui lo que hago es crear una nueva columna "type_room" en donde le otorgue un valor numerico al tipo de alojamiento. En el cual:

--- Entire home/apt = 1

--- Private room = 2

--- Shared room = 3

- Con esto ya podemos usar el tipo de alojamiento en analisis estadisticos numericos

In [77]:
df["type_room"] = df["room_type"].map({
    "Entire home/apt" : 1,
    "Private room" : 2,
    "Shared room" : 3
})
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["type_room"] = df["room_type"].map({


Unnamed: 0,id,host_id,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,distance_time_square,type_room
0,2539,2787,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365,12.337915,2
1,2595,2845,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,0.508366,1
2,3647,4632,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,,1,365,6.757250,2
3,3831,4869,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194,8.387046,1
4,5022,7192,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0,5.701504,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48890,36484665,8232441,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,,,2,9,9.330768,2
48891,36485057,6570630,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,,,2,36,7.644604,2
48892,36485431,23492952,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,,,1,27,7.030997,1
48893,36485609,30985759,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,,,6,2,0.476485,3
