# Explore here

In [2]:
# 0) Imports y paths
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Relative path to the raw CSV file used throughout the notebook.
RAW = Path("../data/raw/internal-link.csv")

# 1) Load the raw file into a DataFrame
df = pd.read_csv(filepath_or_buffer=RAW)

# 2) Quick look: the cell's last expression renders the first five rows
df.head(n=5)



Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [3]:
# Count fully duplicated rows once the `id` column is left out of the comparison.
# Nothing is removed here; the cell only reports how many duplicates exist.
df.drop(columns=["id"]).duplicated().sum()

np.int64(0)

In [4]:
# Drop the columns that are not used in the rest of the analysis.
# The result is reassigned instead of mutating with inplace=True, and
# errors="ignore" lets the cell be re-run after the columns are already gone
# (the in-place version raised KeyError on a second execution).
df = df.drop(
    columns=["id", "name", "host_name", "latitude", "longitude"],
    errors="ignore",
)
df.head()

Unnamed: 0,host_id,neighbourhood_group,neighbourhood,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2787,Brooklyn,Kensington,Private room,149,1,9,2018-10-19,0.21,6,365
1,2845,Manhattan,Midtown,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,4632,Manhattan,Harlem,Private room,150,3,0,,,1,365
3,4869,Brooklyn,Clinton Hill,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,7192,Manhattan,East Harlem,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [2]:
# --- Helpers
def top_count(df, col, n=20):
    """Return the `n` most frequent values of `df[col]` with their frequencies.

    The result is a two-column DataFrame: the first column is named after
    `col` and holds the values, the second is named "count". Rows keep the
    order produced by `value_counts()` (most frequent first).
    """
    ranked = df[col].value_counts().head(n).reset_index()
    return ranked.set_axis([col, "count"], axis=1)

# 2x2 grid: one panel per categorical variable.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
(ax00, ax01), (ax10, ax11) = axes

# 1) host_id (top 20 hosts by number of listings)
# host_id is an integer column, so both plotted variables would be numeric and
# seaborn would default to a vertical orientation (treating `count` as the
# categorical axis). Casting the ids to strings makes them categorical labels,
# giving one horizontal bar per host in descending-count order.
top_hosts = top_count(df, "host_id", n=20)
top_hosts["host_id"] = top_hosts["host_id"].astype(str)
sns.barplot(data=top_hosts, y="host_id", x="count", orient="h", ax=ax00)
ax00.set_title("Top 20 host_id por nº de listings")
ax00.set_xlabel("Listings")
ax00.set_ylabel("host_id")

# 2) neighbourhood_group (few categories -> direct countplot, most frequent first)
sns.countplot(data=df, x="neighbourhood_group", ax=ax01, order=df["neighbourhood_group"].value_counts().index)
ax01.set_title("neighbourhood_group")
ax01.set_xlabel("")
ax01.tick_params(axis="x", rotation=15)

# 3) neighbourhood (top 20 neighbourhoods)
top_neigh = top_count(df, "neighbourhood", n=20)
sns.barplot(data=top_neigh, y="neighbourhood", x="count", ax=ax10)
ax10.set_title("Top 20 neighbourhood")
ax10.set_xlabel("Listings")
ax10.set_ylabel("neighbourhood")

# 4) room_type (few categories -> direct countplot, most frequent first)
sns.countplot(data=df, x="room_type", ax=ax11, order=df["room_type"].value_counts().index)
ax11.set_title("room_type")
ax11.set_xlabel("")
ax11.tick_params(axis="x", rotation=15)

plt.tight_layout()
plt.show()

NameError: name 'plt' is not defined

In [14]:
# 3) Duplicate check on the listing identifier (skipped when the column is absent)
if "id" in df.columns:
    print("Duplicados por id:", df["id"].duplicated().sum())

# 4) Summary statistics for every numeric column, including tail percentiles
num_cols = list(df.select_dtypes(include=[np.number]).columns)
tail_percentiles = [.01, .05, .25, .5, .75, .95, .99]
display(df[num_cols].describe(percentiles=tail_percentiles).T)

# 5) Basic sanity checks on price and minimum stay
assert df["price"].ge(0).all(), "Hay precios negativos"
print("Precios = 0:", df["price"].eq(0).sum())
print("minimum_nights <=0:", df["minimum_nights"].le(0).sum())

# 6) Frequency table (NaN included) for each key categorical column that is present
cat_cols = ["neighbourhood_group","neighbourhood","room_type"]
for c in cat_cols:
    if c not in df.columns:
        continue
    display(df[c].value_counts(dropna=False).to_frame("count"))


Duplicados por id: 0


Unnamed: 0,count,mean,std,min,1%,5%,25%,50%,75%,95%,99%,max
id,48895.0,19017140.0,10983110.0,2539.0,171715.1,1222383.0,9471945.0,19677280.0,29152180.0,35259100.0,36238880.0,36487240.0
host_id,48895.0,67620010.0,78610970.0,2438.0,100692.2,815564.1,7822033.0,30793820.0,107434400.0,241764600.0,267711800.0,274321300.0
latitude,48895.0,40.72895,0.05453008,40.49979,40.596687,40.64611,40.6901,40.72307,40.76311,40.82564,40.86466,40.91306
longitude,48895.0,-73.95217,0.04615674,-74.24442,-74.026774,-74.00388,-73.98307,-73.95568,-73.93627,-73.86577,-73.77692,-73.71299
price,48895.0,152.7207,240.1542,0.0,30.0,40.0,69.0,106.0,175.0,355.0,799.0,10000.0
minimum_nights,48895.0,7.029962,20.51055,1.0,1.0,1.0,1.0,3.0,5.0,30.0,45.0,1250.0
number_of_reviews,48895.0,23.27447,44.55058,0.0,0.0,0.0,1.0,5.0,24.0,114.0,214.0,629.0
reviews_per_month,38843.0,1.373221,1.680442,0.01,0.02,0.04,0.19,0.72,2.02,4.64,7.1958,58.5
calculated_host_listings_count,48895.0,7.143982,32.95252,1.0,1.0,1.0,1.0,1.0,2.0,15.0,232.0,327.0
availability_365,48895.0,112.7813,131.6223,0.0,0.0,0.0,0.0,45.0,227.0,359.0,365.0,365.0


Precios = 0: 11
minimum_nights <=0: 0


Unnamed: 0_level_0,count
neighbourhood_group,Unnamed: 1_level_1
Manhattan,21661
Brooklyn,20104
Queens,5666
Bronx,1091
Staten Island,373


Unnamed: 0_level_0,count
neighbourhood,Unnamed: 1_level_1
Williamsburg,3920
Bedford-Stuyvesant,3714
Harlem,2658
Bushwick,2465
Upper West Side,1971
...,...
Richmondtown,1
Fort Wadsworth,1
New Dorp,1
Rossville,1


Unnamed: 0_level_0,count
room_type,Unnamed: 1_level_1
Entire home/apt,25409
Private room,22326
Shared room,1160
