#  🏙️ New York Rental Prediction

## 📦 Imports

In [19]:
import pandas as pd
import numpy as np 

from sklearn.impute import KNNImputer




## 📂 Load the Dataset

In [3]:
data = pd.read_csv("data/database.csv")

In [4]:
data.head()

Unnamed: 0,id,nome,host_id,host_name,bairro_group,bairro,latitude,longitude,room_type,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
1,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
2,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
3,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0
4,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129


## 🗑️ Drop Irrelevant Columns

In [13]:
data = data.drop(['id', 'nome', 'host_id', 'host_name'], axis=1)

## 📊 Explore the Data

In this section, we will analyze the distribution of three key variables: `bairro_group`, `bairro`, and `room_type`. 

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48894 entries, 0 to 48893
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   id                             48894 non-null  int64  
 1   nome                           48878 non-null  object 
 2   host_id                        48894 non-null  int64  
 3   host_name                      48873 non-null  object 
 4   bairro_group                   48894 non-null  object 
 5   bairro                         48894 non-null  object 
 6   latitude                       48894 non-null  float64
 7   longitude                      48894 non-null  float64
 8   room_type                      48894 non-null  object 
 9   price                          48894 non-null  int64  
 10  minimo_noites                  48894 non-null  int64  
 11  numero_de_reviews              48894 non-null  int64  
 12  ultima_review                  38842 non-null 

In [8]:
bairro_group_counts = data['bairro_group'].value_counts()
bairro_group_counts

Manhattan        21661
Brooklyn         20103
Queens            5666
Bronx             1091
Staten Island      373
Name: bairro_group, dtype: int64

In [9]:
bairro_counts = data['bairro'].value_counts()
bairro_counts

Williamsburg          3920
Bedford-Stuyvesant    3714
Harlem                2658
Bushwick              2465
Upper West Side       1971
                      ... 
Fort Wadsworth           1
Richmondtown             1
New Dorp                 1
Rossville                1
Willowbrook              1
Name: bairro, Length: 221, dtype: int64

In [10]:
room_type_counts = data['room_type'].value_counts()
room_type_counts

Entire home/apt    25409
Private room       22325
Shared room         1160
Name: room_type, dtype: int64

## 🧹 Missing Values

In [14]:
data.isna().sum()

bairro_group                         0
bairro                               0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimo_noites                        0
numero_de_reviews                    0
ultima_review                    10052
reviews_por_mes                  10052
calculado_host_listings_count        0
disponibilidade_365                  0
dtype: int64

In [16]:
data[data.isna().any(axis=1)]

Unnamed: 0,bairro_group,bairro,latitude,longitude,room_type,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365
1,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,,1,365
18,Manhattan,East Harlem,40.79685,-73.94872,Entire home/apt,190,7,0,,,2,249
25,Manhattan,Inwood,40.86754,-73.92639,Private room,80,4,0,,,1,0
35,Brooklyn,Bedford-Stuyvesant,40.68876,-73.94312,Private room,35,60,0,,,1,365
37,Brooklyn,Flatbush,40.63702,-73.96327,Private room,150,1,0,,,1,365
...,...,...,...,...,...,...,...,...,...,...,...,...
48889,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,,,2,9
48890,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,,,2,36
48891,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,,,1,27
48892,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,,,6,2


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48894 entries, 0 to 48893
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   bairro_group                   48894 non-null  object 
 1   bairro                         48894 non-null  object 
 2   latitude                       48894 non-null  float64
 3   longitude                      48894 non-null  float64
 4   room_type                      48894 non-null  object 
 5   price                          48894 non-null  int64  
 6   minimo_noites                  48894 non-null  int64  
 7   numero_de_reviews              48894 non-null  int64  
 8   ultima_review                  38842 non-null  object 
 9   reviews_por_mes                38842 non-null  float64
 10  calculado_host_listings_count  48894 non-null  int64  
 11  disponibilidade_365            48894 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usag

### 🔍 KNN Imputer: Filling Missing Values

In [20]:
knn_imputer = KNNImputer(n_neighbors=5)
data['reviews_por_mes'] = knn_imputer.fit_transform(data[['reviews_por_mes']])

In [21]:
data.head()

Unnamed: 0,bairro_group,bairro,latitude,longitude,room_type,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365
0,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
1,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,1.373251,1,365
2,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
3,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0
4,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129
