In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv('AB_NYC_2019.csv')

In [3]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29203 entries, 0 to 29202
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              29203 non-null  int64  
 1   name                            29187 non-null  object 
 2   host_id                         29202 non-null  float64
 3   host_name                       29184 non-null  object 
 4   neighbourhood_group             29202 non-null  object 
 5   neighbourhood                   29202 non-null  object 
 6   latitude                        29202 non-null  float64
 7   longitude                       29202 non-null  float64
 8   room_type                       29202 non-null  object 
 9   price                           29202 non-null  float64
 10  minimum_nights                  29202 non-null  float64
 11  number_of_reviews               29202 non-null  float64
 12  last_review                     

In [4]:
print("\
First few rows of the dataset:")
print(df.head())

First few rows of the dataset:
     id                                              name  host_id  \
0  2539                Clean & quiet apt home by the park   2787.0   
1  2595                             Skylit Midtown Castle   2845.0   
2  3647               THE VILLAGE OF HARLEM....NEW YORK !   4632.0   
3  3831                   Cozy Entire Floor of Brownstone   4869.0   
4  5022  Entire Apt: Spacious Studio/Loft by central park   7192.0   

     host_name neighbourhood_group neighbourhood  latitude  longitude  \
0         John            Brooklyn    Kensington  40.64749  -73.97237   
1     Jennifer           Manhattan       Midtown  40.75362  -73.98377   
2    Elisabeth           Manhattan        Harlem  40.80902  -73.94190   
3  LisaRoxanne            Brooklyn  Clinton Hill  40.68514  -73.95976   
4        Laura           Manhattan   East Harlem  40.79851  -73.94399   

         room_type  price  minimum_nights  number_of_reviews last_review  \
0     Private room  149.0        

In [5]:
print("\
Summary statistics:")
print(df.describe())


Summary statistics:
                 id       host_id      latitude     longitude         price  \
count  2.920300e+04  2.920200e+04  29202.000000  29202.000000  29202.000000   
mean   1.141531e+07  3.418226e+07     40.729139    -73.954652    148.219095   
std    6.882951e+06  4.006633e+07      0.053707      0.041836    226.261213   
min    2.539000e+03  2.571000e+03     40.499790    -74.242850      0.000000   
25%    5.371022e+06  4.843862e+06     40.689350    -73.982610     70.000000   
50%    1.152941e+07  1.812999e+07     40.722750    -73.956745    109.000000   
75%    1.760850e+07  4.805519e+07     40.763847    -73.939863    174.000000   
max    2.240994e+07  1.640484e+08     40.911690    -73.712990  10000.000000   

       minimum_nights  number_of_reviews  reviews_per_month  \
count    29202.000000       29202.000000       24373.000000   
mean         7.061468          32.499418           1.019477   
std         22.488124          54.200405           1.389368   
min          1.0

In [6]:
print("\
Missing values:")
print(df.isnull().sum())

Missing values:
id                                   0
name                                16
host_id                              1
host_name                           19
neighbourhood_group                  1
neighbourhood                        1
latitude                             1
longitude                            1
room_type                            1
price                                1
minimum_nights                       1
number_of_reviews                    1
last_review                       4830
reviews_per_month                 4830
calculated_host_listings_count       1
availability_365                     1
dtype: int64


In [7]:
print("\
Number of duplicate rows:")
print(df.duplicated().sum())

Number of duplicate rows:
0


In [8]:

df['name'].fillna('Unknown', inplace=True)
df['host_name'].fillna('Unknown', inplace=True)

df['last_review'].fillna('1970-01-01', inplace=True)

df['reviews_per_month'].fillna(0, inplace=True)

print("\
Missing values after imputation:")
print(df.isnull().sum())

Missing values after imputation:
id                                0
name                              0
host_id                           1
host_name                         0
neighbourhood_group               1
neighbourhood                     1
latitude                          1
longitude                         1
room_type                         1
price                             1
minimum_nights                    1
number_of_reviews                 1
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    1
availability_365                  1
dtype: int64


In [9]:
import pandas as pd
import numpy as np
from datetime import datetime



In [10]:

df = pd.read_csv('AB_NYC_2019.csv')


In [11]:
df['name'] = df['name'].str.strip().str.title()
df['host_name'] = df['host_name'].str.strip().str.title()
df['neighbourhood'] = df['neighbourhood'].str.strip().str.title()
df['neighbourhood_group'] = df['neighbourhood_group'].str.strip().str.title()


In [12]:
df['price'] = df['price'].astype(int)
df['last_review'] = pd.to_datetime(df['last_review'], format='%Y-%m-%d', errors='coerce')
df['latitude'] = df['latitude'].round(5)
df['longitude'] = df['longitude'].round(5)
df['reviews_per_month'] = df['reviews_per_month'].round(2)

In [13]:
print(df.head())
print("\
Dataset Info:")
print(df.info())

     id                                              name  host_id  \
0  2539                Clean & Quiet Apt Home By The Park     2787   
1  2595                             Skylit Midtown Castle     2845   
2  3647               The Village Of Harlem....New York !     4632   
3  3831                   Cozy Entire Floor Of Brownstone     4869   
4  5022  Entire Apt: Spacious Studio/Loft By Central Park     7192   

     host_name neighbourhood_group neighbourhood  latitude  longitude  \
0         John            Brooklyn    Kensington  40.64749  -73.97237   
1     Jennifer           Manhattan       Midtown  40.75362  -73.98377   
2    Elisabeth           Manhattan        Harlem  40.80902  -73.94190   
3  Lisaroxanne            Brooklyn  Clinton Hill  40.68514  -73.95976   
4        Laura           Manhattan   East Harlem  40.79851  -73.94399   

         room_type  price  minimum_nights  number_of_reviews last_review  \
0     Private room    149               1                  9  20

In [14]:
categorical_columns = ['neighbourhood_group', 'room_type']
for col in categorical_columns:
    print(f"\
Unique values in {col}:")
    print(df[col].unique())

numerical_columns = ['price', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365']
for col in numerical_columns:
    print(f"\
Range of {col}:")
    print(f"Min: {df[col].min()}, Max: {df[col].max()}")

print("\
Date range of last_review:")
print(f"Earliest: {df['last_review'].min()}, Latest: {df['last_review'].max()}")

Unique values in neighbourhood_group:
['Brooklyn' 'Manhattan' 'Queens' 'Staten Island' 'Bronx']
Unique values in room_type:
['Private room' 'Entire home/apt' 'Shared room']
Range of price:
Min: 0, Max: 10000
Range of minimum_nights:
Min: 1, Max: 1250
Range of number_of_reviews:
Min: 0, Max: 629
Range of reviews_per_month:
Min: 0.01, Max: 58.5
Range of calculated_host_listings_count:
Min: 1, Max: 327
Range of availability_365:
Min: 0, Max: 365
Date range of last_review:
Earliest: 2011-03-28 00:00:00, Latest: 2019-07-08 00:00:00


In [15]:

def detect_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return outliers

numerical_columns = ['price', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365']

outliers_dict = {}
for col in numerical_columns:
    outliers_dict[col] = detect_outliers(df, col)
    print(f"Outliers in {col}:")
    print(outliers_dict[col].shape)

def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

for col in numerical_columns:
    df = remove_outliers(df, col)

print(df.head())
print("\
Dataset Info after removing outliers:")
print(df.info())

Outliers in price:
(2972, 16)
Outliers in minimum_nights:
(6607, 16)
Outliers in number_of_reviews:
(6021, 16)
Outliers in reviews_per_month:
(1793, 16)
Outliers in calculated_host_listings_count:
(7081, 16)
Outliers in availability_365:
(0, 16)
      id                                              name  host_id host_name  \
4   5022  Entire Apt: Spacious Studio/Loft By Central Park     7192     Laura   
10  5295                  Beautiful 1Br On Upper West Side     7702      Lena   
20  7801                  Sweet And Spacious Brooklyn Loft    21207     Chaya   
25  8505                Sunny Bedroom Across Prospect Park    25326   Gregory   
27  9357                              Midtown Pied-A-Terre    30193     Tommi   

   neighbourhood_group    neighbourhood  latitude  longitude        room_type  \
4            Manhattan      East Harlem  40.79851  -73.94399  Entire home/apt   
10           Manhattan  Upper West Side  40.80316  -73.96545  Entire home/apt   
20            Brooklyn  