# Airbnb NYC

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the raw listings export. The path is relative, so the CSV is expected
# to be in the notebook's working directory.
df = pd.read_csv('AB_NYC_2019.csv')

# Sanity check: report the frame's dimensions and show its first five rows.
# The bullet in these labels had been mis-encoded (UTF-8 bytes read as
# cp1252) and printed as garbage; it is restored to the intended character.
print("🔹 Original dataset shape:", df.shape)
print("\n🔹 Preview:")
print(df.head())

ðŸ”¹ Original dataset shape: (48895, 16)

ðŸ”¹ Preview:
     id                                              name  host_id  \
0  2539                Clean & quiet apt home by the park     2787   
1  2595                             Skylit Midtown Castle     2845   
2  3647               THE VILLAGE OF HARLEM....NEW YORK !     4632   
3  3831                   Cozy Entire Floor of Brownstone     4869   
4  5022  Entire Apt: Spacious Studio/Loft by central park     7192   

     host_name neighbourhood_group neighbourhood  latitude  longitude  \
0         John            Brooklyn    Kensington  40.64749  -73.97237   
1     Jennifer           Manhattan       Midtown  40.75362  -73.98377   
2    Elisabeth           Manhattan        Harlem  40.80902  -73.94190   
3  LisaRoxanne            Brooklyn  Clinton Hill  40.68514  -73.95976   
4        Laura           Manhattan   East Harlem  40.79851  -73.94399   

         room_type  price  minimum_nights  number_of_reviews last_review  \
0     Pr

### CHECK DATA INTEGRITY 

In [5]:
# Integrity report: each entry pairs a printed heading with the summary shown
# beneath it (column dtypes, count of fully duplicated rows, nulls per column).
integrity_report = [
    ("\n COLUMN DATA TYPES", df.dtypes),
    ("\n CHECK FOR DUPLICATES", df.duplicated().sum()),
    ("\n CHECK MISSING VALUES", df.isnull().sum()),
]

for heading, summary in integrity_report:
    print(heading)
    print(summary)


 COLUMN DATA TYPES
id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

 CHECK FOR DUPLICATES
0

 CHECK MISSING VALUES
id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitu

In [6]:
# Parse last_review into datetimes. errors="coerce" turns any value that
# cannot be parsed into NaT instead of raising.
df["last_review"] = pd.to_datetime(df["last_review"], errors="coerce")

# Fill missing reviews_per_month with 0 (no reviews → 0 frequency)
df["reviews_per_month"] = df["reviews_per_month"].fillna(0)

# Fill missing last_review with the earliest date in the dataset.
# NOTE(review): the filled value is a placeholder, not a real review date.
# After this line the placeholder rows cannot be told apart from genuine
# reviews on that date using last_review alone; presumably
# number_of_reviews == 0 identifies them — confirm before any date-based
# analysis.
df["last_review"] = df["last_review"].fillna(df["last_review"].min())

# name and host_name are not filled by this cell, so they can still report
# missing values in the summary below.
print("\n🔧 Missing values after handling:")
print(df.isnull().sum())


ðŸ”§ Missing values after handling:
id                                 0
name                              16
host_id                            0
host_name                         21
neighbourhood_group                0
neighbourhood                      0
latitude                           0
longitude                          0
room_type                          0
price                              0
minimum_nights                     0
number_of_reviews                  0
last_review                        0
reviews_per_month                  0
calculated_host_listings_count     0
availability_365                   0
dtype: int64


In [8]:
# Discard rows that repeat an earlier row in every column, keeping the first
# occurrence, then report the resulting dimensions.
df = df.drop_duplicates(keep="first")
deduplicated_shape = df.shape
print("\n After duplicate removal:", deduplicated_shape)


 After duplicate removal: (48895, 16)


### STANDARDIZATION

In [9]:
# Make text values consistent (lowercase)
df["host_name"] = df["host_name"].str.strip().str.lower()
df["neighbourhood_group"] = df["neighbourhood_group"].str.lower()
df["neighbourhood"] = df["neighbourhood"].str.lower()
df["room_type"] = df["room_type"].str.lower()

print("\n String standardization complete!")


 String standardization complete!


### OUTLIER DETECTION & CLEANING

In [10]:
# Detect price outliers using the IQR method: anything further than
# 1.5 * IQR outside the quartiles is treated as an outlier.
Q1, Q3 = df["price"].quantile([0.25, 0.75])
IQR = Q3 - Q1

lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

# Cap prices to the limits instead of dropping rows, to reduce skew while
# keeping every listing. A single clip replaces the two np.where passes.
# The float cast comes first so the resulting dtype is always float64, as it
# was before, and does not depend on whether the limits are whole numbers.
# NOTE(review): in the recorded run the quartiles are 69 and 175, so
# lower_limit is negative and the lower cap never triggers; prices of 0
# pass through unchanged. Decide separately whether those rows are valid.
df["price"] = df["price"].astype("float64").clip(lower=lower_limit, upper=upper_limit)

print("\n💰 Price outliers capped!")
print(df["price"].describe())



ðŸ’° Price outliers capped!
count    48895.000000
mean       132.979753
std         83.530504
min          0.000000
25%         69.000000
50%        106.000000
75%        175.000000
max        334.000000
Name: price, dtype: float64


In [12]:
# Write the cleaned frame to a CSV in the working directory.
# index=False keeps the row index out of the file.
cleaned_path = "AB_NYC_2019_cleaned.csv"
df.to_csv(path_or_buf=cleaned_path, index=False)

# End project