In [None]:
import pandas as pd

In [None]:
# Step 1: Load the dataset from the CSV file (path is relative to the working directory)
nyc_data = pd.read_csv(filepath_or_buffer='AB_NYC_2019.csv')

In [None]:
# Step 2: Handle Missing Data
# Drop rows that lack a 'name' or a 'host_name'. The explicit .copy() makes the
# result an independent frame, so column assignments made on it afterwards do
# not raise SettingWithCopyWarning and cannot write through to nyc_data.
nyc_data_cleaned = nyc_data.dropna(subset=['name', 'host_name']).copy()

In [None]:
# Fill missing values in 'reviews_per_month' with 0, assuming no reviews were given.
# Built with .assign() so a new frame is returned instead of writing into a
# row-filtered frame through .loc, which can still emit SettingWithCopyWarning.
# Rebinding the name also makes this cell safe to re-run.
nyc_data_cleaned = nyc_data_cleaned.assign(
    reviews_per_month=nyc_data_cleaned['reviews_per_month'].fillna(0)
)

In [None]:
# Step 3: Handle Duplicate Records
# Keep the first row seen for each 'id' and discard any later repeats.
nyc_data_cleaned = nyc_data_cleaned[
    ~nyc_data_cleaned.duplicated(subset='id', keep='first')
]

In [None]:
# Step 4: Standardization (Consistency Check)
# Retain only the rows whose 'price' is zero or positive.
nyc_data_cleaned = nyc_data_cleaned.loc[nyc_data_cleaned['price'].ge(0)]

In [None]:
# Step 5: Outlier Detection (e.g., Price and Minimum Nights)
# The 0.99 quantile of each column serves as that column's upper limit.
min_nights_upper_limit = nyc_data_cleaned.loc[:, 'minimum_nights'].quantile(q=0.99)
price_upper_limit = nyc_data_cleaned.loc[:, 'price'].quantile(q=0.99)

In [None]:
# Remove outliers: keep only rows at or below both previously computed upper limits
# ('price' and 'minimum_nights'), applied as one combined boolean mask.
nyc_data_cleaned = nyc_data_cleaned.loc[
    nyc_data_cleaned['price'].le(price_upper_limit)
    & nyc_data_cleaned['minimum_nights'].le(min_nights_upper_limit)
]

In [None]:
# Step 6: Standardize 'last_review' date column
# Values that cannot be parsed become NaT (errors='coerce').
# The column is replaced via .assign() rather than written through .loc[:, ...]:
# on pandas 2.x a `.loc[:, col] = values` assignment sets the values in place and
# keeps the column's existing object dtype, so the column would not end up as
# datetime64[ns]. .assign() swaps in the converted Series with its own dtype.
nyc_data_cleaned = nyc_data_cleaned.assign(
    last_review=pd.to_datetime(nyc_data_cleaned['last_review'], errors='coerce')
)

In [None]:
# Final Check: Display cleaned dataset info and preview
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called on its own line instead of being passed to print(); passing it to print()
# emitted the report first and then the label followed by "None".
print("Cleaned Data Info:")
nyc_data_cleaned.info()
# Printed separately from the label so the first row of the table is not offset
# by the space print() inserts between its arguments.
print("Preview of Cleaned Data:")
print(nyc_data_cleaned.head())

<class 'pandas.core.frame.DataFrame'>
Index: 47939 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              47939 non-null  int64  
 1   name                            47939 non-null  object 
 2   host_id                         47939 non-null  int64  
 3   host_name                       47939 non-null  object 
 4   neighbourhood_group             47939 non-null  object 
 5   neighbourhood                   47939 non-null  object 
 6   latitude                        47939 non-null  float64
 7   longitude                       47939 non-null  float64
 8   room_type                       47939 non-null  object 
 9   price                           47939 non-null  int64  
 10  minimum_nights                  47939 non-null  int64  
 11  number_of_reviews               47939 non-null  int64  
 12  last_review                     38330