In [5]:
import pandas as pd
from scipy.stats import zscore

# Load the New York City dataset
nyc_data = pd.read_csv("nyc_data.csv")

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("Dataset Info:")
nyc_data.info()

# Check for any inconsistencies or anomalies in the data by listing the
# distinct values found in the 'name' column
print("\nData Integrity Check:")
print("Unique values in 'name' column:", nyc_data['name'].unique())

# Count the missing values in every column
print("\nMissing Data Check:")
missing_data = nyc_data.isnull().sum()
print(missing_data)

# Impute missing values in the 'reviews_per_month' column with the median.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the selected column: that chained in-place form is
# deprecated and does not modify the DataFrame under pandas copy-on-write.
median_reviews_per_month = nyc_data['reviews_per_month'].median()
nyc_data['reviews_per_month'] = nyc_data['reviews_per_month'].fillna(
    median_reviews_per_month
)

# Impute missing values in the 'last_review' column with its most frequent
# value. Series.mode() returns a Series (several values can tie), and passing
# a Series to fillna aligns on index labels, which would fill only the rows
# whose labels happen to match. Use the first mode as a scalar instead, and
# skip the step when the column has no non-null values at all.
mode_last_review = nyc_data['last_review'].mode()
if not mode_last_review.empty:
    nyc_data['last_review'] = nyc_data['last_review'].fillna(
        mode_last_review.iloc[0]
    )

# Fill every remaining empty cell with the next valid value below it in the
# same column (backward fill). DataFrame.bfill() replaces the deprecated
# fillna(method='bfill'). Cells with no valid value after them stay missing.
nyc_data = nyc_data.bfill()
# Nothing is written to disk at this point, so the message only reports that
# the imputation step has finished.
print("Missing value handling completed.")


# Check if missing values were handled properly
print("\nMissing Data Check after handling:")
missing_data_after = nyc_data.isnull().sum()
print(missing_data_after)

# Remove leading and trailing whitespace from the 'name' column.
# This is done before duplicate detection so that rows which differ only by
# stray whitespace around the name are recognised as duplicates.
nyc_data['name'] = nyc_data['name'].str.strip()

# Check for duplicate records (rows identical across all columns)
print("\nDuplicate Removal:")
duplicates_count = nyc_data.duplicated().sum()
print("Number of duplicate records:", duplicates_count)

# Remove duplicate records; by default the first occurrence of each is kept
nyc_data.drop_duplicates(inplace=True)

# Check for outliers in the 'price' column
print("\nOutlier Detection:")
# Use the z-score to flag prices more than 3 standard deviations from the
# mean. nan_policy='omit' keeps a missing price from turning every z-score
# into NaN (the default 'propagate' behaviour), which would silently report
# zero outliers. Missing prices themselves get a NaN score and are not
# counted. Outliers are only counted here, not removed or modified.
z_scores = zscore(nyc_data['price'], nan_policy='omit')
outliers = abs(z_scores) > 3
outliers_count = outliers.sum()
print("Number of outliers in 'price' column:", outliers_count)

# Clip outliers in the 'minimum_nights' column to the range 1..180
nyc_data['minimum_nights'] = nyc_data['minimum_nights'].clip(lower=1, upper=180)

# Save the cleaned dataset without writing the index as a column
nyc_data.to_csv("cleaned_nyc_dataset.csv", index=False)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43427 entries, 0 to 43426
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              43427 non-null  int64  
 1   name                            43411 non-null  object 
 2   host_id                         43426 non-null  float64
 3   host_name                       43405 non-null  object 
 4   neighbourhood_group             43426 non-null  object 
 5   neighbourhood                   43426 non-null  object 
 6   latitude                        43426 non-null  float64
 7   longitude                       43426 non-null  float64
 8   room_type                       43426 non-null  object 
 9   price                           43426 non-null  float64
 10  minimum_nights                  43426 non-null  float64
 11  number_of_reviews               43426 non-null  float64
 12  last_review       