PROJECT 3 PROPOSAL LEVEL 1

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Read the listings CSV (path is relative to the working directory) into `df`,
# the DataFrame that the following cells inspect and reassign.
df = pd.read_csv('AB_NYC_2019.csv')

In [2]:
# First look at the loaded frame: dimensions, per-column dtypes and non-null
# counts, and the number of missing values in each column.
print("Initial Dataset Shape:", df.shape)
print("\nInitial Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print("\nInitial Missing Values:")
print(df.isnull().sum())

Initial Dataset Shape: (48895, 16)

Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 

In [3]:
# Tally fully identical rows before removing them so the count can be reported.
n_duplicates = df.duplicated().sum()
print("\nDuplicate Rows:", n_duplicates)

# The de-duplicated frame replaces `df` for every later cell.
df = df.drop_duplicates()
n_rows, n_cols = df.shape
print("Shape after removing duplicates:", (n_rows, n_cols))


Duplicate Rows: 0
Shape after removing duplicates: (48895, 16)


In [4]:
# Parse review dates; entries that cannot be parsed become NaT instead of raising.
df['last_review'] = pd.to_datetime(df['last_review'], errors='coerce')

numeric_columns = [
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]
categorical_columns = [
    'name',
    'host_name',
    'neighbourhood_group',
    'neighbourhood',
    'room_type',
]

# One replacement value per column: the column median for numeric columns and
# the first mode for categorical columns. Each value depends only on its own
# column, so they can all be computed up front and applied in a single fillna.
fill_values = {name: df[name].median() for name in numeric_columns}
fill_values.update({name: df[name].mode()[0] for name in categorical_columns})

df = df.fillna(value=fill_values)

In [5]:
# Strip currency formatting from price and store it as float.
# regex=False makes '$' and ',' literal characters: '$' is a regex
# metacharacter, and the default of `regex` differs between pandas versions
# (pandas 1.x emits a FutureWarning for this call without it).
df['price'] = (
    df['price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Coerce coordinates to numeric; entries that cannot be parsed become NaN.
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')

In [None]:
def detect_outliers(df, column, iqr_multiplier=1.5):
    """Return the IQR-based outlier fences for one column.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` where Q1/Q3 are the
    25th/75th percentiles of ``df[column]``, ``IQR = Q3 - Q1`` and ``k`` is
    ``iqr_multiplier``. The function only computes the bounds; it does not
    flag, drop, or modify any rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to analyse.
    column : str
        Name of the column whose fences are computed.
    iqr_multiplier : float, default 1.5
        How many IQRs beyond the quartiles the fences are placed. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)``.

    Raises
    ------
    ValueError
        If ``iqr_multiplier`` is negative, which would place the fences
        inside the quartiles.
    """
    if iqr_multiplier < 0:
        raise ValueError("iqr_multiplier must be non-negative")

    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = iqr_multiplier * (third_quartile - first_quartile)

    lower_bound = first_quartile - spread
    upper_bound = third_quartile + spread
    return lower_bound, upper_bound

numeric_cols_for_outliers = ['price', 'minimum_nights', 'number_of_reviews']

for col in numeric_cols_for_outliers:
    fence_low, fence_high = detect_outliers(df, col)

    # Two side-by-side panels: values before clipping (left) and after (right).
    fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(10, 4))

    sns.boxplot(y=df[col], ax=ax_before)
    ax_before.set_title(f'Boxplot of {col} (Before)')

    # Cap values at the fences rather than dropping rows, so the row count is unchanged.
    df[col] = df[col].clip(lower=fence_low, upper=fence_high)

    sns.boxplot(y=df[col], ax=ax_after)
    ax_after.set_title(f'Boxplot of {col} (After)')

    fig.tight_layout()
    plt.show()

In [None]:
# Apply simple range rules in a single assignment:
#   - minimum_nights: take the absolute value, so no negative stays remain
#   - availability_365: confine to the 0..365 interval
#   - reviews_per_month: floor at zero
df = df.assign(
    minimum_nights=df['minimum_nights'].abs(),
    availability_365=df['availability_365'].clip(lower=0, upper=365),
    reviews_per_month=df['reviews_per_month'].clip(lower=0),
)

In [None]:
# Per-column quality indicators, gathered separately and then combined into
# one table indexed by column name.
missing_per_column = df.isna().sum()
distinct_per_column = df.nunique()
column_dtypes = df.dtypes

quality_report = pd.DataFrame(
    {
        'missing_values': missing_per_column,
        'unique_values': distinct_per_column,
        'dtype': column_dtypes,
    }
)

print("\nData Quality Report:")
print(quality_report)

In [None]:
# Heading and describe() table emitted with one call; sep="\n" places the
# table on the line after the heading.
print("\nSummary Statistics:", df.describe(), sep="\n")

In [None]:
numeric_columns = df.select_dtypes(include=[np.number]).columns

# Identifier and coordinate columns are numeric and come first in the frame,
# so a plain numeric_columns[:3] would plot id, host_id and latitude instead of
# listing measures. They are skipped before taking the first three columns.
non_measure_columns = {'id', 'host_id', 'latitude', 'longitude'}
plot_columns = [col for col in numeric_columns if col not in non_measure_columns][:3]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, col in zip(axes, plot_columns):
    sns.histplot(df[col], bins=30, ax=ax)
    ax.set_title(f'Distribution of {col}')

# Hide any panel left empty when fewer than three columns qualify.
for ax in axes[len(plot_columns):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Persist the cleaned frame, then report its final structure.
df.to_csv('cleaned_dataset.csv', index=False)
print("\nFinal Dataset Shape:", df.shape)
print("\nFinal Data Info:")
# info() prints its own report and returns None; print(df.info()) would add a
# stray "None" line.
df.info()

# The cleaning cells overwrote `df`, so every "before" figure has to come from
# the untouched file. Computing them from the cleaned frame (as was done
# previously) reports the final row count as the initial one, zero duplicates,
# the values still missing instead of the values filled, and a plain non-null
# count instead of the number of clipped values.
raw_df = pd.read_csv('AB_NYC_2019.csv')

# Raw rows that survived cleaning, matched by index label.
# NOTE(review): assumes `df` still carries the row labels assigned when the CSV
# was first loaded (no reset_index between loading and this cell) - confirm.
raw_kept = raw_df.loc[df.index]

# Cells that were missing in the raw data and hold a value now.
filled_mask = raw_kept.isnull() & df.notnull()

# Values in the clipped columns that were present originally and differ now.
# NOTE(review): this also counts sign changes made by abs() on minimum_nights,
# and assumes these columns are numeric in the raw file - confirm.
raw_outlier_cols = raw_kept[numeric_cols_for_outliers]
clean_outlier_cols = df[numeric_cols_for_outliers]
adjusted_mask = raw_outlier_cols.notna() & raw_outlier_cols.ne(clean_outlier_cols)

cleaning_summary = {
    'initial_rows': len(raw_df),
    'duplicates_removed': int(raw_df.duplicated().sum()),
    'missing_values_filled': int(filled_mask.sum().sum()),
    'outliers_handled': int(adjusted_mask.sum().sum()),
}

print("\nCleaning Summary:")
for key, value in cleaning_summary.items():
    print(f"{key}: {value}")