In [None]:
Data Integrity

In [11]:
import pandas as pd

# Load the raw NYC Airbnb listings; the path is relative to the notebook's working directory
data = pd.read_csv('AB_NYC_2019.csv')

# Checking the columns to identify the correct ones
print("Columns in the dataset:", data.columns)

# Checking for data types and null values to assess data integrity.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly: wrapping it in print() appends a stray "None" line to the output.
print("\nData Info (Types & Missing Values):")
data.info()


Columns in the dataset: Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

Data Info (Types & Missing Values):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                 

In [None]:
Missing Data Handling

In [12]:
# Task 2: Missing Data Handling - Checking for missing values and imputing where necessary
print("\nMissing Values in the Dataset:")
missing_values = data.isnull().sum()
print(missing_values)

# Impute missing 'reviews_per_month' with the mean value.
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on data[...]: that form mutates a temporary Series,
# raises a chained-assignment FutureWarning on pandas 2.x and no longer updates
# `data` at all once Copy-on-Write is enabled.
# NOTE(review): 'reviews_per_month' and 'last_review' have the same number of
# missing rows, so these are presumably listings without any review; filling
# with 0 may be more faithful than the mean -- confirm with the analysis owner.
if 'reviews_per_month' in data.columns:
    data['reviews_per_month'] = data['reviews_per_month'].fillna(data['reviews_per_month'].mean())
    print("\nMissing values in 'reviews_per_month' have been imputed with the mean.")
else:
    print("'reviews_per_month' column not found in dataset")

# Impute missing 'name' and 'host_name' with a placeholder value.
# Both columns get the same treatment, so they share one loop; the printed
# messages are identical to handling each column separately.
for text_column in ('name', 'host_name'):
    if text_column in data.columns:
        data[text_column] = data[text_column].fillna('Unknown')
        print(f"\nMissing '{text_column}' values have been imputed with 'Unknown'.")
    else:
        print(f"'{text_column}' column not found in dataset")



Missing Values in the Dataset:
id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

Missing values in 'reviews_per_month' have been imputed with the mean.

Missing 'name' values have been imputed with 'Unknown'.

Missing 'host_name' values have been imputed with 'Unknown'.


In [None]:
Duplicate Removal

In [13]:
# Task 3: Duplicate Removal - count rows that repeat an earlier row in every column
print("\nChecking for duplicate rows in the dataset:")
duplicate_mask = data.duplicated()
duplicates = duplicate_mask.sum()
if duplicates == 0:
    # Nothing to drop; the frame is left untouched
    print("No duplicate rows found.")
else:
    print(f"There are {duplicates} duplicate rows. Removing them...")
    # Keeps the first occurrence of each repeated row and drops the rest in place
    data.drop_duplicates(inplace=True)



Checking for duplicate rows in the dataset:
No duplicate rows found.


In [None]:
Standardization

In [14]:
# Task 4: Standardization - Ensuring consistent formatting and units across columns
# Standardizing column names (convert to lowercase and replace spaces with underscores).
# regex=False makes the space replacement a plain literal substitution regardless of
# the pandas version's default for `regex`.
data.columns = data.columns.str.lower().str.replace(' ', '_', regex=False)
print("\nColumn names have been standardized.")

# Standardize 'price' so it contains no negative values.
# clip(lower=0) is the vectorized equivalent of applying max(x, 0) to every element:
# negatives become 0, everything else (including NaN) is left unchanged.
if 'price' in data.columns:
    data['price'] = data['price'].clip(lower=0)
    print("\n'Price' values have been standardized to positive values.")
else:
    print("'price' column not found in dataset")



Column names have been standardized.

'Price' values have been standardized to positive values.


In [None]:
Outlier Detection

In [15]:
# Task 5: Outlier Detection - Identifying and handling outliers
# Example: Detecting outliers in 'price' and 'reviews_per_month' using the IQR method
def detect_outliers(df: pd.DataFrame, column: str, multiplier: float = 1.5) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is an outlier when it lies strictly below ``Q1 - multiplier * IQR``
    or strictly above ``Q3 + multiplier * IQR``, where Q1 and Q3 are the 25th
    and 75th percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    column : str
        Name of the numeric column to test.
    multiplier : float, default 1.5
        Width of the fences in units of IQR. The default reproduces the
        conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index labels preserved) flagged as
        outliers. Rows whose value is NaN are never flagged, because both
        comparisons evaluate to False for NaN.
    """
    values = df[column]
    # Calculate the Interquartile Range (IQR)
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    return df[(values < lower_bound) | (values > upper_bound)]

# Detect outliers in 'price'
if 'price' in data.columns:
    price_outliers = detect_outliers(data, 'price')
    print("\nOutliers detected in 'price':")
    print(price_outliers)
else:
    # Remember that there is nothing to remove so the removal step below is skipped
    price_outliers = None
    print("'price' column not found in dataset")

# Detect outliers in 'reviews_per_month' (reported only, not removed)
if 'reviews_per_month' in data.columns:
    reviews_outliers = detect_outliers(data, 'reviews_per_month')
    print("\nOutliers detected in 'reviews_per_month':")
    print(reviews_outliers)
else:
    print("'reviews_per_month' column not found in dataset")

# You can choose to remove or handle outliers here as necessary
# Example: Remove outliers in price.
# The removal is guarded so a missing 'price' column does not crash the cell after
# the "not found" message, and rows are dropped by index label (the flagged rows
# themselves) rather than by matching price values.
# Note: this rebinds `data`, so re-running the cell recomputes the IQR on the
# already-trimmed frame and removes further rows.
if price_outliers is not None:
    data = data.drop(index=price_outliers.index)

# Save the cleaned data (Optional)
# data.to_csv('cleaned_AB_NYC_2019.csv', index=False)



Outliers detected in 'price':
             id                                               name    host_id  \
61        15396                 Sunny & Spacious Chelsea Apartment      60278   
85        19601                perfect for a family or small group      74303   
103       23686  2000 SF 3br 2bath West Village private  townhouse      93790   
114       26933  2 BR / 2 Bath Duplex Apt with patio! East Village      72062   
121       27659                   3 Story Town House in Park Slope     119588   
...         ...                                                ...        ...   
48758  36420289    Rustic Garden House Apt, 2 stops from Manhattan   73211393   
48833  36450896   Brand New 3-Bed Apt in the Best Location of FiDi   29741813   
48839  36452721  Massage Spa. Stay overnight. Authors Artist dr...  274079964   
48842  36453160  LUXURY MANHATTAN PENTHOUSE+HUDSON RIVER+EMPIRE...  224171371   
48856  36457700  Large 3 bed, 2 bath , garden , bbq , all you need   66993395 