In [None]:
# Import libraries
import pandas as pd
import numpy as np

In [None]:
# ============================================================
# PART 1: SETUP & DATA LOADING
# ============================================================

In [None]:
# Read the calendar extract into a DataFrame; with pandas' default
# compression='infer' the ".gz" suffix selects gzip decompression.
calendar_path = 'data/calendar.csv.gz'
calendar_df = pd.read_csv(calendar_path)

In [None]:
# ============================================================
# PART 2: EXPLORATORY DATA ANALYSIS (EDA)
# ============================================================

In [None]:
# Preview the first 20 rows, then tabulate how many values each column is missing
display(calendar_df.head(20))

print("MISSING VALUES OVERVIEW")
print("=" * 50)

# Per-column null counts; the percentage is derived from those same counts
missing_count = calendar_df.isnull().sum()
missing_percent = (missing_count / len(calendar_df)) * 100

# Put both measures side by side, indexed by column name
missing_summary = missing_count.to_frame(name='Missing_Count')
missing_summary['Missing_Percent'] = missing_percent
print(missing_summary)

In [None]:
# Drop the price columns (reported as 100% missing in this extract - confirm against
# the missing-value overview; prices will be merged from listings later).
# errors='ignore' keeps the cell re-runnable: calendar_df is reassigned here, so a
# second execution would otherwise raise KeyError because the columns are already gone.
calendar_df = calendar_df.drop(columns=['price', 'adjusted_price'], errors='ignore')

In [None]:
# ============================================================
# PART 3: DATA CLEANING & PREPROCESSING
# ============================================================

In [None]:
# Convert date to datetime and extract temporal features
calendar_df['date'] = pd.to_datetime(calendar_df['date'])
calendar_df['year'] = calendar_df['date'].dt.year
calendar_df['month'] = calendar_df['date'].dt.month
calendar_df['day_of_week'] = calendar_df['date'].dt.dayofweek  # 0=Monday, 6=Sunday
calendar_df['week_of_year'] = calendar_df['date'].dt.isocalendar().week
# isin() already yields a boolean Series, so no extra cast is needed
calendar_df['is_weekend'] = calendar_df['day_of_week'].isin([5, 6])

# Convert 'available' ('t'/'f') to boolean.
# Guarded so that re-running the cell does not push already-boolean values through
# the mapping, which would turn every entry into NaN.
if not pd.api.types.is_bool_dtype(calendar_df['available']):
    calendar_df['available'] = calendar_df['available'].map({'t': True, 'f': False})

# Set appropriate data types
calendar_df['listing_id'] = calendar_df['listing_id'].astype('int64')

# Night counts: let pandas choose the smallest integer dtype that holds every value.
# A hard-coded astype('int16') silently wraps anything above 32767 into a different
# (often negative) number, corrupting large sentinel-style maximum_nights values.
# NOTE(review): unlike astype('int16'), this does not raise on missing values - such
# a column is simply left as float; confirm that is acceptable for this dataset.
for night_col in ('minimum_nights', 'maximum_nights'):
    calendar_df[night_col] = pd.to_numeric(calendar_df[night_col], downcast='integer')

In [None]:
# ============================================================
# PART 4: DATA INTEGRITY, DUPLICATE & OUTLIER CHECKS
# ============================================================

In [None]:
# Summarise the cleaned calendar frame: dimensions, total null count and dtypes
print(
    "Calendar Dataset Info:",
    f"Shape: {calendar_df.shape}",
    f"Any missing values: {calendar_df.isnull().sum().sum()}",
    "\nData types:",
    sep="\n",
)
print(calendar_df.dtypes)
print("\nFirst few rows:")
# Bare last expression so the notebook renders the preview as a rich table
calendar_df.head()

In [None]:
# Look for repeated rows: exact copies, and repeats of the (listing_id, date) key
key_cols = ['listing_id', 'date']
full_dupe_count = calendar_df.duplicated().sum()
key_dupe_count = calendar_df.duplicated(subset=key_cols).sum()

print("Duplicate Analysis:")
print("=" * 50)
print(f"Total rows: {len(calendar_df)}")
print(f"Duplicate rows (all columns): {full_dupe_count}")
print(f"Duplicate rows (listing_id, date): {key_dupe_count}")

if key_dupe_count == 0:
    print("\nNo duplicates found on (listing_id, date) - dataset is clean!")
else:
    # Keep the first occurrence of every (listing_id, date) pair
    print("\nRemoving duplicates on (listing_id, date)...")
    calendar_df = calendar_df.drop_duplicates(subset=key_cols, keep='first')
    print(f"New shape after removing duplicates: {calendar_df.shape}")

# Author's note from the original run: no duplicates were found.

In [None]:
# Print summary statistics for the numeric columns and count implausible night values
print("Extreme Values Analysis:")
print("=" * 50)

# Numeric columns to analyze
numeric_cols = ['minimum_nights', 'maximum_nights', 'month', 'day_of_week', 'week_of_year']

for col in numeric_cols:
    col_values = calendar_df[col]
    stat_lines = [
        f"\n{col}:",
        f"  Min: {col_values.min()}",
        f"  Max: {col_values.max()}",
        f"  Mean: {col_values.mean():.2f}",
        f"  Median: {col_values.median():.2f}",
        f"  Std: {col_values.std():.2f}",
    ]
    print("\n".join(stat_lines))

# Check for unrealistic night values
min_nights = calendar_df['minimum_nights']
max_nights = calendar_df['maximum_nights']
max_over_1000 = max_nights > 1000

print("\n" + "=" * 50)
print("Unrealistic Values Check:")
print(f"minimum_nights > 365: {(min_nights > 365).sum()} rows")
print(f"maximum_nights > 1000: {max_over_1000.sum()} rows")
print(f"minimum_nights > maximum_nights: {(min_nights > max_nights).sum()} rows")

# Show a sample of the rows behind the maximum_nights > 1000 count, if there are any
if max_over_1000.any():
    print("\nExamples of maximum_nights > 1000:")
    print(calendar_df.loc[max_over_1000, ['listing_id', 'minimum_nights', 'maximum_nights']].head(10))

In [None]:
# ============================================================
# PART 5: DATA QUALITY CHECKS & VALIDATION
# ============================================================

In [None]:
# Replace negative night counts with placeholder values and report what changed.
# NOTE(review): if the night columns were cast to a narrow integer dtype earlier,
# negative values may be overflow artifacts rather than source data - confirm before
# relying on the 365 placeholder.
print("Fixing Negative Values:")
print("=" * 50)

# maximum_nights: negatives become 365
neg_max_mask = calendar_df['maximum_nights'] < 0
print(f"Rows with negative maximum_nights: {neg_max_mask.sum()}")
if neg_max_mask.any():
    print("Replacing negative maximum_nights with 365 (1 year placeholder)...")
    calendar_df.loc[neg_max_mask, 'maximum_nights'] = 365
    print(f"Fixed! New max value: {calendar_df['maximum_nights'].max()}")

# minimum_nights: negatives become 1
neg_min_mask = calendar_df['minimum_nights'] < 0
print(f"\nRows with negative minimum_nights: {neg_min_mask.sum()}")
if neg_min_mask.any():
    print("Replacing negative minimum_nights with 1 (minimum stay)...")
    calendar_df.loc[neg_min_mask, 'minimum_nights'] = 1
    print("Fixed!")

# Verify fix: show the resulting value range of both columns
min_lo, min_hi = calendar_df['minimum_nights'].min(), calendar_df['minimum_nights'].max()
max_lo, max_hi = calendar_df['maximum_nights'].min(), calendar_df['maximum_nights'].max()
print(f"\nVerification - minimum_nights range: [{min_lo}, {min_hi}]")
print(f"Verification - maximum_nights range: [{max_lo}, {max_hi}]")

In [None]:
# Drill into four suspicious patterns in the night-count columns and print samples
print("DETAILED INVESTIGATION OF PROBLEMS")
print("=" * 80)

night_cols = ['listing_id', 'minimum_nights', 'maximum_nights']

# 1. Check negative maximum_nights
# NOTE(review): if the cell that replaces negative night counts has already run,
# this check reports 0 rows by construction.
negative_max = calendar_df.loc[calendar_df['maximum_nights'] < 0]
print(f"\n1. NEGATIVE MAXIMUM_NIGHTS: {len(negative_max)} rows")
if len(negative_max):
    print(negative_max[['listing_id', 'date', 'minimum_nights', 'maximum_nights']].head(20))
    print(f"\nUnique listings affected: {negative_max['listing_id'].nunique()}")

# 2. Check extremely high maximum_nights (>2 years)
extreme_max = calendar_df.loc[calendar_df['maximum_nights'] > 730]
print(f"\n2. EXTREMELY HIGH MAXIMUM_NIGHTS (>730 days): {len(extreme_max)} rows")
print(extreme_max['maximum_nights'].describe())
print(f"Unique listings: {extreme_max['listing_id'].nunique()}")

# 3. Check min > max (logical error)
min_exceeds_max = calendar_df['minimum_nights'] > calendar_df['maximum_nights']
logical_error = calendar_df.loc[min_exceeds_max]
print(f"\n3. MIN > MAX (LOGICAL ERROR): {len(logical_error)} rows")
if len(logical_error):
    print(logical_error[night_cols].head(10))

# 4. Check very high minimum_nights
high_min = calendar_df.loc[calendar_df['minimum_nights'] > 365]
print(f"\n4. MINIMUM_NIGHTS > 365 days: {len(high_min)} rows")
if len(high_min):
    # Most frequent (listing, min, max) combinations among the flagged rows
    print(high_min[night_cols].value_counts().head(10))