In [4]:
# Cell 1: Import and Load
import pandas as pd
import numpy as np

# Load the raw data again and apply the quick clean from step 1.
# The whole pipeline is one non-mutating chain: `df` is always rebuilt from the
# file on disk, and no intermediate frame is modified in place.
df = (
    pd.read_csv('MinoAI_dataset.xlsx - MinoAI dataset.csv')
    # Fill missing 'reviews_per_month' values with 0.
    .assign(reviews_per_month=lambda listings: listings['reviews_per_month'].fillna(0))
    # Drop rows where 'name' or 'host_name' is missing.
    .dropna(subset=['name', 'host_name'])
)

print("Data Loaded. Rows:", len(df))

Data Loaded. Rows: 48858


In [5]:
# Feature Engineering - Time
# Builds 'days_since_review': days between each listing's last review and the
# most recent review date in the dataset. Lower number = more recent activity.

# Extra days added on top of the oldest observed review age for listings that
# have no usable review date.
NEVER_REVIEWED_PENALTY_DAYS = 30

# Remember which rows had *some* value before parsing, so values that fail to
# parse can be told apart from genuinely empty ones.
had_review_value = df['last_review'].notna()

# Convert the 'last_review' column to real datetimes (day/month/year text).
# errors='coerce' turns anything that does not match the format into NaT.
df['last_review'] = pd.to_datetime(df['last_review'], format='%d/%m/%Y', errors='coerce')

# Coercion is silent: a malformed date would otherwise be indistinguishable from
# "never reviewed" and receive the maximum penalty below. Surface it explicitly.
n_unparsed = int((had_review_value & df['last_review'].isna()).sum())
if n_unparsed:
    print(f"Warning: {n_unparsed} non-empty 'last_review' values did not match DD/MM/YYYY "
          "and are treated as never reviewed.")

# Reference point for "today": the dataset is a 2019 snapshot, so using the real
# current date would make every listing look stale. Use the latest review date
# found in the data instead.
latest_date = df['last_review'].max()
print(f"Dataset 'Current' Date: {latest_date}")

# Age of the last review in whole days (NaN where there is no review date).
days_since_review = (latest_date - df['last_review']).dt.days

# Listings without a review date are treated as older than the oldest reviewed
# listing: maximum observed age plus a fixed penalty.
max_days = days_since_review.max()
df['days_since_review'] = days_since_review.fillna(max_days + NEVER_REVIEWED_PENALTY_DAYS)

print("Created 'days_since_review'. Lower number = More recent activity.")

Dataset 'Current' Date: 2019-07-08 00:00:00
Created 'days_since_review'. Lower number = More recent activity.
