In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pickle
import joblib

# Assume `arima_forecast` is a DataFrame containing ARIMA predictions
# For simplicity, let's assume it has a 'date' and 'general_forecast' for market conditions
listings_df = pd.read_csv('Data/listings.csv')
calendar_df = pd.read_csv('Data/calendar.csv')


listings_df = listings_df.dropna(subset=['bathrooms', 'bedrooms', 'beds', 'price', 'host_acceptance_rate', 'host_response_rate'])

# ckean data by filling empty vals with f
listings_df['host_is_superhost'].fillna('f', inplace=True)
listings_df['has_availability'].fillna('f', inplace=True)

#clean % sign
listings_df['host_acceptance_rate'] = listings_df['host_acceptance_rate'].str.replace('%', '').astype(float)
listings_df['host_response_rate'] = listings_df['host_response_rate'].str.replace('%', '').astype(float)

#clean price
listings_df['price'] = listings_df['price'].astype(str).str.replace("[$,]", "", regex=True).astype(float)



label_encoder = LabelEncoder()

# List of categorical columns to encode
categorical_cols = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',  'host_verifications', 'neighbourhood_cleansed', 'property_type', 'room_type','neighbourhood_cleansed', 'property_type', 'room_type', 'has_availability', 'instant_bookable']

encoders ={}

# Apply label encoder to each categorical column
for col in categorical_cols:
    label_encoder = LabelEncoder()
    listings_df[col] = label_encoder.fit_transform(listings_df[col])
    encoders[col] = label_encoder

joblib.dump(label_encoder, 'label_encoder.pkl')  # Save the label encoder for future use



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  listings_df['host_is_superhost'].fillna('f', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  listings_df['has_availability'].fillna('f', inplace=True)


['label_encoder.pkl']

In [None]:

# Convert date columns to datetime
calendar_df['date'] = pd.to_datetime(calendar_df['date'])
listings_df['last_scraped'] = pd.to_datetime(listings_df['last_scraped'])

# Clean price columns from both datasets
calendar_df['price'] = calendar_df['price'].astype(str).str.replace("[$,]", "", regex=True).astype(float)
listings_df['price'] = listings_df['price'].astype(str).str.replace("[$,]", "", regex=True).astype(float)

calendar_df = calendar_df.drop('adjusted_price', axis=1)

# Ensure matching column names for merging
calendar_df.rename(columns={'listing_id': 'id'}, inplace=True)
merged_df = pd.merge(calendar_df, listings_df, on='id')




# Aggregating prices by date for ARIMA modeling
price_by_date = calendar_df.groupby('date')['price'].mean()

# Fit ARIMA model (consider seasonal adjustments if needed)
sarima_model = SARIMAX(
    price_by_date,
    order=(1, 1, 1),  # ARIMA order (p, d, q)
    seasonal_order=(1, 1, 1, 12),  # Seasonal order (P, D, Q, s)
    enforce_stationarity=False,
    enforce_invertibility=False,
)

sarima_result = sarima_model.fit()

joblib.dump(sarima_result, 'sarimax_model.pkl')

future_forecast = sarima_result.get_forecast(steps=90)  # Adjust steps as needed


last_date = price_by_date.index.max()

# Generate dates starting from the day after the last date
forecast_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=90, freq='D')

# Assuming 'future_forecast' is from an ARIMA model and contains predicted values
forecast_df = pd.DataFrame({
    'date': forecast_dates,
    'forecast_price': future_forecast.predicted_mean
})


# Aggregate prices by date using the mean (you could use median if it makes more sense)
average_price_by_date = calendar_df.groupby('date')['price'].mean().reset_index()

# Merge the average prices with the forecast_df
final_df = pd.merge(average_price_by_date, forecast_df, on='date', how='left')

# Rename columns for clarity
final_df.rename(columns={'price': 'average_actual_price', 'forecast_price': 'predicted_price'}, inplace=True)

# Save the DataFrame to a CSV file
final_df.to_csv('sarima_predictions_with_average_actuals.csv', index=False)


merged_df['date'] = pd.to_datetime(merged_df['date'])

merged_df = pd.merge_asof(merged_df.sort_values('date'), forecast_df.sort_values('date'), on='date', direction='nearest')

# Prepare features and target variable
X = merged_df[['forecast_price', 'accommodates', 'bedrooms', 'beds', 'room_type', 'property_type', 'bathrooms', 'latitude', 'longitude', 'host_is_superhost', 'instant_bookable', 'host_verifications', 'number_of_reviews']]  # plus other relevant features
y = merged_df['price_x']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# RandomForest model
rf_model = RandomForestRegressor(n_estimators=100)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_rmse = np.sqrt(rf_mse)
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)
rf_explained_variance = explained_variance_score(y_test, rf_predictions)

# Print evaluation metrics for each model
print(f'Random Forest MSE: {rf_mse}, RMSE: {rf_rmse}, MAE: {rf_mae}, R²: {rf_r2}, Explained Variance: {rf_explained_variance}')

joblib.dump(rf_model, 'random_forest_model.pkl')

In [2]:
# Calculate mean price for each date in calendar_df
calendar_mean_prices = calendar_df.groupby('date')['price'].mean().reset_index()

# Specific dates you're interested in
specific_dates = pd.to_datetime(['2025-11-01', '2025-11-15', '2025-12-01'])

# Initialize an empty DataFrame to hold the specific forecasts
specific_forecasts = pd.DataFrame()

# Check if specific dates are in the forecast DataFrame
if any(forecast_df['date'].isin(specific_dates)):
    specific_forecasts = forecast_df[forecast_df['date'].isin(specific_dates)]
    
    # Merge with mean prices from calendar_df where specific dates are found
    specific_forecasts = specific_forecasts.merge(calendar_mean_prices, on='date', how='left')
    specific_forecasts.rename(columns={'price': 'mean_price'}, inplace=True)
    lis = specific_forecasts["forecast_price"].to_list()

elif any(calendar_mean_prices['date'].isin(specific_dates)):
        specific_forecasts = calendar_mean_prices[calendar_mean_prices['date'].isin(specific_dates)]
        lis = specific_forecasts["price"].to_list()




In [3]:

test_data = pd.DataFrame({
    'forecast_price': lis,
    'accommodates': [2, 4, 2],
    'bedrooms': [1, 3, 2],
    'beds': [1, 3, 2],
    'room_type': [0, 1, 3],
    'property_type': [12, 21, 3],
    'bathrooms': [1, 2, 1.5],
    'latitude': [40.7128, 34.0522, 37.7749],
    'longitude': [-74.0060, -118.2437, -122.4194],
    'host_is_superhost': label_encoder.transform(['t', 'f', 'f']),
    'instant_bookable': label_encoder.transform(['t', 't', 'f']),
    'host_verifications': [2, 3, 1],
    'number_of_reviews': [10, 50, 5]
})

rf_predictions = rf_model.predict(test_data)
print("Random Forest Predictions:", rf_predictions)


Random Forest Predictions: [259.41 150.09 443.41]
