# Excess Code

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

class DataPreprocessor:
    """Impute, encode and feature-engineer a listings DataFrame.

    ``fit`` learns the per-column imputers and the boolean encoders;
    ``transform`` applies them and derives review, text-sentiment, city,
    distance and amenity features before dropping the raw source columns.

    NOTE(review): several steps inside ``transform`` (``pd.cut`` binning,
    the TF-IDF vocabulary and the min-max scaling of the amenity score) are
    re-estimated from whatever frame is passed in, so train and test frames
    transformed separately may not share bins/columns — confirm this is
    acceptable for the intended use.
    """

    def __init__(self, impute_strategy="mean", encode_type="onehot"):
        # Stored for callers; neither value is consulted elsewhere in this class.
        self.impute_strategy = impute_strategy
        self.encode_type = encode_type
        self.analyzer = SentimentIntensityAnalyzer()

        # One imputer per column; every listed column must exist in the frame
        # passed to fit/transform.
        self.imputers = {
            "host_response_rate": SimpleImputer(strategy="constant", fill_value=0),
            "review_scores_rating": SimpleImputer(strategy="median"),
            "first_review": SimpleImputer(strategy="constant", fill_value="2000-01-01"),
            "last_review": SimpleImputer(strategy="constant", fill_value="2000-01-01"),
            "thumbnail_url": SimpleImputer(strategy="constant", fill_value="missing_thumbnail"),
            "neighbourhood": SimpleImputer(strategy="constant", fill_value="missing"),
            "zipcode": SimpleImputer(strategy="constant", fill_value="00000"),
            "bathrooms": SimpleImputer(strategy="median"),
            "bedrooms": SimpleImputer(strategy="median"),
            "beds": SimpleImputer(strategy="median"),
            "host_has_profile_pic": SimpleImputer(strategy="constant", fill_value="f"),
            "host_identity_verified": SimpleImputer(strategy="constant", fill_value="f"),
            "host_since": SimpleImputer(strategy="constant", fill_value="2000-01-01"),
        }

        # Value -> number maps, applied in transform() to the column whose
        # name equals the key.
        # NOTE(review): the two "cancellation_policy_map_*" keys do not match
        # a column named "cancellation_policy", so transform() only applies
        # them if columns with exactly those names exist — confirm intent.
        self.encoders = {
            "mapping": {
                "room_type": {
                    "Private room": 0,
                    "Entire home/apt": 1,
                    "Shared room": 2
                },
                "bed_type": {
                    "Real Bed": 5,
                    "Futon": 4,
                    "Pull-out Sofa": 3,
                    "Airbed": 2,
                    "Couch": 1,
                },
                "cancellation_policy_map_s": {
                    'flexible': 0.0,
                    'moderate': 0.33,
                    'strict': 0.66,
                    'super_strict_30': 0.83,
                    'super_strict_60': 1.0
                },
                "cancellation_policy_map_f": {
                    'flexible': 1.0,
                    'moderate': 0.66,
                    'strict': 0.33,
                    'super_strict_30': 0.16,
                    'super_strict_60': 0.0
                }
            }
        }

        self.bool_cols = ["host_has_profile_pic", "host_identity_verified", "instant_bookable"]
        self.map_cols = ["bed_type", "room_type"]

        # Hard-coded per-city scores (their provenance is not shown here).
        self.city_sentiment = {
            "NYC": 0.75, "LA": 0.78, "SF": 0.85, "DC": 0.73, "Chicago": 0.60, "Boston": 0.90
        }

        self.city_expense_worth = {
            "NYC": 0.55, "LA": 0.58, "SF": 0.50, "DC": 0.78, "Chicago": 0.72, "Boston": 0.75
        }
        # (latitude, longitude) in degrees of the reference point per city.
        self.city_centers = {
            'NYC': (40.7549, -73.984),
            'LA': (34.0557, -118.2488),
            'SF': (37.7876, -122.4066),
            'DC': (38.9037, -77.0363),
            'Chicago': (41.8757, -87.6243),
            'Boston': (42.3555, -71.0565)
        }

    def sentiment_score(self, X: pd.DataFrame) -> pd.DataFrame:
        """Add a ``sentiment`` column holding the VADER compound score of ``description``.

        Non-string or blank descriptions score 0. ``X`` is modified in place
        and also returned.
        """
        def compute_sentiment(text):
            if not isinstance(text, str) or text.strip() == "":
                return 0
            return self.analyzer.polarity_scores(text)['compound']
        X['sentiment'] = X['description'].apply(compute_sentiment)
        return X

    def objectivity_score(self, X: pd.DataFrame) -> pd.DataFrame:
        """Add an ``objectivity`` column: 1 minus TextBlob subjectivity of ``description``.

        Non-string descriptions score 0. ``X`` is modified in place and also
        returned.
        """
        def compute_obj(text):
            if not isinstance(text, str):
                return 0
            return 1 - TextBlob(text).sentiment.subjectivity
        X['objectivity'] = X['description'].apply(compute_obj)
        return X

    def combine_sentiment_subjectivity(self, sentiment, objectivity, sentiment_weight=0.7, objectivity_weight=0.3):
        """Return the weighted sum of sentiment (rescaled from [-1, 1] to [0, 1]) and objectivity."""
        sentiment_norm = (sentiment + 1) / 2
        return sentiment_weight * sentiment_norm + objectivity_weight * objectivity

    def day_since(self, start, end):
        """Return whole days from ``start`` to ``end``, or -1 if either date is missing/unparseable."""
        start = pd.to_datetime(start, errors="coerce")
        end = pd.to_datetime(end, errors="coerce")
        if pd.notnull(start) and pd.notnull(end):
            return (end - start).days
        return -1

    @staticmethod
    def _split_amenities(raw) -> list:
        """Split a raw ``{a,"b c",d}`` amenities value into its non-blank tokens.

        Tokens are returned as written (quotes and surrounding spaces kept);
        only tokens that are empty once spaces and quotes are removed are
        dropped, so an empty value or ``{}`` yields ``[]`` instead of ``['']``.
        """
        tokens = str(raw).strip("{}").split(',')
        return [tok for tok in tokens if tok.strip().strip('"')]

    def _compute_amenity_score(self, X: pd.DataFrame, top_k=30) -> pd.DataFrame:
        """Derive TF-IDF based amenity features from the ``split_amenities`` column.

        Adds ``standard_amenities``, ``amenities_str``, ``amenity_score``,
        ``amenity_score_normalized`` and one TF-IDF column for each of the
        ``top_k`` amenities with the highest mean weight. Works on a copy.

        NOTE(review): the vectorizer and scaler are fitted on ``X`` itself on
        every call, so the resulting columns depend on the frame passed in.
        """
        # Collapse near-duplicate amenity labels onto one canonical name.
        pet_map = [
            "Pets live on this property", "Pets allowed", "Dog(s)", "Cat(s)", "Other pet(s)"
        ]
        amenities_map = {
            "Wireless Internet": "Internet",
            "Dryer": "Dryer/Washer",
            "Washer": "Dryer/Washer",
            "Dishwasher": "Dryer/Washer",
            "Central Heating": "Heating",
            **{p: "Pet-Friendly" for p in pet_map}
        }

        def map_amenities(amenity_list):
            cleaned = (a.strip().strip('"') for a in amenity_list)
            return [amenities_map.get(name, name) for name in cleaned]

        X = X.copy()
        X['standard_amenities'] = X['split_amenities'].apply(map_amenities)
        X['amenities_str'] = X['standard_amenities'].apply(lambda x: ','.join(x))

        # Each comma-separated amenity is one token; case is preserved.
        vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(','), lowercase=False)
        tfidf = vectorizer.fit_transform(X['amenities_str'])
        tfidf_df = pd.DataFrame(tfidf.toarray(), columns=vectorizer.get_feature_names_out(), index=X.index)

        top_amenities = tfidf_df.mean().sort_values(ascending=False).head(top_k).index
        tfidf_top_df = tfidf_df[top_amenities]

        X['amenity_score'] = tfidf_top_df.sum(axis=1)
        scaler = MinMaxScaler()
        X['amenity_score_normalized'] = scaler.fit_transform(X[['amenity_score']])

        return pd.concat([X, tfidf_top_df], axis=1)

    def haversine(self, lat1, lon1, lat2, lon2):
        """Return the great-circle distance in km between two points given in degrees."""
        R = 6371  # km
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        # Rounding can push `a` marginally outside [0, 1], which would make
        # one of the square roots below NaN.
        a = np.clip(a, 0.0, 1.0)
        return R * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    def add_distance_to_city_center(self, X: pd.DataFrame, city_col='city', lat_col='latitude', lon_col='longitude') -> pd.DataFrame:
        """Return a copy of ``X`` with ``distance_to_city_center`` (km).

        Rows whose city is not a key of ``self.city_centers`` get NaN.
        """
        def compute_distance(row):
            center = self.city_centers.get(row[city_col])
            if center is None:
                return np.nan  # If city isn't found
            center_lat, center_lon = center
            return self.haversine(row[lat_col], row[lon_col], center_lat, center_lon)

        X = X.copy()
        X['distance_to_city_center'] = X.apply(compute_distance, axis=1)
        return X

    def fit(self, X: pd.DataFrame):
        """Fit every imputer, then the boolean encoders on the imputed copy; returns ``self``."""
        X = X.copy()
        for col, imputer in self.imputers.items():
            imputer.fit(X[[col]])
            # Impute right away so the encoders below are fitted on filled data.
            X[col] = imputer.transform(X[[col]]).ravel()

        self.bool_encoders = {}
        for col in self.bool_cols:
            # "f" -> 0, "t" -> 1, anything else -> -1.
            enc = OrdinalEncoder(categories=[["f", "t"]], dtype=int, handle_unknown="use_encoded_value", unknown_value=-1)
            enc.fit(X[[col]])
            self.bool_encoders[col] = enc
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return an imputed, encoded and feature-enriched copy of ``X``.

        Requires ``fit`` to have been called. The input frame is not modified.
        """
        X = X.copy()
        for col, imputer in self.imputers.items():
            X[col] = imputer.transform(X[[col]]).ravel()

        # --- review dates -------------------------------------------------
        X['first_review'] = pd.to_datetime(X['first_review'], errors="coerce")
        X['last_review'] = pd.to_datetime(X['last_review'], errors="coerce")
        X['missing_review_dates'] = X['first_review'].isna() | X['last_review'].isna()

        mask = X['first_review'].notna() & X['last_review'].notna()
        X['review_gap_days'] = np.nan
        X.loc[mask, 'review_gap_days'] = (X.loc[mask, 'last_review'] - X.loc[mask, 'first_review']).dt.days
        X['review_gap_days'] = X['review_gap_days'].fillna(-1)

        # --- simple casts -------------------------------------------------
        X['host_response_rate'] = X['host_response_rate'].astype(str).str.rstrip('%').astype(float)
        X['cleaning_fee'] = X['cleaning_fee'].astype(int)

        # --- categorical encodings ----------------------------------------
        for col in self.bool_cols:
            if col not in X.columns:
                X[col] = "f"
            X[col] = self.bool_encoders[col].transform(X[[col]]).ravel()

        for col, mapping in self.encoders["mapping"].items():
            if col in X.columns:
                X[col] = X[col].map(mapping)

        # --- coordinates ---------------------------------------------------
        # Ten equal-width bins computed from the frame being transformed.
        if 'latitude' in X.columns:
            X['lat_bin'] = pd.cut(X['latitude'], bins=10)
        if 'longitude' in X.columns:
            X['long_bin'] = pd.cut(X['longitude'], bins=10)

        # --- amenities ------------------------------------------------------
        if 'amenities' in X.columns:
            X['split_amenities'] = X['amenities'].fillna('').apply(self._split_amenities)
            # Counting the parsed list keeps the count consistent with it and
            # reports 0 (not 1) for listings without amenities.
            X['n_amenities'] = X['split_amenities'].apply(len)

        # --- description text ----------------------------------------------
        X = self.sentiment_score(X)
        X = self.objectivity_score(X)
        X['description_score'] = X.apply(
            lambda row: self.combine_sentiment_subjectivity(row['sentiment'], row['objectivity']), axis=1)

        X['luxury_policy_flag'] = X['cancellation_policy'].isin(['super_strict_30', 'super_strict_60']).astype(int)

        # --- city level scores ----------------------------------------------
        X['city_value_score'] = X['city'].map(self.city_sentiment)
        X['city_expense_score'] = X['city'].map(self.city_expense_worth)

        X['days_between_reviews'] = X.apply(
            lambda row: self.day_since(row['first_review'], row['last_review']), axis=1)

        # Fallback end date for host tenure when last_review is missing or
        # precedes host_since.
        most_recent_possible = pd.to_datetime("2017-10-04")

        X['host_tenure'] = X.apply(
            lambda row: self.day_since(row['host_since'], row['last_review'])
            if pd.notnull(row['last_review']) and
                pd.to_datetime(row['last_review']) >= pd.to_datetime(row['host_since'])
            else self.day_since(row['host_since'], most_recent_possible),
            axis=1
        )
        X = self.add_distance_to_city_center(X)

        if 'split_amenities' in X.columns:
            X = self._compute_amenity_score(X)

        # Raw/identifier columns that have been replaced by derived features.
        drop_cols = ['id', 'name', 'description', 'thumbnail_url', 'neighbourhood', 'amenities', 'first_review', 'host_since', 'last_review', 'zipcode', 'missing_review_dates']
        X = X.drop(columns=drop_cols, errors='ignore')

        return X

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Fit on ``X`` and return ``transform(X)``; ``y`` is accepted and ignored."""
        return self.fit(X).transform(X)

In [None]:
import numpy as np
import optuna
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

class ClusterFit(BaseEstimator, RegressorMixin):
    """Regressor that clusters rows with KMeans and fits one tuned model per cluster.

    ``fit`` scales ``cluster_features``, assigns each row to a KMeans cluster,
    and for every cluster holds out 20% of its rows, tunes ``model_class``
    with Optuna on the remaining 80%, and trains the final model on that 80%.
    The held-out parts are kept in ``X_test_all`` / ``y_test_all``.

    Parameters
    ----------
    cluster_features : list of column names used for scaling + KMeans.
    n_clusters : number of KMeans clusters.
    model_class : estimator class instantiated per cluster; it must be
        constructible without arguments and expose ``get_params()``.
    model_params : dict of constructor arguments that always take precedence
        over tuned values.
    """

    # Integer ranges searched by Optuna. Only names that ``model_class``
    # accepts and that the caller did not pin in ``model_params`` are tuned.
    _SEARCH_SPACE = {
        'n_estimators': (100, 500),
        'max_depth': (3, 12),
        'min_samples_split': (2, 10),
        'min_samples_leaf': (1, 5),
    }

    def __init__(self, cluster_features, n_clusters=4, model_class=GradientBoostingRegressor, model_params=None):
        self.cluster_features = cluster_features
        self.n_clusters = n_clusters
        self.model_class = model_class
        self.model_params = model_params or {}

        self.scaler = StandardScaler()
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=42)
        self.models = {}  # cluster_id -> trained model

    def _build_model(self, tuned_params):
        """Instantiate ``model_class`` from tuned values overlaid with ``model_params``."""
        supported = self.model_class().get_params()
        params = {k: v for k, v in tuned_params.items() if k in supported}
        if 'random_state' in supported:
            # Seed so trials (and the final model) are comparable run to run.
            params['random_state'] = 42
        params.update(self.model_params)  # caller-pinned values win
        return self.model_class(**params)

    def fit(self, X, y):
        """Cluster ``X`` and train one tuned model per cluster; returns ``self``."""
        # Scale cluster features and fit KMeans
        X_cluster = self.scaler.fit_transform(X[self.cluster_features])
        clusters = self.kmeans.fit_predict(X_cluster)
        self.cluster_assignments_ = clusters
        self.unique_clusters_ = np.unique(clusters)

        # Start clean so a second fit() does not keep models from the first.
        self.models = {}
        self.X_test_all = []
        self.y_test_all = []

        # Fit separate model per cluster
        for cluster in self.unique_clusters_:
            idx = clusters == cluster
            X_cluster_data = X[idx].drop(columns=['log_price', 'cluster_label'], errors='ignore')
            y_cluster = y[idx]

            # Hold the test rows out *before* tuning so they never influence
            # the hyperparameter choice.
            X_train, X_test, y_train, y_test = train_test_split(
                X_cluster_data, y_cluster, test_size=0.2, random_state=42
            )

            study = optuna.create_study(direction="minimize")
            study.optimize(lambda trial: self._objective(trial, X_train, y_train), n_trials=15)
            print("Best RMSE:", study.best_value)
            print("Best hyperparameters:", study.best_params)

            # Train the final model with the hyperparameters that were found.
            model = self._build_model(study.best_params)
            model.fit(X_train, y_train)
            self.models[cluster] = model

            self.X_test_all.append(X_test)
            self.y_test_all.append(y_test)

        return self

    def _objective(self, trial, X_cluster_data, y_cluster):
        """Optuna objective: validation RMSE of ``model_class`` for one trial's parameters."""
        X_train, X_val, y_train, y_val = train_test_split(X_cluster_data, y_cluster, test_size=0.2, random_state=42)

        supported = self.model_class().get_params()
        params = {
            name: trial.suggest_int(name, low, high)
            for name, (low, high) in self._SEARCH_SPACE.items()
            if name in supported and name not in self.model_params
        }

        model = self._build_model(params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        return rmse

    def predict(self, X):
        """Predict each row with the model of the KMeans cluster it falls into."""
        X_cluster = self.scaler.transform(X[self.cluster_features])
        clusters = self.kmeans.predict(X_cluster)

        preds = np.zeros(len(X))
        for cluster in self.unique_clusters_:
            idx = clusters == cluster
            if not idx.any():
                continue
            X_cluster_data = X.iloc[idx].drop(columns=['log_price', 'cluster_label'], errors='ignore')
            preds[idx] = self.models[cluster].predict(X_cluster_data)
        return preds

    def score(self, X, y):
        """Return the RMSE of ``predict(X)`` against ``y`` (lower is better).

        NOTE(review): this is a *positive* error, whereas scikit-learn's
        model-selection helpers treat a larger ``score`` as better; pass an
        explicit ``scoring='neg_root_mean_squared_error'`` there instead of
        relying on this method.
        """
        y_pred = self.predict(X)
        return np.sqrt(mean_squared_error(y, y_pred))

In [None]:
# Recompute the overall RMSE from per-cluster squared errors as a sanity check.
# Expects `cluster_labels`, `y` and `preds` to be aligned arrays defined earlier.
# The accumulators are reset here so re-running the cell never adds to stale totals.
total_squared_error = 0.0
total_samples = 0

for cluster_id in [0, 1, 2]:
    mask = cluster_labels == cluster_id
    y_cluster = y[mask]
    preds_cluster = preds[mask]

    se = ((y_cluster - preds_cluster) ** 2).sum()
    total_squared_error += se
    total_samples += len(y_cluster)

overall_rmse_from_clusters = np.sqrt(total_squared_error / total_samples)
print(f"Verified Overall RMSE from cluster breakdown: {overall_rmse_from_clusters:.4f}")


In [None]:
class ModelPerCluster:
    """Pick and train the best model type separately for each of three clusters.

    Parameters
    ----------
    features : stored as given; not used by the methods below.
    model_types : dict ``name -> (model_class, param_space)`` where
        ``param_space(trial)`` returns the constructor kwargs for one Optuna
        trial. An entry named ``"Stacking"`` is skipped during tuning.
    n_trials, timeout : forwarded to ``study.optimize`` for every model type.
    """

    def __init__(self, features, model_types, n_trials=30, timeout=300):
        self.features = features
        self.n_clusters = 3
        self.model_types = model_types
        self.n_trials = n_trials
        self.timeout = timeout

        self.scalar = StandardScaler()
        self.cluster_models = {}       # cluster_id -> fitted model
        self.cluster_studies = {}      # cluster_id -> winning Optuna study (None for stacking)
        self.cluster_rmses = {}        # cluster_id -> hold-out RMSE of the winner
        self.cluster_model_types = {}  # cluster_id -> name of the winning model type

    def fit_cluster_models(self, X, y, cluster_labels):
        """Tune every model type per cluster, keep the one with the lowest hold-out RMSE.

        For clusters 1 and 2 a ``StackingRegressor`` built from the tuned
        "RandomForest", "GradientBoosting" and "Ridge" entries (those present
        in ``model_types``) also competes. Every candidate is scored on the
        same 20% hold-out split; the winner is then refit on the whole cluster.
        """
        for cluster_id in range(self.n_clusters):
            print(f"\n>>> Tuning cluster {cluster_id}")

            mask = cluster_labels == cluster_id
            X_cluster = X[mask].copy()
            y_cluster = y[mask].copy()

            # One fixed split per cluster: identical for every trial and every
            # candidate, so their RMSEs are directly comparable.
            X_train, X_val, y_train, y_val = train_test_split(
                X_cluster, y_cluster, test_size=0.2, random_state=42
            )

            best_rmse = float('inf')
            best_model = None
            best_study = None
            best_model_type = None

            optuna_params = {}

            for name, (model_class, param_space) in self.model_types.items():
                if name == "Stacking":
                    continue

                # Defaults bind this iteration's class/space to the closure.
                def theobjective(trial, model_class=model_class, param_space=param_space):
                    model = model_class(**param_space(trial))
                    model.fit(X_train, y_train)
                    preds = model.predict(X_val)
                    return np.sqrt(mean_squared_error(y_val, preds))

                study = optuna.create_study(direction="minimize")
                study.optimize(theobjective, n_trials=self.n_trials, timeout=self.timeout)

                # Replaying param_space on the best trial reproduces the exact
                # kwargs used in that trial, including any values param_space
                # sets without going through `trial.suggest_*`.
                tuned_params = param_space(study.best_trial)
                optuna_params[name] = tuned_params

                if study.best_value < best_rmse:
                    best_rmse = study.best_value
                    best_study = study
                    best_model_type = name
                    best_model = model_class(**tuned_params)

            if cluster_id in [1, 2]:
                # StackingRegressor fits its own clones, so the base models
                # are passed unfitted.
                base_models = [
                    (base_name, self.model_types[base_name][0](**optuna_params.get(base_name, {})))
                    for base_name in ["RandomForest", "GradientBoosting", "Ridge"]
                    if base_name in self.model_types
                ]

                if base_models:
                    stacked_model = StackingRegressor(estimators=base_models, final_estimator=Ridge())
                    # Score on the hold-out rows, like every other candidate,
                    # rather than on the rows the stack was trained on.
                    stacked_model.fit(X_train, y_train)
                    stacked_preds = stacked_model.predict(X_val)
                    stacked_rmse = np.sqrt(mean_squared_error(y_val, stacked_preds))

                    if stacked_rmse < best_rmse:
                        best_rmse = stacked_rmse
                        best_model_type = "Stacking"
                        best_model = stacked_model
                        best_study = None

            # Train the winner once on all rows of the cluster.
            if best_model is not None:
                best_model.fit(X_cluster, y_cluster)

            self.cluster_models[cluster_id] = best_model
            self.cluster_studies[cluster_id] = best_study
            self.cluster_rmses[cluster_id] = best_rmse
            self.cluster_model_types[cluster_id] = best_model_type

    def predicts(self, X, cluster_labels):
        """Predict each row with its cluster's model; rows of unknown clusters stay 0."""
        preds = np.zeros(len(X))
        for cluster_id in range(self.n_clusters):
            mask = cluster_labels == cluster_id
            if not np.any(mask):
                continue
            preds[mask] = self.cluster_models[cluster_id].predict(X[mask])
        return preds

    def report(self):
        """Print the winning model type and its RMSE for every cluster."""
        print("\n--- Cluster Summary ---")
        for cluster_id in range(self.n_clusters):
            print(f"Cluster {cluster_id}: {self.cluster_model_types[cluster_id]} | RMSE: {self.cluster_rmses[cluster_id]:.4f}")