In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import re

class HousePricePredictor:
    def __init__(self):
        self.le = LabelEncoder()
        self.scaler = StandardScaler()
        self.model = None
        self.location_stats = None
        self.features = None
        
    def load_data(self):
        self.train = pd.read_csv('train.csv')
        self.test = pd.read_csv('test.csv')
        self.rent_data = pd.read_csv('avg_rent.csv')
        self.dist_data = pd.read_csv('dist_from_city_centre.csv')
        
    def clean_size(self, size):
        if pd.isna(size):
            return np.nan
        nums = re.findall(r'\d+', str(size))
        if nums:
            return float(nums[0])
        return np.nan
    
    def clean_total_sqft(self, sqft):
        if pd.isna(sqft):
            return np.nan
        try:
            if '-' in str(sqft):
                nums = [float(x.strip()) for x in sqft.split('-')]
                return sum(nums)/len(nums)
            return float(sqft)
        except:
            return np.nan

    def preprocess_data(self, df, is_train=True):
        df = df.copy()
        
        # Clean size and sqft
        df['size'] = df['size'].apply(self.clean_size)
        df['total_sqft'] = df['total_sqft'].apply(self.clean_total_sqft)
        
        # Remove outliers for training data
        if is_train:
            df = df[df['total_sqft'] < df['total_sqft'].quantile(0.99)]
            df = df[df['bath'] < df['bath'].quantile(0.99)]
        
        # Handle missing values
        df['size'] = df['size'].fillna(df['size'].median())
        df['bath'] = df['bath'].fillna(df['bath'].median())
        df['balcony'] = df['balcony'].fillna(df['balcony'].median())
        
        # Transform area_type
        if is_train:
            df['area_type'] = self.le.fit_transform(df['area_type'])
        else:
            df['area_type'] = self.le.transform(df['area_type'])
        
        df['has_society'] = df['society'].notna().astype(int)
        
        return df
    
    def engineer_features(self, df, is_train=True):
        df = df.copy()
        
        # Merge additional data
        df = df.merge(self.rent_data, on='location', how='left')
        df = df.merge(self.dist_data, on='location', how='left')
        
        # Handle missing values in merged features
        df['avg_2bhk_rent'] = df['avg_2bhk_rent'].fillna(df['avg_2bhk_rent'].mean())
        df['dist_from_city'] = df['dist_from_city'].fillna(df['dist_from_city'].mean())
        
        # Enhanced size-based features
        df['log_total_sqft'] = np.log1p(df['total_sqft'])
        df['total_sqft_squared'] = df['total_sqft'] ** 2
        df['total_sqft_per_bath'] = df['total_sqft'] / (df['bath'] + 1)
        df['total_sqft_per_balcony'] = df['total_sqft'] / (df['balcony'] + 1)
        
        # Enhanced location features
        if is_train:
            self.location_stats = df.groupby('location').agg({
                'price': ['mean', 'median', 'std', 'count'],
                'total_sqft': ['mean', 'median']
            })
        
        # Add location features
        for stat in ['mean', 'median']:
            df[f'location_price_{stat}'] = df['location'].map(self.location_stats['price'][stat])
            df[f'location_sqft_{stat}'] = df['location'].map(self.location_stats['total_sqft'][stat])
        
        # Location price variance
        df['location_price_std'] = df['location'].map(self.location_stats['price']['std'])
        
        # Fill missing location stats with means
        location_means = {col: self.location_stats[col[0]][col[1]].mean() 
                         for col in self.location_stats.columns}
        df = df.fillna(location_means)
        
        # Price per sqft features
        df['price_per_sqft_loc_mean'] = df['location_price_mean'] / df['location_sqft_mean']
        df['price_per_sqft_loc_median'] = df['location_price_median'] / df['location_sqft_median']
        
        # Distance-based features
        df['price_to_dist_ratio'] = df['location_price_mean'] / (df['dist_from_city'] + 1)
        
        return df

    def prepare_features(self, df, is_train=True):
        if self.features is None:
            self.features = [
                'total_sqft', 'log_total_sqft', 'total_sqft_squared',
                'total_sqft_per_bath', 'total_sqft_per_balcony',
                'bath', 'size', 'dist_from_city',
                'location_price_mean', 'location_price_median',
                'location_sqft_mean', 'location_sqft_median',
                'price_per_sqft_loc_mean', 'price_per_sqft_loc_median',
                'price_to_dist_ratio', 'location_price_std'
            ]
            
        X = df[self.features]
        
        if is_train:
            X = pd.DataFrame(self.scaler.fit_transform(X), columns=self.features)
        else:
            X = pd.DataFrame(self.scaler.transform(X), columns=self.features)
        
        return X
    
    
    def train_model(self):
        # Preprocess data
        self.train = self.preprocess_data(self.train, is_train=True)
        self.train = self.engineer_features(self.train, is_train=True)
        
        X = self.prepare_features(self.train, is_train=True)
        y = self.train['price']
        
        # Cross validation setup
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        rmse_scores = []
        
        # Train with cross validation
        for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            
            model = xgb.XGBRegressor(
                n_estimators=500,
                learning_rate=0.03,
                max_depth=6,
                min_child_weight=2,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=0.5,
                reg_lambda=0.5,
                random_state=42
            )
            
            model.fit(X_train, y_train)
            val_pred = model.predict(X_val)
            rmse = np.sqrt(mean_squared_error(y_val, val_pred))
            rmse_scores.append(rmse)
            print(f"Fold {fold} RMSE: {rmse:.2f}")
        
        print(f"\nMean RMSE: {np.mean(rmse_scores):.2f} (+/- {np.std(rmse_scores):.2f})")
        
        # Train final model on full data
        self.model = xgb.XGBRegressor(
            n_estimators=500,
            learning_rate=0.03,
            max_depth=6,
            min_child_weight=2,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.5,
            reg_lambda=0.5,
            random_state=42
        )
        
        self.model.fit(X, y)
        
        # Feature importance
        importance_df = pd.DataFrame({
            'feature': X.columns,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        print("\nTop 10 Important Features:")
        print(importance_df.head(10))
        
    def predict(self):
        self.test = self.preprocess_data(self.test, is_train=False)
        self.test = self.engineer_features(self.test, is_train=False)
        X_test = self.prepare_features(self.test, is_train=False)
        
        predictions = self.model.predict(X_test)
        
        submission = pd.DataFrame({
            'ID': range(len(predictions)),
            'Price': predictions
        })
        submission.to_csv('submission.csv', index=False)

if __name__ == "__main__":
    predictor = HousePricePredictor()
    predictor.load_data()
    predictor.train_model()
    predictor.predict()

Fold 1 RMSE: 51.66
Fold 2 RMSE: 46.91
Fold 3 RMSE: 64.08
Fold 4 RMSE: 53.51
Fold 5 RMSE: 45.60

Mean RMSE: 52.35 (+/- 6.55)

Top 10 Important Features:
                      feature  importance
0                  total_sqft    0.198666
1              log_total_sqft    0.169200
12    price_per_sqft_loc_mean    0.155214
5                        bath    0.116582
9       location_price_median    0.054946
8         location_price_mean    0.052585
15         location_price_std    0.041374
14        price_to_dist_ratio    0.040576
6                        size    0.035747
13  price_per_sqft_loc_median    0.030530


In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingRegressor
import re

class HousePricePredictor:
    def __init__(self):
        self.le = LabelEncoder()
        self.scaler = RobustScaler()
        self.model = None
        self.location_stats = None
        self.features = None
        
    def load_data(self):
        print("Loading datasets...")
        self.train = pd.read_csv('train.csv')
        self.test = pd.read_csv('test.csv')
        self.rent_data = pd.read_csv('avg_rent.csv')
        self.dist_data = pd.read_csv('dist_from_city_centre.csv')
        print("Datasets loaded successfully!")
        
    def clean_size(self, size):
        if pd.isna(size):
            return np.nan
        nums = re.findall(r'\d+', str(size))
        if nums:
            return float(nums[0])
        return np.nan
    
    def clean_total_sqft(self, sqft):
        if pd.isna(sqft):
            return np.nan
        try:
            if '-' in str(sqft):
                nums = [float(x.strip()) for x in sqft.split('-')]
                return sum(nums)/len(nums)
            return float(sqft)
        except:
            return np.nan
            
    def remove_outliers(self, df):
        print("Removing outliers...")
        df = df.copy()
        for column in ['total_sqft', 'bath', 'price']:
            if column in df.columns:
                Q1 = df[column].quantile(0.25)
                Q3 = df[column].quantile(0.75)
                IQR = Q3 - Q1
                df = df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]
        return df

    def preprocess_data(self, df, is_train=True):
        print("Preprocessing data...")
        df = df.copy()
        
        # Clean size and sqft
        df['size'] = df['size'].apply(self.clean_size)
        df['total_sqft'] = df['total_sqft'].apply(self.clean_total_sqft)
        
        # Handle missing values
        df['size'] = df['size'].fillna(df['size'].median())
        df['bath'] = df['bath'].fillna(df['bath'].median())
        df['balcony'] = df['balcony'].fillna(df['balcony'].median())
        
        # Transform area_type
        if is_train:
            df['area_type'] = self.le.fit_transform(df['area_type'])
        else:
            df['area_type'] = self.le.transform(df['area_type'])
        
        df['has_society'] = df['society'].notna().astype(int)
        
        return df
    
    def engineer_features(self, df, is_train=True):
        df = df.copy()
        
        # Merge additional data
        df = df.merge(self.rent_data, on='location', how='left')
        df = df.merge(self.dist_data, on='location', how='left')
        
        # Handle missing values for numeric columns first
        numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
        df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())
        
        # Area features
        df['log_total_sqft'] = np.log1p(df['total_sqft'])
        df['total_sqft_squared'] = df['total_sqft'] ** 2
        df['sqft_per_bedroom'] = df['total_sqft']/df['size']
        
        # Bath features
        df['bath_ratio'] = df['bath']/df['size']
        df['bath_per_sqft'] = df['bath']*1000/df['total_sqft']
        
        # Location features
        if is_train:
            self.location_stats = df.groupby('location').agg({
                'price': ['mean', 'median', 'std'],
                'total_sqft': ['mean'],
                'bath': ['mean']
            })
        
        # Add location features
        df['location_price_mean'] = df['location'].map(self.location_stats['price']['mean'])
        df['location_price_median'] = df['location'].map(self.location_stats['price']['median'])
        df['location_sqft_mean'] = df['location'].map(self.location_stats['total_sqft']['mean'])
        
        # Fill missing location features with global means
        location_features = ['location_price_mean', 'location_price_median', 'location_sqft_mean']
        for col in location_features:
            df[col] = df[col].fillna(df[col].mean())
        
        # Distance features
        df['log_dist_from_city'] = np.log1p(df['dist_from_city'])
        df['price_to_dist_ratio'] = df['location_price_mean'] / (df['dist_from_city'] + 1)
        
        # Fill any remaining numeric columns
        numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
        df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())
        
        return df

    def prepare_features(self, df, is_train=True):
        print("Preparing features for modeling...")
        if self.features is None:
            self.features = [
                'total_sqft', 'log_total_sqft', 'total_sqft_squared',
                'bath', 'size', 'dist_from_city',
                'location_price_mean', 'location_price_median',
                'price_per_sqft', 'price_to_dist_ratio',
                'total_sqft_per_bath', 'sqft_per_room'
            ]
            
        X = df[self.features]
        
        if is_train:
            X = pd.DataFrame(self.scaler.fit_transform(X), columns=self.features)
        else:
            X = pd.DataFrame(self.scaler.transform(X), columns=self.features)
        
        return X
    
    def train_model(self):
        print("\nStarting model training...")
        
        # Preprocess data
        self.train = self.preprocess_data(self.train, is_train=True)
        self.train = self.remove_outliers(self.train)
        self.train = self.engineer_features(self.train, is_train=True)
        
        # Prepare features
        X = self.prepare_features(self.train, is_train=True)
        y = self.train['price']
        
        # Cross validation
        print("\nPerforming cross-validation...")
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        rmse_scores = []
        
        for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            
            model = xgb.XGBRegressor(
                n_estimators=500,
                learning_rate=0.01,
                max_depth=6,
                min_child_weight=3,
                subsample=0.7,
                colsample_bytree=0.7,
                reg_alpha=1,
                reg_lambda=1,
                random_state=42
            )
            
            model.fit(X_train, y_train)
            val_pred = model.predict(X_val)
            rmse = np.sqrt(mean_squared_error(y_val, val_pred))
            rmse_scores.append(rmse)
            print(f"Fold {fold} RMSE: {rmse:.2f}")
        
        print(f"\nMean RMSE: {np.mean(rmse_scores):.2f} (+/- {np.std(rmse_scores):.2f})")
        
        # Train final model on all data
        print("\nTraining final model on full dataset...")
        self.model = xgb.XGBRegressor(
            n_estimators=500,
            learning_rate=0.01,
            max_depth=6,
            min_child_weight=3,
            subsample=0.7,
            colsample_bytree=0.7,
            reg_alpha=1,
            reg_lambda=1,
            random_state=42
        )
        
        self.model.fit(X, y)
        
        # Feature importance
        importance_df = pd.DataFrame({
            'feature': X.columns,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        print("\nTop 10 Important Features:")
        print(importance_df.head(10))
        
    def predict(self):
        print("\nGenerating predictions...")
        # Preprocess test data
        self.test = self.preprocess_data(self.test, is_train=False)
        self.test = self.engineer_features(self.test, is_train=False)
        X_test = self.prepare_features(self.test, is_train=False)
        
        # Make predictions
        predictions = self.model.predict(X_test)
        
        # Create submission file
        submission = pd.DataFrame({
            'ID': range(len(predictions)),
            'Price': predictions
        })
        submission.to_csv('submission.csv', index=False)
        print("\nPredictions saved to 'submission.csv'")

if __name__ == "__main__":
    try:
        print("Starting House Price Prediction...")
        predictor = HousePricePredictor()
        predictor.load_data()
        predictor.train_model()
        predictor.predict()
        print("\nProcess completed successfully!")
        
    except Exception as e:
        print(f"\nError occurred: {str(e)}")

Starting House Price Prediction...
Loading datasets...
Datasets loaded successfully!

Starting model training...
Preprocessing data...
Removing outliers...
Preparing features for modeling...

Error occurred: "['price_per_sqft', 'total_sqft_per_bath', 'sqft_per_room'] not in index"


In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import re

class HousePricePredictor:
    def __init__(self):
        self.le = LabelEncoder()
        self.scaler = StandardScaler()
        self.model = None
        self.location_stats = None
        self.features = None
        
    def load_data(self):
        """Load all required datasets"""
        try:
            print("Loading datasets...")
            self.train = pd.read_csv('train.csv')
            self.test = pd.read_csv('test.csv')
            self.rent_data = pd.read_csv('avg_rent.csv')
            self.dist_data = pd.read_csv('dist_from_city_centre.csv')
            print("Data loading completed successfully.")
        except Exception as e:
            print(f"Error loading data: {str(e)}")
            raise
            
    def clean_size(self, size):
        if pd.isna(size):
            return np.nan
        nums = re.findall(r'\d+', str(size))
        if nums:
            return float(nums[0])
        return np.nan
    
    def clean_total_sqft(self, sqft):
        if pd.isna(sqft):
            return np.nan
        try:
            if '-' in str(sqft):
                nums = [float(x.strip()) for x in sqft.split('-')]
                return sum(nums)/len(nums)
            return float(sqft)
        except:
            return np.nan

    def remove_outliers(self, df):
        df = df.copy()
        
        # Price per square foot outliers
        df['price_per_sqft'] = df['price']*100000/df['total_sqft']
        
        for location in df['location'].unique():
            location_df = df[df['location'] == location]
            if len(location_df) > 10:
                mean = location_df['price_per_sqft'].mean()
                std = location_df['price_per_sqft'].std()
                df = df[~((df['location'] == location) & 
                         (df['price_per_sqft'] > mean + 3*std))]
        
        # Bathroom outliers
        df = df[df['bath'] <= df['size'] + 2]
        
        # Area per bedroom outliers
        df['sqft_per_bedroom'] = df['total_sqft']/df['size']
        df = df[(df['sqft_per_bedroom'] >= 300) & 
                (df['sqft_per_bedroom'] <= 3000)]
        
        return df

    def preprocess_data(self, df, is_train=True):
        df = df.copy()
        
        # Clean size and sqft
        df['size'] = df['size'].apply(self.clean_size)
        df['total_sqft'] = df['total_sqft'].apply(self.clean_total_sqft)
        
        # Handle missing values
        df['size'] = df['size'].fillna(df['size'].median())
        df['bath'] = df['bath'].fillna(df['bath'].median())
        df['balcony'] = df['balcony'].fillna(df['balcony'].median())
        
        if is_train:
            df = self.remove_outliers(df)
        
        # Transform area_type
        if is_train:
            df['area_type'] = self.le.fit_transform(df['area_type'])
        else:
            df['area_type'] = self.le.transform(df['area_type'])
        
        return df
    
    def engineer_features(self, df, is_train=True):
        df = df.copy()
        
        # Merge additional data
        df = df.merge(self.rent_data, on='location', how='left')
        df = df.merge(self.dist_data, on='location', how='left')
        
        # Handle missing values for numeric columns first
        numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
        df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())
        
        # Area features
        df['log_total_sqft'] = np.log1p(df['total_sqft'])
        df['total_sqft_squared'] = df['total_sqft'] ** 2
        df['sqft_per_bedroom'] = df['total_sqft']/df['size']
        
        # Bath features
        df['bath_ratio'] = df['bath']/df['size']
        df['bath_per_sqft'] = df['bath']*1000/df['total_sqft']
        
        # Location features
        if is_train:
            self.location_stats = df.groupby('location').agg({
                'price': ['mean', 'median', 'std'],
                'total_sqft': ['mean'],
                'bath': ['mean']
            })
        
        # Add location features
        df['location_price_mean'] = df['location'].map(self.location_stats['price']['mean'])
        df['location_price_median'] = df['location'].map(self.location_stats['price']['median'])
        df['location_sqft_mean'] = df['location'].map(self.location_stats['total_sqft']['mean'])
        
        # Fill missing location features with global means
        location_features = ['location_price_mean', 'location_price_median', 'location_sqft_mean']
        for col in location_features:
            df[col] = df[col].fillna(df[col].mean())
        
        # Distance features
        df['log_dist_from_city'] = np.log1p(df['dist_from_city'])
        df['price_to_dist_ratio'] = df['location_price_mean'] / (df['dist_from_city'] + 1)
        
        # Fill any remaining numeric columns
        numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
        df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())
        
        return df

    def prepare_features(self, df, is_train=True):
        if self.features is None:
            self.features = [
                'total_sqft', 'log_total_sqft', 'sqft_per_bedroom',
                'bath', 'bath_ratio', 'bath_per_sqft',
                'size', 'log_dist_from_city',
                'location_price_mean', 'location_sqft_mean',
                'price_to_dist_ratio', 'area_type',
                'avg_2bhk_rent'
            ]
            
        X = df[self.features]
        
        if is_train:
            X = pd.DataFrame(self.scaler.fit_transform(X), columns=self.features)
        else:
            X = pd.DataFrame(self.scaler.transform(X), columns=self.features)
        
        return X
    
    def train_model(self):
        print("Starting model training...")
        self.train = self.preprocess_data(self.train, is_train=True)
        self.train = self.engineer_features(self.train, is_train=True)
        
        X = self.prepare_features(self.train, is_train=True)
        y = self.train['price']
        
        # Cross validation
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        rmse_scores = []
        
        for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            
            model = xgb.XGBRegressor(
                n_estimators=1000,
                learning_rate=0.01,
                max_depth=7,
                min_child_weight=3,
                subsample=0.7,
                colsample_bytree=0.7,
                reg_alpha=0.75,
                reg_lambda=0.75,
                random_state=42
            )
            
            model.fit(X_train, y_train)
            val_pred = model.predict(X_val)
            rmse = np.sqrt(mean_squared_error(y_val, val_pred))
            rmse_scores.append(rmse)
            print(f"Fold {fold} RMSE: {rmse:.2f}")
        
        print(f"\nMean RMSE: {np.mean(rmse_scores):.2f} (+/- {np.std(rmse_scores):.2f})")
        
        # Train final model
        self.model = xgb.XGBRegressor(
            n_estimators=1200,
            learning_rate=0.01,
            max_depth=7,
            min_child_weight=3,
            subsample=0.7,
            colsample_bytree=0.7,
            reg_alpha=0.75,
            reg_lambda=0.75,
            random_state=42
        )
        
        self.model.fit(X, y)
        
    def predict(self):
        print("Generating predictions...")
        self.test = self.preprocess_data(self.test, is_train=False)
        self.test = self.engineer_features(self.test, is_train=False)
        X_test = self.prepare_features(self.test, is_train=False)
        
        predictions = self.model.predict(X_test)
        
        submission = pd.DataFrame({
            'ID': range(len(predictions)),
            'Price': predictions
        })
        submission.to_csv('submission.csv', index=False)
        print("Predictions saved to 'submission.csv'")

if __name__ == "__main__":
    predictor = HousePricePredictor()
    predictor.load_data()
    predictor.train_model()
    predictor.predict()

Loading datasets...
Data loading completed successfully.
Starting model training...
Fold 1 RMSE: 43.87
Fold 2 RMSE: 53.35
Fold 3 RMSE: 54.94
Fold 4 RMSE: 50.86
Fold 5 RMSE: 47.59

Mean RMSE: 50.12 (+/- 3.99)
Generating predictions...
Predictions saved to 'submission.csv'


In [None]:


def engineer_features(self, df, is_train=True):
    df = df.copy()
    
    # Merge additional data
    df = df.merge(self.rent_data, on='location', how='left')
    df = df.merge(self.dist_data, on='location', how='left')
    
    # Handle missing numerics
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())
    
    # Enhanced area features
    df['log_total_sqft'] = np.log1p(df['total_sqft'])
    df['total_sqft_squared'] = df['total_sqft'] ** 2
    df['sqft_per_bedroom'] = df['total_sqft']/df['size']
    df['sqft_per_bath'] = df['total_sqft']/df['bath']
    
    # Price per sqft features (for training)
    if 'price' in df.columns:
        df['price_per_sqft'] = df['price']*100000/df['total_sqft']
    
    # Enhanced bath features
    df['bath_ratio'] = df['bath']/df['size']
    df['bath_per_sqft'] = df['bath']*1000/df['total_sqft']
    df['extra_bath'] = df['bath'] - df['size']  # Extra bathrooms beyond bedrooms
    
    # Location clustering
    if is_train:
        self.location_stats = df.groupby('location').agg({
            'price': ['count', 'mean', 'median', 'std'],
            'total_sqft': ['mean', 'std'],
            'bath': ['mean'],
            'size': ['mean'],
            'price_per_sqft': ['mean', 'std']
        })
        
        # Create location clusters based on price_per_sqft
        self.location_clusters = pd.qcut(
            self.location_stats['price_per_sqft']['mean'], 
            q=10, 
            labels=['cluster_'+str(i) for i in range(10)]
        )
    
    # Add location features with smoothing
    for stat in ['mean', 'median']:
        df[f'location_price_{stat}'] = df['location'].map(self.location_stats['price'][stat])
    
    df['location_price_std'] = df['location'].map(self.location_stats['price']['std'])
    df['location_density'] = df['location'].map(self.location_stats['price']['count'])
    
    # Location cluster features
    df['location_cluster'] = df['location'].map(self.location_clusters)
    df['cluster_encoded'] = self.le.fit_transform(df['location_cluster']) if is_train else \
                           self.le.transform(df['location_cluster'])
    
    # Enhanced distance features
    df['log_dist_from_city'] = np.log1p(df['dist_from_city'])
    df['dist_squared'] = df['dist_from_city'] ** 2
    df['price_to_dist_ratio'] = df['location_price_mean'] / (df['dist_from_city'] + 1)
    
    # Rent features
    df['rent_ratio'] = df['avg_2bhk_rent'] / df['total_sqft']
    df['rent_to_price'] = df['avg_2bhk_rent'] / df['location_price_mean']
    
    # Interaction features
    df['sqft_dist'] = df['log_total_sqft'] * df['log_dist_from_city']
    df['sqft_bath'] = df['log_total_sqft'] * df['bath_ratio']
    df['sqft_cluster'] = df['log_total_sqft'] * df['cluster_encoded']
    
    # Fill missing values
    for col in df.select_dtypes(include=['float64', 'int64']).columns:
        df[col] = df[col].fillna(df[col].mean())
    
    return df

def train_model(self):

    model_params = {
    'n_estimators': 2000,
    'learning_rate': 0.005,
    'max_depth': 8,
    'min_child_weight': 4,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.5,
    'reg_lambda': 0.5,
    'random_state': 42,
    'early_stopping_rounds': 100,
    'gamma': 0.1
}

    # Use k-fold cross validation
    kf = KFold(n_splits=10, shuffle=True, random_state=42)  # Increased folds

    # Train multiple models
    models = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model = xgb.XGBRegressor(**model_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )
        
        models.append(model)
        val_pred = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, val_pred))
        print(f"Fold {fold} RMSE: {rmse:.2f}")

    # Use model averaging for final predictions
    self.models = models
        # Use k-fold cross validation
    kf = KFold(n_splits=10, shuffle=True, random_state=42)  # Increased folds
    
    # Train multiple models
    models = []
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model = xgb.XGBRegressor(**model_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )
        
        models.append(model)
        val_pred = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, val_pred))
        print(f"Fold {fold} RMSE: {rmse:.2f}")
    
    # Use model averaging for final predictions
    self.models = models

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
import re

class HousePricePredictor:
    def __init__(self):
        self.le = LabelEncoder()
        self.scaler = StandardScaler()
        self.models = None
        self.location_stats = None
        self.features = None
        self.poly = PolynomialFeatures(degree=2, include_bias=False)
        
    def load_data(self):
        print("Loading datasets...")
        self.train = pd.read_csv('train.csv')
        self.test = pd.read_csv('test.csv')
        self.rent_data = pd.read_csv('avg_rent.csv')
        self.dist_data = pd.read_csv('dist_from_city_centre.csv')
        
    def clean_size(self, size):
        if pd.isna(size):
            return np.nan
        nums = re.findall(r'\d+', str(size))
        if nums:
            return float(nums[0])
        return np.nan
    
    def clean_total_sqft(self, sqft):
        if pd.isna(sqft):
            return np.nan
        try:
            if '-' in str(sqft):
                nums = [float(x.strip()) for x in sqft.split('-')]
                return sum(nums)/len(nums)
            return float(sqft)
        except:
            return np.nan

    def create_location_features(self, df):
        # Create location clusters
        location_price_map = df.groupby('location')['price'].mean().to_dict() if 'price' in df.columns else self.location_price_map
        df['location_price'] = df['location'].map(location_price_map)
        
        # Location clustering
        if not hasattr(self, 'kmeans'):
            loc_features = df.groupby('location').agg({
                'total_sqft': 'mean',
                'bath': 'mean',
                'dist_from_city': 'mean',
                'avg_2bhk_rent': 'mean'
            }).fillna(method='ffill')
            
            self.kmeans = KMeans(n_clusters=5, random_state=42)
            self.kmeans.fit(loc_features)
            self.location_clusters = {loc: cluster for loc, cluster in zip(loc_features.index, self.kmeans.labels_)}
            
        df['location_cluster'] = df['location'].map(self.location_clusters)
        
        return df

    def engineer_features(self, df, is_train=True):
        df = df.copy()
        
        # Merge additional data
        df = df.merge(self.rent_data, on='location', how='left')
        df = df.merge(self.dist_data, on='location', how='left')
        
        # Handle missing values
        numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
        
        # Basic features
        df['log_total_sqft'] = np.log1p(df['total_sqft'])
        df['total_sqft_squared'] = df['total_sqft'] ** 2
        df['bath_ratio'] = df['bath'] / df['size']
        df['bath_per_sqft'] = df['bath'] * 1000 / df['total_sqft']
        df['extra_bath'] = df['bath'] - df['size']
        df['sqft_per_room'] = df['total_sqft'] / df['size']
        
        # Price features for training
        if is_train:
            df['price_per_sqft'] = df['price'] * 100000 / df['total_sqft']
            self.price_stats = df.groupby('location')['price'].agg(['mean', 'median', 'std']).fillna(method='ffill')
            self.location_price_map = df.groupby('location')['price'].mean().to_dict()
            
        # Location features
        df = self.create_location_features(df)
        
        # Distance features
        df['log_dist'] = np.log1p(df['dist_from_city'])
        df['dist_squared'] = df['dist_from_city'] ** 2
        df['dist_per_sqft'] = df['dist_from_city'] / df['total_sqft']
        
        # Rent features
        df['log_rent'] = np.log1p(df['avg_2bhk_rent'])
        df['rent_ratio'] = df['avg_2bhk_rent'] / df['total_sqft']
        df['rent_per_room'] = df['avg_2bhk_rent'] / df['size']
        
        # Interaction features
        df['sqft_bath'] = df['log_total_sqft'] * df['bath_ratio']
        df['sqft_dist'] = df['log_total_sqft'] * df['log_dist']
        df['bath_dist'] = df['bath_ratio'] * df['log_dist']
        
        # Ready to move feature
        df['ready_to_move'] = (df['availability'] == 'Ready To Move').astype(int)
        
        return df
        
    def prepare_features(self, df, is_train=True):
        if self.features is None:
            self.features = [
                'total_sqft', 'log_total_sqft', 'total_sqft_squared',
                'bath', 'bath_ratio', 'bath_per_sqft', 'extra_bath',
                'size', 'sqft_per_room', 'location_cluster',
                'log_dist', 'dist_squared', 'dist_per_sqft',
                'log_rent', 'rent_ratio', 'rent_per_room',
                'sqft_bath', 'sqft_dist', 'bath_dist',
                'ready_to_move', 'area_type'
            ]
        
        X = df[self.features]
        
        # Add polynomial features for key numeric columns
        numeric_features = ['total_sqft', 'bath_ratio', 'log_dist']
        poly_features = self.poly.fit_transform(X[numeric_features]) if is_train else \
                       self.poly.transform(X[numeric_features])
        poly_df = pd.DataFrame(poly_features, columns=[f'poly_{i}' for i in range(poly_features.shape[1])])
        
        X = pd.concat([X, poly_df], axis=1)
        
        if is_train:
            X = pd.DataFrame(self.scaler.fit_transform(X), columns=X.columns)
        else:
            X = pd.DataFrame(self.scaler.transform(X), columns=X.columns)
            
        return X
    def preprocess_data(self, df, is_train=True):
        print("Preprocessing data...")
        df = df.copy()
        
        # Clean size and sqft
        df['size'] = df['size'].apply(self.clean_size)
        df['total_sqft'] = df['total_sqft'].apply(self.clean_total_sqft)
        
        # Remove outliers for training data
        if is_train:
            # Price per sqft based outlier removal
            df['price_per_sqft'] = df['price'] * 100000 / df['total_sqft']
            for location in df['location'].unique():
                location_df = df[df['location'] == location]
                if len(location_df) >= 10:  # Only for locations with enough samples
                    mean = location_df['price_per_sqft'].mean()
                    std = location_df['price_per_sqft'].std()
                    df = df[~((df['location'] == location) & 
                            (df['price_per_sqft'] > mean + 3 * std))]
            
            # Remove unrealistic bath counts
            df = df[df['bath'] <= df['size'] + 2]
            
            # Remove extreme sqft per bedroom properties
            df['sqft_per_bedroom'] = df['total_sqft'] / df['size']
            df = df[(df['sqft_per_bedroom'] >= 300) & 
                    (df['sqft_per_bedroom'] <= 3000)]
        
        # Handle missing values
        df['size'] = df['size'].fillna(df['size'].median())
        df['bath'] = df['bath'].fillna(df['bath'].median())
        df['balcony'] = df['balcony'].fillna(df['balcony'].median())
        
        # Transform area_type
        if is_train:
            df['area_type'] = self.le.fit_transform(df['area_type'])
        else:
            df['area_type'] = self.le.transform(df['area_type'])
        
        return df
    def train_model(self):
        print("Starting model training...")
        self.train = self.preprocess_data(self.train, is_train=True)
        self.train = self.engineer_features(self.train, is_train=True)
        
        X = self.prepare_features(self.train, is_train=True)
        y = self.train['price']
        
        xgb_models = []
        lgb_models = []
        cat_models = []
        
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        
        for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            
            # XGBoost
            xgb_model = xgb.XGBRegressor(
                n_estimators=3000,
                learning_rate=0.003,
                max_depth=9,
                min_child_weight=5,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=0.6,
                reg_lambda=0.6,
                random_state=42
            )
            
            # LightGBM
            lgb_model = lgb.LGBMRegressor(
                n_estimators=3000,
                learning_rate=0.003,
                num_leaves=40,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=0.6,
                reg_lambda=0.6,
                random_state=42
            )
            
            # CatBoost
            cat_model = cb.CatBoostRegressor(
                iterations=3000,
                learning_rate=0.003,
                depth=9,
                l2_leaf_reg=5,
                random_state=42,
                verbose=False
            )
            
            # Train models
            xgb_model.fit(X_train, y_train)
            lgb_model.fit(X_train, y_train)
            cat_model.fit(X_train, y_train)
            
            xgb_models.append(xgb_model)
            lgb_models.append(lgb_model)
            cat_models.append(cat_model)
            
            # Blend predictions
            xgb_pred = xgb_model.predict(X_val)
            lgb_pred = lgb_model.predict(X_val)
            cat_pred = cat_model.predict(X_val)
            
            blend_pred = (0.4 * xgb_pred + 0.3 * lgb_pred + 0.3 * cat_pred)
            rmse = np.sqrt(mean_squared_error(y_val, blend_pred))
            print(f"Fold {fold} RMSE: {rmse:.2f}")
        
        self.models = {
            'xgb': xgb_models,
            'lgb': lgb_models,
            'cat': cat_models
        }
     
    def predict(self):
        print("Generating predictions...")
        self.test = self.preprocess_data(self.test, is_train=False)
        self.test = self.engineer_features(self.test, is_train=False)
        X_test = self.prepare_features(self.test, is_train=False)
        
        # Get predictions from all models
        xgb_preds = np.mean([model.predict(X_test) for model in self.models['xgb']], axis=0)
        lgb_preds = np.mean([model.predict(X_test) for model in self.models['lgb']], axis=0)
        cat_preds = np.mean([model.predict(X_test) for model in self.models['cat']], axis=0)
        
        # Weighted blend
        final_preds = (0.4 * xgb_preds + 0.3 * lgb_preds + 0.3 * cat_preds)
        
        # Create submission
        submission = pd.DataFrame({
            'ID': range(len(final_preds)),
            'Price': final_preds
        })
        submission.to_csv('submission.csv', index=False)
        print("Predictions saved to 'submission.csv'")

if __name__ == "__main__":
    predictor = HousePricePredictor()
    predictor.load_data()
    predictor.train_model()
    predictor.predict()

Loading datasets...
Starting model training...
Preprocessing data...


  self.price_stats = df.groupby('location')['price'].agg(['mean', 'median', 'std']).fillna(method='ffill')
  }).fillna(method='ffill')


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001614 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4914
[LightGBM] [Info] Number of data points in the train set: 7887, number of used features: 30
[LightGBM] [Info] Start training from score 108.600595
Fold 1 RMSE: 68.55
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004509 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4931
[LightGBM] [Info] Number of data points in the train set: 7887, number of used features: 30
[LightGBM] [Info] Start training from score 106.527115
Fold 2 RMSE: 75.82
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000828 seconds.
You can set `force_col_wise=true` to remove 