# In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import re

class HousePricePredictor:
    """Train an XGBoost regressor on house listings and write predictions.

    Typical use is ``load_data()`` -> ``train_model()`` -> ``predict()``.

    Every statistic used for imputation or feature construction (medians,
    numeric means, per-location aggregates) is learned on the training set
    and reused unchanged at prediction time, so test rows are transformed
    exactly like training rows.

    Attributes set in ``__init__``:
        le: LabelEncoder for the ``area_type`` column.
        scaler: StandardScaler applied to the model features.
        model: fitted XGBRegressor, or None before ``train_model()``.
        location_stats: per-location aggregates learned from training data.
        features: ordered list of model feature column names.
        impute_medians: training medians for size/bath/balcony.
        numeric_means: training column means, keyed by imputation stage.
        location_defaults: training-wide fallbacks for location features.
    """

    # Columns whose gaps are filled with a median in preprocess_data.
    _MEDIAN_FILL_COLUMNS = ('size', 'bath', 'balcony')

    # Location-derived columns that need a fallback for unseen locations.
    _LOCATION_FEATURES = (
        'location_price_mean', 'location_price_median', 'location_sqft_mean',
    )

    # Hyper-parameters shared by the cross-validation and final models.
    _XGB_PARAMS = {
        'learning_rate': 0.01,
        'max_depth': 7,
        'min_child_weight': 3,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'reg_alpha': 0.75,
        'reg_lambda': 0.75,
        'random_state': 42,
    }

    def __init__(self):
        self.le = LabelEncoder()
        self.scaler = StandardScaler()
        self.model = None
        self.location_stats = None
        self.features = None
        # Training-set statistics reused when transforming the test set.
        self.impute_medians = None
        self.numeric_means = {}
        self.location_defaults = None

    def load_data(self):
        """Load all required datasets from CSV files in the working directory.

        Reads ``train.csv``, ``test.csv``, ``avg_rent.csv`` and
        ``dist_from_city_centre.csv`` into ``self.train``, ``self.test``,
        ``self.rent_data`` and ``self.dist_data``.

        Raises:
            Exception: any error raised while reading is reported on stdout
                and then re-raised unchanged.
        """
        try:
            print("Loading datasets...")
            self.train = pd.read_csv('train.csv')
            self.test = pd.read_csv('test.csv')
            self.rent_data = pd.read_csv('avg_rent.csv')
            self.dist_data = pd.read_csv('dist_from_city_centre.csv')
            print("Data loading completed successfully.")
        except Exception as e:
            print(f"Error loading data: {str(e)}")
            raise

    def clean_size(self, size):
        """Return the first run of digits in ``size`` as a float.

        For example ``'3 BHK'`` gives ``3.0``. Missing values and values
        without any digit give NaN.
        """
        if pd.isna(size):
            return np.nan
        nums = re.findall(r'\d+', str(size))
        if nums:
            return float(nums[0])
        return np.nan

    def clean_total_sqft(self, sqft):
        """Convert a raw ``total_sqft`` value to a float.

        A range such as ``'1000 - 1200'`` becomes the mean of its parts.
        Anything that cannot be parsed as a number gives NaN.
        """
        if pd.isna(sqft):
            return np.nan
        # Work on the string form so numeric inputs take the same path.
        text = str(sqft)
        try:
            if '-' in text:
                nums = [float(part.strip()) for part in text.split('-')]
                return sum(nums) / len(nums)
            return float(text)
        except ValueError:
            # Non-numeric content such as unit suffixes.
            return np.nan

    def remove_outliers(self, df):
        """Drop implausible training rows and return the filtered copy.

        Three filters are applied in order:
          1. For locations with more than 10 rows, rows whose price per
             square foot exceeds that location's mean + 3 standard deviations.
          2. Rows with more bathrooms than ``size + 2``.
          3. Rows whose area per bedroom is outside [300, 3000].

        The helper columns ``price_per_sqft`` and ``sqft_per_bedroom`` are
        left on the returned frame.
        """
        df = df.copy()

        # Price per square foot outliers.
        # NOTE(review): the 100000 factor presumably converts a price quoted
        # in lakhs to rupees - confirm against the dataset.
        df['price_per_sqft'] = df['price'] * 100000 / df['total_sqft']

        # One pass over the groups instead of re-filtering the whole frame
        # once per location; rows with a missing location map to NaN and are
        # therefore never flagged.
        by_location = df.groupby('location')['price_per_sqft']
        group_size = df['location'].map(by_location.size())
        upper_bound = df['location'].map(
            by_location.mean() + 3 * by_location.std()
        )
        is_price_outlier = (group_size > 10) & (df['price_per_sqft'] > upper_bound)
        df = df[~is_price_outlier]

        # Bathroom outliers
        df = df[df['bath'] <= df['size'] + 2]

        # Area per bedroom outliers
        df['sqft_per_bedroom'] = df['total_sqft'] / df['size']
        df = df[(df['sqft_per_bedroom'] >= 300) &
                (df['sqft_per_bedroom'] <= 3000)]

        return df

    def preprocess_data(self, df, is_train=True):
        """Clean raw columns, impute medians and encode ``area_type``.

        Args:
            df: raw listings frame; not modified.
            is_train: when True, medians and the label encoder are fitted on
                ``df`` and outliers are removed. When False, the values
                learned during training are reused.

        Returns:
            A cleaned copy of ``df``.

        Raises:
            ValueError: from ``LabelEncoder.transform`` when ``is_train`` is
                False and ``area_type`` holds a category unseen in training.
        """
        df = df.copy()

        # Clean size and sqft
        df['size'] = df['size'].apply(self.clean_size)
        df['total_sqft'] = df['total_sqft'].apply(self.clean_total_sqft)

        # Handle missing values. Test data is filled with the training
        # medians so both sets are imputed identically; the frame's own
        # medians are only a fallback when nothing was learned yet.
        if is_train or self.impute_medians is None:
            medians = {col: df[col].median() for col in self._MEDIAN_FILL_COLUMNS}
            if is_train:
                self.impute_medians = medians
        else:
            medians = self.impute_medians
        for col in self._MEDIAN_FILL_COLUMNS:
            df[col] = df[col].fillna(medians[col])

        if is_train:
            df = self.remove_outliers(df)

        # Transform area_type
        if is_train:
            df['area_type'] = self.le.fit_transform(df['area_type'])
        else:
            df['area_type'] = self.le.transform(df['area_type'])

        return df

    def _fill_numeric_with_means(self, df, stage, is_train):
        """Fill NaNs in float64/int64 columns with per-column means.

        Training calls store the means under ``stage``; later non-training
        calls reuse them and fall back to the frame's own mean only for
        columns that had no usable training mean.
        """
        numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
        own_means = df[numeric_columns].mean()

        stored = None if is_train else self.numeric_means.get(stage)
        if stored is None:
            means = own_means
        else:
            means = stored.reindex(numeric_columns).fillna(own_means)

        if is_train:
            self.numeric_means[stage] = own_means

        df[numeric_columns] = df[numeric_columns].fillna(means)
        return df

    def engineer_features(self, df, is_train=True):
        """Join lookup tables and derive the model's input features.

        Args:
            df: output of ``preprocess_data``; not modified.
            is_train: when True, per-location statistics and imputation
                means are learned from ``df``; otherwise the stored training
                values are applied.

        Returns:
            A new frame with the merged and derived columns added.

        Raises:
            RuntimeError: if ``is_train`` is False and no location statistics
                have been learned yet.
        """
        if not is_train and self.location_stats is None:
            raise RuntimeError(
                "engineer_features(is_train=False) requires location "
                "statistics from a previous training call."
            )

        df = df.copy()

        # Merge additional data.
        # NOTE(review): a left merge duplicates rows if a lookup table lists
        # a location more than once - confirm the tables are unique per
        # location.
        df = df.merge(self.rent_data, on='location', how='left')
        df = df.merge(self.dist_data, on='location', how='left')

        # Handle missing values for numeric columns first
        df = self._fill_numeric_with_means(df, 'pre', is_train)

        # Area features
        df['log_total_sqft'] = np.log1p(df['total_sqft'])
        df['total_sqft_squared'] = df['total_sqft'] ** 2
        df['sqft_per_bedroom'] = df['total_sqft'] / df['size']

        # Bath features
        df['bath_ratio'] = df['bath'] / df['size']
        df['bath_per_sqft'] = df['bath'] * 1000 / df['total_sqft']

        # Location features.
        # NOTE(review): these aggregates include the target and are computed
        # on the full training set, so the cross-validation folds in
        # train_model see statistics derived from their own validation rows.
        if is_train:
            self.location_stats = df.groupby('location').agg({
                'price': ['mean', 'median', 'std'],
                'total_sqft': ['mean'],
                'bath': ['mean']
            })

        # Add location features
        df['location_price_mean'] = df['location'].map(self.location_stats['price']['mean'])
        df['location_price_median'] = df['location'].map(self.location_stats['price']['median'])
        df['location_sqft_mean'] = df['location'].map(self.location_stats['total_sqft']['mean'])

        # Locations unseen in training get the training-wide mean of each
        # feature. Using the current frame's mean would give NaN whenever
        # every row of the frame is unseen.
        if is_train or self.location_defaults is None:
            defaults = {col: df[col].mean() for col in self._LOCATION_FEATURES}
            if is_train:
                self.location_defaults = defaults
        else:
            defaults = self.location_defaults
        for col in self._LOCATION_FEATURES:
            df[col] = df[col].fillna(defaults[col])

        # Distance features
        df['log_dist_from_city'] = np.log1p(df['dist_from_city'])
        df['price_to_dist_ratio'] = df['location_price_mean'] / (df['dist_from_city'] + 1)

        # Fill any remaining numeric columns
        df = self._fill_numeric_with_means(df, 'post', is_train)

        return df

    def prepare_features(self, df, is_train=True):
        """Select the model features from ``df`` and standardise them.

        The scaler is fitted when ``is_train`` is True and only applied
        otherwise. The returned frame has a fresh 0..n-1 index.
        """
        if self.features is None:
            self.features = [
                'total_sqft', 'log_total_sqft', 'sqft_per_bedroom',
                'bath', 'bath_ratio', 'bath_per_sqft',
                'size', 'log_dist_from_city',
                'location_price_mean', 'location_sqft_mean',
                'price_to_dist_ratio', 'area_type',
                'avg_2bhk_rent'
            ]

        X = df[self.features]

        if is_train:
            scaled = self.scaler.fit_transform(X)
        else:
            scaled = self.scaler.transform(X)

        return pd.DataFrame(scaled, columns=self.features)

    def _build_model(self, n_estimators):
        """Return an unfitted XGBRegressor with the shared hyper-parameters."""
        return xgb.XGBRegressor(n_estimators=n_estimators, **self._XGB_PARAMS)

    def train_model(self):
        """Preprocess ``self.train``, report 5-fold CV RMSE and fit the model.

        ``self.train`` is replaced by its processed version and the final
        estimator, fitted on all training rows, is stored in ``self.model``.
        """
        print("Starting model training...")
        self.train = self.preprocess_data(self.train, is_train=True)
        self.train = self.engineer_features(self.train, is_train=True)

        X = self.prepare_features(self.train, is_train=True)
        y = self.train['price']

        # Cross validation
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        rmse_scores = []

        for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            model = self._build_model(n_estimators=1000)
            model.fit(X_train, y_train)
            val_pred = model.predict(X_val)
            rmse = np.sqrt(mean_squared_error(y_val, val_pred))
            rmse_scores.append(rmse)
            print(f"Fold {fold} RMSE: {rmse:.2f}")

        print(f"\nMean RMSE: {np.mean(rmse_scores):.2f} (+/- {np.std(rmse_scores):.2f})")

        # Train final model
        self.model = self._build_model(n_estimators=1200)
        self.model.fit(X, y)

    def predict(self):
        """Predict prices for ``self.test`` and write ``submission.csv``.

        ``self.test`` is replaced by its processed version. The CSV has the
        columns ``ID`` (0-based row number) and ``Price``.

        Raises:
            RuntimeError: if called before ``train_model()``.
        """
        if self.model is None:
            raise RuntimeError("predict() called before train_model().")

        print("Generating predictions...")
        self.test = self.preprocess_data(self.test, is_train=False)
        self.test = self.engineer_features(self.test, is_train=False)
        X_test = self.prepare_features(self.test, is_train=False)

        predictions = self.model.predict(X_test)

        submission = pd.DataFrame({
            'ID': range(len(predictions)),
            'Price': predictions
        })
        submission.to_csv('submission.csv', index=False)
        print("Predictions saved to 'submission.csv'")

if __name__ == "__main__":
    # Run the whole pipeline in order: read the CSVs, fit the model,
    # then write the submission file.
    predictor = HousePricePredictor()
    for step in (predictor.load_data, predictor.train_model, predictor.predict):
        step()

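# Notebook output recorded when this cell was run:
# AttributeError: partially initialized module 'pandas' has no attribute '_pandas_parser_CAPI' (most likely due to a circular import)
# NOTE(review): this error is raised while importing pandas, not by the code above. It presumably
# indicates a broken or shadowed pandas installation (e.g. a local file named pandas.py, or a
# package upgraded while the kernel was running) - confirm, then restart the kernel.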