In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.impute import SimpleImputer
import re

In [3]:
# Load data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
avg_rent = pd.read_csv('./data/avg_rent.csv')
dist_city = pd.read_csv('./data/dist_from_city_centre.csv')

print(train.head(5))

   ID             area_type   availability                  location  \
0   0  Super built-up  Area         19-Dec  Electronic City Phase II   
1   1            Plot  Area  Ready To Move          Chikka Tirupathi   
2   2        Built-up  Area  Ready To Move               Uttarahalli   
3   3  Super built-up  Area  Ready To Move        Lingadheeranahalli   
4   4  Super built-up  Area  Ready To Move                  Kothanur   

        size  society total_sqft  bath  balcony   price  
0      2 BHK  Coomee        1056   2.0      1.0   39.07  
1  4 Bedroom  Theanmp       2600   5.0      3.0  120.00  
2      3 BHK      NaN       1440   2.0      3.0   62.00  
3      3 BHK  Soiewre       1521   3.0      1.0   95.00  
4      2 BHK      NaN       1200   2.0      1.0   51.00  


In [4]:
# Merge external data
train = train.merge(avg_rent, on='location', how='left')
train = train.merge(dist_city, on='location', how='left')

test = test.merge(avg_rent, on='location', how='left')
test = test.merge(dist_city, on='location', how='left')


In [5]:

# Handle missing values
train.fillna({'bath': train['bath'].median(), 'balcony': 0, 'avg_2bhk_rent': 0, 'dist_from_city': train['dist_from_city'].median()}, inplace=True)
test.fillna({'bath': test['bath'].median(), 'balcony': 0, 'avg_2bhk_rent': 0, 'dist_from_city': test['dist_from_city'].median()}, inplace=True)


In [6]:

# Function to convert 'total_sqft' to a numeric value
def convert_sqft_to_num(sqft):
    try:
        # Handle ranges (e.g., "2100 - 2850")
        if '-' in sqft:
            sqft_range = sqft.split('-')
            return (float(sqft_range[0]) + float(sqft_range[1])) / 2  # Average of the range
        
        # Ignore strings containing keywords (e.g., "Super built-up Area")
        if any(keyword in sqft.lower() for keyword in ['built-up', 'super', 'carpet', 'area']):
            return np.nan
        
        # Handle purely numeric values
        if sqft.replace('.', '', 1).isdigit():
            return float(sqft)
        
        # Extract numeric values from text (e.g., "2100 sqft")
        sqft_numeric = re.findall(r'\d+\.?\d*', sqft)
        if sqft_numeric:
            return float(sqft_numeric[0])  # Take the first numeric part
        
        return np.nan  # Return NaN for unhandled cases
    except Exception as e:
        print(f"Error converting '{sqft}': {e}")
        return np.nan  # Fallback for unexpected issues

In [7]:
# Apply the updated function to total_sqft
train['total_sqft'] = train['total_sqft'].astype(str).apply(convert_sqft_to_num)
test['total_sqft'] = test['total_sqft'].astype(str).apply(convert_sqft_to_num)


In [8]:

# Handle missing or invalid values (if any rows have NaN after conversion)
train['total_sqft'].fillna(train['total_sqft'].median(), inplace=True)
test['total_sqft'].fillna(test['total_sqft'].median(), inplace=True)


In [9]:

# Feature engineering for train data
train['num_bedrooms'] = train['size'].str.extract('(\d+)').astype(float)
train['price_per_sqft'] = train['price'] / train['total_sqft']
train['rent_to_price_ratio'] = train['avg_2bhk_rent'] / train['price']


In [10]:

# Feature engineering for test data (no price-related features)
test['num_bedrooms'] = test['size'].str.extract('(\d+)').astype(float)
test['price_per_sqft'] = test['total_sqft']  # Placeholder for testing, since we don't have price
test['rent_to_price_ratio'] = test['avg_2bhk_rent']  # Placeholder for testing, since we don't have price


In [11]:

# Remove any remaining invalid entries in the dataset
train.dropna(inplace=True)

In [12]:

# Split data
X = train.drop(columns=['price','ID'])
y = train['price']


In [13]:

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Create a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.select_dtypes(exclude=['object']).columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Create a pipeline with preprocessing and model training
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])


In [14]:
# #Define model
# model = RandomForestRegressor(random_state=42)

# # Hyperparameter tuning using GridSearchCV
# param_grid = {
#     'n_estimators': [100, 200, 500],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }
# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('model', grid_search)
# ])


In [15]:

# Split data for training
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"Validation RMSE: {rmse}")
print(f"Validation MAE: {mae}")
print(f"Validation R²: {r2}")



Validation RMSE: 27.181167018514568
Validation MAE: 2.8293075562700962
Validation R²: 0.9416365875706646


In [20]:
# Handle missing values in the test set (Impute missing values)
# Create an imputer for numerical columns
num_imputer = SimpleImputer(strategy='mean')  # You can also use 'median' or 'most_frequent' based on your preference
cat_imputer = SimpleImputer(strategy='most_frequent')  # For categorical columns, we can impute the most frequent value

# Apply the imputers to the test set before prediction
numerical_cols = test.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = test.select_dtypes(include=['object']).columns

# Impute missing numerical data
test[numerical_cols] = num_imputer.fit_transform(test[numerical_cols])

# Impute missing categorical data
test[categorical_cols] = cat_imputer.fit_transform(test[categorical_cols])

# Ensure 'ID' column is dropped before prediction
test_processed = test.drop(columns=['ID'])

# Predict on test data
test_pred = pipeline.predict(test_processed)

# Save predictions with ID as integer
test['price'] = test_pred

# Convert 'ID' column to integer type before saving
test['ID'] = test['ID'].astype(int)

# Save the predictions to 'submission.csv'
test[['ID','price']].to_csv('submission.csv', index=False)

print("Predictions have been saved to 'submission.csv'")

Predictions have been saved to 'submission.csv'
