In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
import re

# Load data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
avg_rent = pd.read_csv('./data/avg_rent.csv')
dist_city = pd.read_csv('./data/dist_from_city_centre.csv')

# Merge external data. Left joins keep every listing; a listing whose
# location is absent from a lookup table gets NaN in the joined columns.
train = train.merge(avg_rent, on='location', how='left').merge(dist_city, on='location', how='left')
test = test.merge(avg_rent, on='location', how='left').merge(dist_city, on='location', how='left')

# Handle missing values.
# The fill values are computed once, from the training frame only, and reused
# for the test frame so that both are imputed with identical values (filling
# the test set with its own medians would transform the two sets differently).
fill_values = {
    'bath': train['bath'].median(),
    'balcony': 0,
    'avg_2bhk_rent': 0,
    'dist_from_city': train['dist_from_city'].median(),
}
train = train.fillna(fill_values)
test = test.fillna(fill_values)

# Function to convert 'total_sqft' to a numeric value
def convert_sqft_to_num(sqft):
    """Convert a raw ``total_sqft`` entry to a float.

    Rules, applied in order:

    1. A real number (int/float, including numpy scalars, but not bool) is
       returned as ``float``.
    2. Text containing ``-`` is treated as a range and the mean of its first
       two parts is returned.
    3. Text mentioning 'built-up', 'super', 'carpet' or 'area' yields NaN.
    4. A plain numeric string is parsed directly.
    5. Otherwise the first number found in the text is returned; any unit
       suffix is ignored, not converted.

    Returns ``np.nan`` for anything that cannot be parsed (including
    non-string, non-numeric input such as ``None``).
    """
    if isinstance(sqft, (int, float, np.integer, np.floating)) and not isinstance(sqft, bool):
        return float(sqft)
    if not isinstance(sqft, str):
        return np.nan
    try:
        if '-' in sqft:
            low, high = sqft.split('-')[:2]
            return (float(low) + float(high)) / 2
        if any(keyword in sqft.lower() for keyword in ['built-up', 'super', 'carpet', 'area']):
            return np.nan
        if sqft.replace('.', '', 1).isdigit():
            return float(sqft)
        sqft_numeric = re.findall(r'\d+\.?\d*', sqft)
        if sqft_numeric:
            return float(sqft_numeric[0])
        return np.nan
    except ValueError:
        # Only parse failures are expected here; a bare ``except`` would also
        # hide KeyboardInterrupt and genuine programming errors.
        return np.nan

# Parse the free-text 'total_sqft' column into floats (unparseable -> NaN).
train['total_sqft'] = train['total_sqft'].astype(str).apply(convert_sqft_to_num)
test['total_sqft'] = test['total_sqft'].astype(str).apply(convert_sqft_to_num)

# Impute both frames with the training median. The result is assigned back
# rather than calling fillna(inplace=True) on a column selection: that chained
# form is deprecated and does not modify the frame under pandas Copy-on-Write.
sqft_median = train['total_sqft'].median()
train['total_sqft'] = train['total_sqft'].fillna(sqft_median)
test['total_sqft'] = test['total_sqft'].fillna(sqft_median)

# Feature engineering
# Only features that can be computed without the target are created.
# 'price_per_sqft' and 'rent_to_price_ratio' were derived from 'price', which
# leaks the target into the model inputs and cannot be reproduced on the test
# set (the former test "placeholders" held unrelated quantities).
train['num_bedrooms'] = train['size'].str.extract(r'(\d+)', expand=False).astype(float)
test['num_bedrooms'] = test['size'].str.extract(r'(\d+)', expand=False).astype(float)
# Test rows cannot be dropped, so a missing bedroom count is imputed instead.
test['num_bedrooms'] = test['num_bedrooms'].fillna(train['num_bedrooms'].median())

# Kept because the feature/target split below drops this column by name.
train['log_price'] = np.log1p(train['price'])

# Drop every training row that still has a missing value in any column.
train = train.dropna()

# Build the modelling inputs: every column except the raw/log target and the
# row identifier is a feature; the target is the price on the log1p scale.
y = np.log1p(train['price'])
X = train.drop(columns=['price', 'log_price', 'ID'])

# Object-dtype columns are one-hot encoded, all other columns are standardised.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numeric_columns = X.select_dtypes(exclude=['object']).columns

scaling_step = ('num', StandardScaler(), numeric_columns)
encoding_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])

# Random forest whose hyper-parameters are picked by an exhaustive grid search
# (3 * 4 * 3 * 3 = 108 candidates, 5 folds each).
model = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)

# The grid search is the final pipeline step, so it receives the matrix
# produced by the preprocessor.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', grid_search)])

# Hold out 20% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

# Fit the preprocessing and run the grid search on the training split.
pipeline.fit(X_train, y_train)

# Score the hold-out split. The model predicts log1p(price), so truth and
# predictions are mapped back with expm1 before the metrics are computed.
y_val_pred = pipeline.predict(X_val)
val_price_true = np.expm1(y_val)
val_price_pred = np.expm1(y_val_pred)

rmse = np.sqrt(mean_squared_error(val_price_true, val_price_pred))
mae = mean_absolute_error(val_price_true, val_price_pred)
r2 = r2_score(val_price_true, val_price_pred)

print(f"Validation RMSE: {rmse}")
print(f"Validation MAE: {mae}")
print(f"Validation R²: {r2}")

# Predict prices for the test set (the identifier is not a feature).
test_processed = test.drop(columns=['ID'])
test_pred_log = pipeline.predict(test_processed)
test_pred = np.expm1(test_pred_log)

# Attach the predictions and write the two-column submission file.
test['predicted_price'] = test_pred
test['ID'] = test['ID'].astype(int)
submission = test[['ID', 'predicted_price']]
submission.to_csv('submission.csv', index=False)

print("Predictions have been saved to 'submission.csv'")


Fitting 5 folds for each of 108 candidates, totalling 540 fits


KeyboardInterrupt: 

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
import re

# Load data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
avg_rent = pd.read_csv('./data/avg_rent.csv')
dist_city = pd.read_csv('./data/dist_from_city_centre.csv')

# Merge external data. Left joins keep every listing; a listing whose
# location is absent from a lookup table gets NaN in the joined columns.
train = train.merge(avg_rent, on='location', how='left').merge(dist_city, on='location', how='left')
test = test.merge(avg_rent, on='location', how='left').merge(dist_city, on='location', how='left')

# Handle missing values.
# The fill values come from the training frame only and are applied to both
# frames, so train and test are imputed with exactly the same values instead
# of each using its own statistics.
fill_values = {
    'bath': train['bath'].median(),
    'balcony': 0,
    'avg_2bhk_rent': 0,
    'dist_from_city': train['dist_from_city'].median(),
}
train = train.fillna(fill_values)
test = test.fillna(fill_values)

# Function to convert 'total_sqft' to a numeric value
def convert_sqft_to_num(sqft):
    """Convert a raw ``total_sqft`` entry to a float.

    Rules, applied in order:

    1. A real number (int/float, including numpy scalars, but not bool) is
       returned as ``float``.
    2. Text containing ``-`` is treated as a range and the mean of its first
       two parts is returned.
    3. Text mentioning 'built-up', 'super', 'carpet' or 'area' yields NaN.
    4. A plain numeric string is parsed directly.
    5. Otherwise the first number found in the text is returned; any unit
       suffix is ignored, not converted.

    Returns ``np.nan`` for anything that cannot be parsed (including
    non-string, non-numeric input such as ``None``).
    """
    if isinstance(sqft, (int, float, np.integer, np.floating)) and not isinstance(sqft, bool):
        return float(sqft)
    if not isinstance(sqft, str):
        return np.nan
    try:
        if '-' in sqft:
            low, high = sqft.split('-')[:2]
            return (float(low) + float(high)) / 2
        if any(keyword in sqft.lower() for keyword in ['built-up', 'super', 'carpet', 'area']):
            return np.nan
        if sqft.replace('.', '', 1).isdigit():
            return float(sqft)
        sqft_numeric = re.findall(r'\d+\.?\d*', sqft)
        if sqft_numeric:
            return float(sqft_numeric[0])
        return np.nan
    except ValueError:
        # Only parse failures are expected here; a bare ``except`` would also
        # hide KeyboardInterrupt and genuine programming errors.
        return np.nan

# Apply the function to total_sqft (unparseable entries become NaN)
train['total_sqft'] = train['total_sqft'].astype(str).apply(convert_sqft_to_num)
test['total_sqft'] = test['total_sqft'].astype(str).apply(convert_sqft_to_num)

# Handle missing values in total_sqft with the training median for both
# frames. The result is assigned back because fillna(inplace=True) on a column
# selection is deprecated and does not modify the frame under pandas
# Copy-on-Write.
sqft_median = train['total_sqft'].median()
train['total_sqft'] = train['total_sqft'].fillna(sqft_median)
test['total_sqft'] = test['total_sqft'].fillna(sqft_median)

# Feature engineering
# 'price_per_sqft' and 'rent_to_price_ratio' are intentionally not created:
# both were computed from 'price', i.e. they leak the target into the inputs,
# and the test-side "placeholders" stored unrelated quantities under the same
# column names.
train['num_bedrooms'] = train['size'].str.extract(r'(\d+)', expand=False).astype(float)
test['num_bedrooms'] = test['size'].str.extract(r'(\d+)', expand=False).astype(float)
# Test rows cannot be dropped, so a missing bedroom count is imputed instead.
test['num_bedrooms'] = test['num_bedrooms'].fillna(train['num_bedrooms'].median())

# Log transformation of the target variable (dropped from the features and
# used as the target by the split below).
train['price_log'] = np.log1p(train['price'])

# Remove training rows that still have a missing value in any column
train = train.dropna()

# Features are all columns except the raw target, its log transform and the
# identifier; the model is trained on the log-transformed price.
non_feature_columns = ['price', 'price_log', 'ID']
X = train.drop(columns=non_feature_columns)
y = train['price_log']

# Object-dtype columns go through one-hot encoding, the rest through scaling.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numeric_columns = X.select_dtypes(exclude=['object']).columns

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_columns),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
])

# Grid of random-forest settings to search exhaustively (3-fold CV, scored by
# negative RMSE on the log scale).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    rf_model,
    param_grid,
    cv=3,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)

# Preprocessing followed by the grid search as the final estimator.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', grid_search)])

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

# Fit the preprocessor and run the search on the training portion.
pipeline.fit(X_train, y_train)

# Forest refitted by the grid search with the best parameter combination.
best_model = grid_search.best_estimator_

# Validation metrics, reported on the original price scale: expm1 reverses the
# log1p transformation applied to the target.
y_val_pred = pipeline.predict(X_val)
val_price_true = np.expm1(y_val)
val_price_pred = np.expm1(y_val_pred)

rmse = np.sqrt(mean_squared_error(val_price_true, val_price_pred))
mae = mean_absolute_error(val_price_true, val_price_pred)
r2 = r2_score(val_price_true, val_price_pred)

print(f"Validation RMSE: {rmse}")
print(f"Validation MAE: {mae}")
print(f"Validation R²: {r2}")

# Predict on test set
# test_processed = test.drop(columns=['ID'])
# test_pred_log = pipeline.predict(test_processed)
# test['predicted_price'] = np.expm1(test_pred_log)  # Reverse log transformation

# # Save predictions
# test[['ID', 'predicted_price']].to_csv('submission.csv', index=False)
# print("Predictions have been saved to 'submission.csv'")


KeyboardInterrupt: 

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import re

# Load data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
avg_rent = pd.read_csv('./data/avg_rent.csv')
dist_city = pd.read_csv('./data/dist_from_city_centre.csv')

# Merge external data. Left joins keep every listing; a listing whose
# location is absent from a lookup table gets NaN in the joined columns.
train = train.merge(avg_rent, on='location', how='left').merge(dist_city, on='location', how='left')
test = test.merge(avg_rent, on='location', how='left').merge(dist_city, on='location', how='left')

# Handle missing values.
# One set of fill values, derived from the training frame, is applied to both
# frames so the test set is not imputed with statistics of its own.
fill_values = {
    'bath': train['bath'].median(),
    'balcony': 0,
    'avg_2bhk_rent': 0,
    'dist_from_city': train['dist_from_city'].median(),
}
train = train.fillna(fill_values)
test = test.fillna(fill_values)

# Function to convert 'total_sqft' to a numeric value
def convert_sqft_to_num(sqft):
    """Convert a raw ``total_sqft`` entry to a float.

    * A real number (int/float, including numpy scalars, but not bool) is
      returned as ``float``.
    * Text containing ``-`` is treated as a range; the mean of its first two
      parts is returned, or NaN when either part is not a number.
    * A plain numeric string is parsed directly.
    * Otherwise the first number found in the text is returned (a unit suffix
      is ignored, not converted), or NaN when the text contains no number.

    Non-string, non-numeric input such as ``None`` yields NaN.
    """
    if isinstance(sqft, (int, float, np.integer, np.floating)) and not isinstance(sqft, bool):
        return float(sqft)
    if not isinstance(sqft, str):
        return np.nan
    if '-' in sqft:
        low, high = sqft.split('-')[:2]
        try:
            return (float(low) + float(high)) / 2
        except ValueError:
            # A dash that is not a numeric range (hyphenated words, a lone
            # '-') must not abort the whole column conversion.
            return np.nan
    try:
        if sqft.replace('.', '', 1).isdigit():
            return float(sqft)
        sqft_numeric = re.findall(r'\d+\.?\d*', sqft)
        return float(sqft_numeric[0]) if sqft_numeric else np.nan
    except ValueError:
        # str.isdigit() accepts some characters float() rejects (e.g. '²').
        return np.nan

# Parse the free-text 'total_sqft' column into floats.
train['total_sqft'] = train['total_sqft'].astype(str).apply(convert_sqft_to_num)
test['total_sqft'] = test['total_sqft'].astype(str).apply(convert_sqft_to_num)

# Impute both frames with the training median. The result is assigned back
# because fillna(inplace=True) on a column selection is deprecated and does
# not modify the frame under pandas Copy-on-Write.
sqft_median = train['total_sqft'].median()
train['total_sqft'] = train['total_sqft'].fillna(sqft_median)
test['total_sqft'] = test['total_sqft'].fillna(sqft_median)

# Feature engineering
# 'price_per_sqft' and 'rent_to_price_ratio' are intentionally not created:
# both were computed from 'price' (target leakage) and existed only on the
# training frame, so predicting on the test frame failed on missing columns.
train['num_bedrooms'] = train['size'].str.extract(r'(\d+)', expand=False).astype(float)
test['num_bedrooms'] = test['size'].str.extract(r'(\d+)', expand=False).astype(float)
# Test rows cannot be dropped, so a missing bedroom count is imputed instead.
test['num_bedrooms'] = test['num_bedrooms'].fillna(train['num_bedrooms'].median())

# Drop training rows that still have a missing value in any column
train = train.dropna()

# Target is the raw price; every other column except the identifier is a
# feature.
y = train['price']
X = train.drop(columns=['price', 'ID'])

# Object-dtype columns are one-hot encoded, the remaining ones standardised.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numeric_columns = X.select_dtypes(exclude=['object']).columns

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_columns),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
])

# Preprocessing and the forest live in one pipeline, so the grid search below
# refits the preprocessing inside every cross-validation fold.
forest = RandomForestRegressor(random_state=42)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', forest)])

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

# Hyperparameter grid; the 'model__' prefix routes each entry to the pipeline
# step named 'model'.
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [10, 20, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Pipeline refitted by the grid search with the best parameter combination.
best_model = grid_search.best_estimator_

# Hold-out performance of the tuned pipeline.
y_pred = best_model.predict(X_val)
validation_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(validation_mse)
r2 = r2_score(y_val, y_pred)

print(f"Optimized Validation RMSE: {rmse}")
print(f"Validation R²: {r2}")

# Predict on the test frame (identifier removed from the inputs), then write
# the identifier/prediction pair to the submission file.
test_processed = test.drop(columns=['ID'])
test_predictions = best_model.predict(test_processed)
test['predicted_price'] = test_predictions
test['ID'] = test['ID'].astype(int)
submission = test[['ID', 'predicted_price']]
submission.to_csv('submission.csv', index=False)

print("Predictions have been saved to 'submission.csv'")


Fitting 3 folds for each of 81 candidates, totalling 243 fits


KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
import re

# Load data
train = pd.read_csv('./data/train.csv')
avg_rent = pd.read_csv('./data/avg_rent.csv')
dist_city = pd.read_csv('./data/dist_from_city_centre.csv')

# Attach the per-location lookup columns; left joins keep every listing.
df = train.merge(avg_rent, on='location', how='left')
df = df.merge(dist_city, on='location', how='left')

# Fill missing values
df['bath'] = df['bath'].fillna(df['bath'].median())
df['balcony'] = df['balcony'].fillna(df['balcony'].mean())
df['area_type'] = df['area_type'].fillna(df['area_type'].mode()[0])
df['availability'] = df['availability'].fillna(df['availability'].mode()[0])

# Preprocess 'size' column: keep the first run of digits as a float.
# A raw string is used so '\d' is a regex token rather than an invalid string
# escape.
df['size'] = df['size'].str.extract(r'(\d+)', expand=False).astype(float)
# Assigned back instead of fillna(inplace=True) on a column selection, which
# is deprecated and does not modify the frame under pandas Copy-on-Write.
df['size'] = df['size'].fillna(df['size'].median())

# Handle non-numeric values in 'total_sqft'
def preprocess_total_sqft(value):
    """Convert a raw ``total_sqft`` entry to a float.

    The value is converted to text first, so strings, numbers and ``None``
    are all accepted. A value that parses as a number is returned directly;
    otherwise a ``-`` separated range returns the mean of its parts. Anything
    else (unit suffixes, free text, malformed ranges) yields ``np.nan``.
    """
    text = str(value).strip()
    try:
        # Tried first so negative numbers and exponents such as '1e-5' are
        # not mistaken for ranges.
        return float(text)
    except ValueError:
        pass
    if '-' in text:
        try:
            return float(np.mean([float(part) for part in text.split('-')]))
        except ValueError:
            # A dash that does not separate numbers is not a range.
            return np.nan
    return np.nan

# Parse 'total_sqft' and impute unparseable entries with the median.
# Results are assigned back: fillna(inplace=True) on a column selection is
# deprecated and does not modify the frame under pandas Copy-on-Write.
df['total_sqft'] = df['total_sqft'].apply(preprocess_total_sqft)
df['total_sqft'] = df['total_sqft'].fillna(df['total_sqft'].median())

# Fill missing values in 'avg_2bhk_rent' and 'dist_from_city'
df['avg_2bhk_rent'] = df['avg_2bhk_rent'].fillna(df['avg_2bhk_rent'].median())
df['dist_from_city'] = df['dist_from_city'].fillna(df['dist_from_city'].median())

# NOTE: the former 'rental_yield' feature (avg_2bhk_rent * 12 / price) is
# intentionally not created. It is computed from the target, so it leaks the
# answer into the inputs and cannot be computed for the test set.

# Drop rows with missing target values
df = df.dropna(subset=['price'])

# Split features and target
X = df.drop(columns=['price', 'society', 'availability'])
y = df['price']

# Identify numerical and categorical features (columns of X that are not
# listed here are dropped by the ColumnTransformer)
numerical_features = ['total_sqft', 'size', 'bath', 'balcony', 'avg_2bhk_rent', 'dist_from_city']
categorical_features = ['area_type', 'location']

# Preprocessing
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Model pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [10, 20, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model: GridSearchCV fits clones, so `pipeline` itself stays unfitted;
# the refitted pipeline is exposed as best_estimator_.
best_model = grid_search.best_estimator_

# Evaluate the model
y_pred = best_model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"Validation RMSE: {rmse}")
print(f"Validation MAE: {mae}")
print(f"Validation R²: {r2}")
print(f"Best Parameters: {grid_search.best_params_}")

# Prepare the test set with the same steps as the training frame. It is
# loaded here so the cell does not depend on a `test` variable left over from
# an earlier cell.
# NOTE(review): assumes test.csv has the same feature columns as train.csv
# (location, size, total_sqft, bath, balcony, area_type) plus 'ID' - confirm.
test = pd.read_csv('./data/test.csv')
test = test.merge(avg_rent, on='location', how='left')
test = test.merge(dist_city, on='location', how='left')
test['size'] = test['size'].str.extract(r'(\d+)', expand=False).astype(float)
test['total_sqft'] = test['total_sqft'].apply(preprocess_total_sqft)

# Missing numerical values are filled by the pipeline's own median imputer
# (fitted on the training data). Missing categorical values are filled with
# the most frequent training value rather than a statistic of the test set.
for col in categorical_features:
    test[col] = test[col].fillna(X_train[col].mode()[0])

# Ensure 'ID' column is dropped before prediction
test_processed = test.drop(columns=['ID'])

# Predict on test data with the fitted, tuned pipeline
test_pred = best_model.predict(test_processed)

# Save predictions with ID as integer
test['price'] = test_pred

# Convert 'ID' column to integer type before saving
test['ID'] = test['ID'].astype(int)

# Save the predictions to 'submission.csv'
test[['ID','price']].to_csv('submission.csv', index=False)

print("Predictions have been saved to 'submission.csv'")


Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END model__max_depth=10, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time=   9.5s
[CV] END model__max_depth=10, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time=   9.9s
[CV] END model__max_depth=10, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time=  10.0s
[CV] END model__max_depth=10, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time=  19.9s
[CV] END model__max_depth=10, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time=  10.0s
[CV] END model__max_depth=10, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time=  10.0s
[CV] END model__max_depth=10, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time=  20.5s
[CV] END model__max_depth=10, model