In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np



# Load the raw listings; the relative path is resolved against the notebook's working directory.
df = pd.read_csv("Real Estate Data V21.csv")

def convert_price(price_str):
    """Convert an Indian-format price string to a rupee amount.

    The currency sign, thousands separators and surrounding whitespace are
    removed, then a crore/lakh unit is turned into its multiplier. Units are
    matched case-insensitively and in their spelled-out forms as well
    ("Cr", "crore", "L", "Lac", "Lakh", ...).

    Args:
        price_str: Raw price text such as "₹1.5 Cr", "75 L" or "4,500,000".

    Returns:
        The price in rupees as a float, or None when the input is not a
        string or no number can be parsed from it.
    """
    if not isinstance(price_str, str):
        return None

    text = price_str.replace("₹", "").replace(",", "").strip().lower()

    # Longest spelling first, so that removing the bare "l" never turns
    # "lakhs" into "akhs".
    if "cr" in text:
        spellings, multiplier = ("crores", "crore", "cr"), 1e7
    elif "l" in text:
        spellings, multiplier = ("lakhs", "lakh", "lacs", "lac", "l"), 1e5
    else:
        spellings, multiplier = (), 1.0

    for spelling in spellings:
        text = text.replace(spelling, "")

    try:
        return float(text.strip()) * multiplier
    except ValueError:
        # Only a failed numeric parse is expected here; anything else should surface.
        return None

# Parse the raw price strings into rupee amounts; unparseable entries become missing.
df["Price_num"] = df["Price"].apply(convert_price)

# Keep only rows where both features and the parsed price are present.
df_clean = df.dropna(subset=["Total_Area", "Baths", "Price_num"])

# Features and target: two numeric predictors, price in rupees as the response.
X = df_clean[["Total_Area", "Baths"]]
y = df_clean["Price_num"]

# Train-test split: 80/20 hold-out with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build Linear Regression model (fitted on the training part only).
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions on the held-out rows.
y_pred = model.predict(X_test)


# Evaluation metrics, all computed on the held-out rows in the target's own units.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # square root of MSE
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
print(f"MAE  : {mae:.2f}")
print(f"RMSE : {rmse:.2f}")
print(f"R²   : {r2:.4f}")


Model Performance:
MAE  : 5992917.06
RMSE : 14339325.89
R²   : 0.2788


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Re-read the raw dataset so `df` starts without the columns added by earlier cells.
df = pd.read_csv("Real Estate Data V21.csv")

def convert_price(price_str):
    """Convert an Indian-format price string to a rupee amount.

    The currency sign, thousands separators and surrounding whitespace are
    removed, then a crore/lakh unit is turned into its multiplier. Units are
    matched case-insensitively and in their spelled-out forms as well
    ("Cr", "crore", "L", "Lac", "Lakh", ...).

    Args:
        price_str: Raw price text such as "₹1.5 Cr", "75 L" or "4,500,000".

    Returns:
        The price in rupees as a float, or None when the input is not a
        string or no number can be parsed from it.
    """
    if not isinstance(price_str, str):
        return None

    text = price_str.replace("₹", "").replace(",", "").strip().lower()

    # Longest spelling first, so that removing the bare "l" never turns
    # "lakhs" into "akhs".
    if "cr" in text:
        spellings, multiplier = ("crores", "crore", "cr"), 1e7
    elif "l" in text:
        spellings, multiplier = ("lakhs", "lakh", "lacs", "lac", "l"), 1e5
    else:
        spellings, multiplier = (), 1.0

    for spelling in spellings:
        text = text.replace(spelling, "")

    try:
        return float(text.strip()) * multiplier
    except ValueError:
        # Only a failed numeric parse is expected here; anything else should surface.
        return None

# Parse the raw price strings into rupee amounts; unparseable entries become missing.
df["Price_num"] = df["Price"].apply(convert_price)

# Extract BHK from Property Title (the digits directly preceding "BHK").
df["BHK"] = df["Property Title"].str.extract(r'(\d+)\s*BHK').astype(float)

# Encode Balcony; any value other than "Yes"/"No" maps to NaN and is dropped below.
df["Balcony_num"] = df["Balcony"].map({"Yes": 1, "No": 0})

# Drop rows with missing essential values.
# .copy() gives df_clean its own data, so the column assignments below write to
# a real DataFrame instead of a slice of `df` (this is what raised
# SettingWithCopyWarning before).
df_clean = df.dropna(
    subset=["Total_Area", "Baths", "Price_num", "BHK", "Price_per_SQFT", "Balcony_num"]
).copy()

# NOTE(review): Price_per_BHK is computed from Price_num, which is also the
# target `y` below, so the model is given a transformation of the value it is
# asked to predict. Price_per_SQFT presumably has the same problem - confirm
# how that column was produced before trusting the reported metrics.
df_clean["Price_per_BHK"] = df_clean["Price_num"] / df_clean["BHK"]
df_clean["Area_per_BHK"] = df_clean["Total_Area"] / df_clean["BHK"]

X = df_clean[["Total_Area", "Baths", "BHK", "Price_per_SQFT", "Balcony_num", "Price_per_BHK", "Area_per_BHK"]]
y = df_clean["Price_num"]

# Train-test split: 80/20 hold-out with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression, fitted on the training part only.
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)
y_pred_lin = lin_model.predict(X_test)


# Evaluation function
def evaluate_model(name, y_true, y_pred):
    """Print MAE, RMSE and R² for one set of predictions.

    Args:
        name: Label printed in the report header.
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        None. The report is written to standard output.
    """
    abs_error = mean_absolute_error(y_true, y_pred)
    root_sq_error = np.sqrt(mean_squared_error(y_true, y_pred))
    fit_quality = r2_score(y_true, y_pred)

    report_lines = (
        f"{name} Performance:",
        f"  MAE  : {abs_error:,.2f} INR",
        f"  RMSE : {root_sq_error:,.2f} INR",
        f"  R²   : {fit_quality:.4f}\n",
    )
    for report_line in report_lines:
        print(report_line)

# Report held-out metrics for the linear model (the only model fitted in this cell).
evaluate_model("Linear Regression", y_test, y_pred_lin)


Linear Regression Performance:
  MAE  : 3,209,505.93 INR
  RMSE : 14,437,006.07 INR
  R²   : 0.4959



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["Price_per_BHK"] = df_clean["Price_num"] / df_clean["BHK"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["Area_per_BHK"] = df_clean["Total_Area"] / df_clean["BHK"]


# Using Random Forest

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor  # Using a more powerful model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- 1. Load the dataset ---
try:
    df = pd.read_csv("Real Estate Data V21.csv")
except FileNotFoundError as err:
    # Stop the cell with a real exception instead of print + exit(): exit() is
    # an interactive-shell helper and, inside a Jupyter kernel, can shut the
    # kernel down rather than just halting this cell.
    raise FileNotFoundError(
        "Error: The file 'Real Estate Data V21.csv' was not found."
    ) from err

# --- 2. Price Standardization to Lakhs ---
def standardize_price_to_lakhs(price_str):
    """Convert a raw price string to a number of lakhs.

    After stripping the currency sign, commas and whitespace and lower-casing,
    text containing "cr" is read as crores (first number x 100), text
    containing "l" as lakhs (first number as-is), and anything else as plain
    rupees (value / 100000).

    Args:
        price_str: Raw price text, e.g. "₹1.5 Cr", "75 L" or "4500000".

    Returns:
        The price in lakhs as a float, or NaN for non-string input or text
        from which no number can be parsed.
    """
    if not isinstance(price_str, str):
        return np.nan

    normalized = price_str.replace("₹", "").replace(",", "").strip().lower()
    in_crores = "cr" in normalized
    in_lakhs = "l" in normalized

    try:
        if not (in_crores or in_lakhs):
            return float(normalized) / 100000
        first_number = float(re.findall(r'[\d.]+', normalized)[0])
    except (ValueError, IndexError):
        return np.nan

    return first_number * 100 if in_crores else first_number

# Prices that cannot be parsed come back as NaN; those rows are removed by the dropna in the cleaning step.
df['Price_lakhs'] = df['Price'].apply(standardize_price_to_lakhs)

# --- 3. Location and City Extraction ---
def extract_city(location_str):
    """Return the text after the last comma of a location string.

    Args:
        location_str: Location text such as "Baner, Pune".

    Returns:
        The stripped segment following the final comma, or 'Unknown' when the
        value is not a string or contains no comma.
    """
    if not isinstance(location_str, str):
        return 'Unknown'
    _, comma, last_segment = location_str.rpartition(',')
    return last_segment.strip() if comma else 'Unknown'

# Derive the city per row, or use one constant label when the dataset has no 'Location' column.
df['City'] = df['Location'].apply(extract_city) if 'Location' in df.columns else 'Unknown'

# --- 4. BHK Extraction ---
def clean_bhk(bhk_str):
    """Extract the bedroom count from a BHK value.

    Args:
        bhk_str: Text such as "3 BHK" or "2.5 BHK", or an already-numeric
            count (what the column holds when it is read with a numeric dtype).

    Returns:
        The count as a float: for text, the first number found; for numeric
        input, the value itself. NaN when the value is missing, is neither
        text nor a number, or is text without digits.
    """
    if isinstance(bhk_str, str):
        match = re.search(r'(\d+\.?\d*)', bhk_str)
        return float(match.group(1)) if match else np.nan

    # bool is a subclass of int; a True/False flag is not a bedroom count.
    is_number = isinstance(bhk_str, (int, float, np.integer, np.floating))
    if is_number and not isinstance(bhk_str, bool):
        return float(bhk_str)  # NaN input stays NaN

    return np.nan

# NOTE(review): this only yields values when the CSV has a 'BHK' column; an
# earlier cell derives BHK from 'Property Title' instead - confirm which
# applies. Without the column BHK_num is all-NaN and the feature is skipped below.
if 'BHK' in df.columns:
    df['BHK_num'] = df['BHK'].apply(clean_bhk)
else:
    df['BHK_num'] = np.nan

# --- 5. Data Cleaning ---
# Drop rows missing any of the two base features or the parsed price.
# BHK_num is not part of this subset, so rows with a missing BHK are kept.
df_clean = df.dropna(subset=["Total_Area", "Baths", "Price_lakhs"]).copy() # .copy() so the assignments below write to an independent frame

# Fail fast rather than fitting on nothing.
if df_clean.empty:
    raise ValueError("DataFrame is empty after cleaning. Check the input data.")

# --- 6. Feature Engineering ---
# Price per square foot, in rupees (lakhs * 100000 / area).
# WARNING: this column is computed from Price_lakhs, i.e. from the target
# itself. Together with Total_Area it determines the price exactly, so using it
# as a feature leaks the target and inflates every metric reported below.
df_clean['Price_per_sqft'] = df_clean['Price_lakhs'] * 100000 / df_clean['Total_Area']

# --- 7. Log Transformations ---
# Log-transform features and target; log1p is used for the features,
# plain log for the price.
df_clean['Total_Area_log'] = np.log1p(df_clean['Total_Area'])
df_clean['Baths_log'] = np.log1p(df_clean['Baths'])
df_clean['Price_per_sqft_log'] = np.log1p(df_clean['Price_per_sqft']) # log of the target-derived feature (see warning above)
df_clean['Price_lakhs_log'] = np.log(df_clean['Price_lakhs'])

numerical_features = ['Total_Area_log', 'Baths_log', 'Price_per_sqft_log']
categorical_features = []

# Only use BHK when at least one row has a value for it.
if 'BHK_num' in df_clean.columns and not df_clean['BHK_num'].isnull().all():
    df_clean['BHK_num_log'] = np.log1p(df_clean['BHK_num'])
    numerical_features.append('BHK_num_log')
if 'City' in df_clean.columns:
    categorical_features.append('City')

# --- 8. Target Encoding & Train-Test Split ---
# X still holds the raw 'City' strings here; they are encoded after the split.
X = df_clean[numerical_features + categorical_features]
y = df_clean['Price_lakhs_log']

# 80/20 hold-out with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Perform target encoding after splitting to prevent data leakage:
# the per-city means are learned from the training rows only.
if 'City' in categorical_features:
    city_mean_encoding = y_train.groupby(X_train['City']).mean()
    # Cities that appear only in the test set fall back to the overall training mean.
    unseen_city_fill = y_train.mean()

    # assign() returns new frames, so nothing is written into the objects
    # returned by train_test_split and no chained `inplace=True` fill is
    # needed (that form warns today and is a no-op from pandas 3.0).
    X_train = X_train.assign(City_encoded=X_train['City'].map(city_mean_encoding))
    X_test = X_test.assign(
        City_encoded=X_test['City'].map(city_mean_encoding).fillna(unseen_city_fill)
    )

    # Keep only numeric model inputs; this drops the raw 'City' strings.
    numerical_features.append('City_encoded')
    X_train = X_train[numerical_features]
    X_test = X_test[numerical_features]

# --- 9. Standard Scaling ---
# Scaling statistics are learned from the training rows and reused on the test rows.
# NOTE(review): tree ensembles split on thresholds, so this step presumably
# does not change the forest's predictions - confirm before relying on it.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- 10. Build, Train, and Evaluate the Improved Model ---
# 100 trees, fixed seed for repeatability, all CPU cores (n_jobs=-1).
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train_scaled, y_train)

# Predictions are on the log scale, because the target is log(Price_lakhs).
y_pred_log = model.predict(X_test_scaled)

# Undo the log with exp so both series are prices in lakhs again.
y_pred = np.exp(y_pred_log)
y_test_original = np.exp(y_test)

# --- 11. Final Performance Metrics ---
# Computed on the original (lakhs) scale, not the log scale the model was fitted on.
mae = mean_absolute_error(y_test_original, y_pred)
rmse = np.sqrt(mean_squared_error(y_test_original, y_pred))
r2 = r2_score(y_test_original, y_pred)

print("--- Improved Model Performance (Random Forest) ---")
print(f"Mean Absolute Error (MAE)  : {mae:,.2f} Lakhs")
print(f"Root Mean Squared Error (RMSE) : {rmse:,.2f} Lakhs")
print(f"R-squared (R²)               : {r2:.4f}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test['City_encoded'].fillna(y_train.mean(), inplace=True)


--- Improved Model Performance (Random Forest) ---
Mean Absolute Error (MAE)  : 1.94 Lakhs
Root Mean Squared Error (RMSE) : 17.93 Lakhs
R-squared (R²)               : 0.9866


# Using Linear Regression

In [None]:
import re

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

# --- 1. Load the dataset ---
try:
    df = pd.read_csv("Real Estate Data V21.csv")
except FileNotFoundError as err:
    # Stop the cell with a real exception instead of print + exit(): exit() is
    # an interactive-shell helper and, inside a Jupyter kernel, can shut the
    # kernel down rather than just halting this cell.
    raise FileNotFoundError(
        "Error: The file 'Real Estate Data V21.csv' was not found."
    ) from err

# --- 2. Price Standardization to Lakhs ---
def standardize_price_to_lakhs(price_str):
    """Convert a raw price string to a number of lakhs.

    After stripping the currency sign, commas and whitespace and lower-casing,
    text containing "cr" is read as crores (first number x 100), text
    containing "l" as lakhs (first number as-is), and anything else as plain
    rupees (value / 100000).

    Args:
        price_str: Raw price text, e.g. "₹1.5 Cr", "75 L" or "4500000".

    Returns:
        The price in lakhs as a float, or NaN for non-string input or text
        from which no number can be parsed.
    """
    if not isinstance(price_str, str):
        return np.nan

    normalized = price_str.replace("₹", "").replace(",", "").strip().lower()
    in_crores = "cr" in normalized
    in_lakhs = "l" in normalized

    try:
        if not (in_crores or in_lakhs):
            return float(normalized) / 100000
        first_number = float(re.findall(r'[\d.]+', normalized)[0])
    except (ValueError, IndexError):
        return np.nan

    return first_number * 100 if in_crores else first_number

# Prices that cannot be parsed come back as NaN; those rows are removed by the dropna in the cleaning step.
df['Price_lakhs'] = df['Price'].apply(standardize_price_to_lakhs)

# --- 3. Location and City Extraction ---
def extract_city(location_str):
    """Return the text after the last comma of a location string.

    Args:
        location_str: Location text such as "Baner, Pune".

    Returns:
        The stripped segment following the final comma, or 'Unknown' when the
        value is not a string or contains no comma.
    """
    if not isinstance(location_str, str):
        return 'Unknown'
    _, comma, last_segment = location_str.rpartition(',')
    return last_segment.strip() if comma else 'Unknown'

# Derive the city per row, or use one constant label when the dataset has no 'Location' column.
df['City'] = df['Location'].apply(extract_city) if 'Location' in df.columns else 'Unknown'

# --- 4. BHK Extraction ---
def clean_bhk(bhk_str):
    """Extract the bedroom count from a BHK value.

    Args:
        bhk_str: Text such as "3 BHK" or "2.5 BHK", or an already-numeric
            count (what the column holds when it is read with a numeric dtype).

    Returns:
        The count as a float: for text, the first number found; for numeric
        input, the value itself. NaN when the value is missing, is neither
        text nor a number, or is text without digits.
    """
    if isinstance(bhk_str, str):
        match = re.search(r'(\d+\.?\d*)', bhk_str)
        return float(match.group(1)) if match else np.nan

    # bool is a subclass of int; a True/False flag is not a bedroom count.
    is_number = isinstance(bhk_str, (int, float, np.integer, np.floating))
    if is_number and not isinstance(bhk_str, bool):
        return float(bhk_str)  # NaN input stays NaN

    return np.nan

# NOTE(review): this only yields values when the CSV has a 'BHK' column; an
# earlier cell derives BHK from 'Property Title' instead - confirm which
# applies. Without the column BHK_num is all-NaN and the feature is skipped below.
if 'BHK' in df.columns:
    df['BHK_num'] = df['BHK'].apply(clean_bhk)
else:
    df['BHK_num'] = np.nan

# --- 5. Data Cleaning ---
# Drop rows missing any of the two base features or the parsed price; .copy()
# makes df_clean an independent frame for the column assignments below.
df_clean = df.dropna(subset=["Total_Area", "Baths", "Price_lakhs"]).copy()

# Fail fast rather than fitting on nothing.
if df_clean.empty:
    raise ValueError("DataFrame is empty after cleaning. Check the input data.")

# --- 6. Feature Engineering ---
# WARNING: This feature causes data leakage: it is computed from Price_lakhs,
# the same column the target is built from, and together with Total_Area it
# determines the price exactly. The R² scores below are therefore overly
# optimistic and say little about real predictive power.
df_clean['Price_per_sqft'] = df_clean['Price_lakhs'] * 100000 / df_clean['Total_Area']

# --- 7. Log Transformations ---
# log1p for the features, plain log for the price (the model target).
df_clean['Total_Area_log'] = np.log1p(df_clean['Total_Area'])
df_clean['Baths_log'] = np.log1p(df_clean['Baths'])
df_clean['Price_per_sqft_log'] = np.log1p(df_clean['Price_per_sqft'])
df_clean['Price_lakhs_log'] = np.log(df_clean['Price_lakhs'])

numerical_features = ['Total_Area_log', 'Baths_log', 'Price_per_sqft_log']
categorical_features = []

# Only use BHK when at least one row has a value for it.
if 'BHK_num' in df_clean.columns and not df_clean['BHK_num'].isnull().all():
    df_clean['BHK_num_log'] = np.log1p(df_clean['BHK_num'])
    numerical_features.append('BHK_num_log')
if 'City' in df_clean.columns:
    categorical_features.append('City')

# --- 8. Target Encoding & Train-Test Split ---
# X still holds the raw 'City' strings here; they are encoded after the split.
X = df_clean[numerical_features + categorical_features]
y = df_clean['Price_lakhs_log']

# 80/20 hold-out with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model inputs start from the numeric columns; the encoded city is added below when available.
final_numerical_features = list(numerical_features)
if 'City' in categorical_features:
    # Per-city mean of the (log) target, learned from the training rows only.
    fill_value = y_train.mean()
    city_mean_encoding = y_train.groupby(X_train['City']).mean()

    X_train['City_encoded'] = X_train['City'].map(city_mean_encoding)
    # Cities unseen during training get the overall training mean.
    X_test['City_encoded'] = X_test['City'].map(city_mean_encoding).fillna(fill_value)

    # Select the numeric inputs only, which drops the raw 'City' strings.
    final_numerical_features.append('City_encoded')
    X_train, X_test = X_train[final_numerical_features], X_test[final_numerical_features]

# --- 9. Standard Scaling ---
# Scaling statistics are learned from the training rows and reused on the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- 10. Build and Train Linear Regression Model ---
# Fitted on scaled features against the log-price target.
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# --- 11. Evaluation on Single Test Split ---
# Predictions come out on the log scale; exp() turns both series back into lakhs.
y_pred_log = model.predict(X_test_scaled)
y_pred = np.exp(y_pred_log)
y_test_original = np.exp(y_test)

# Metrics are computed on the original (lakhs) scale.
r2 = r2_score(y_test_original, y_pred)
mae = mean_absolute_error(y_test_original, y_pred)

print("--- Linear Regression Model Performance (on single test split) ---")
print(f"R-squared (R²) : {r2:.4f}")
print(f"MAE            : {mae:,.2f} Lakhs")

# --- 12. Model Coefficients ---
# Coefficients refer to the standardized features, in the column order of
# final_numerical_features; the table is sorted from largest to smallest.
coefficients = model.coef_
coeff_df = pd.DataFrame({
    'Feature': final_numerical_features,
    'Coefficient': coefficients
}).sort_values(by='Coefficient', ascending=False)

print("\n--- Model Coefficients ---")
print(coeff_df)

# --- 13. Cross-Validation with per-fold preprocessing ---
# The city target encoding and the feature scaling are both learned from data,
# so they are re-fitted on the training part of every fold. Fitting them once
# on the full dataset would let each validation fold's own prices and
# statistics leak into its features. A fresh scaler is created per fold, which
# also leaves the `scaler` fitted for `model` above untouched.
X_for_cv = X.copy()
y_full = y

# Unshuffled 5-fold split: what cross_val_score(..., cv=5) uses for a regressor.
cv_splitter = KFold(n_splits=5)
fold_scores = []
for train_idx, val_idx in cv_splitter.split(X_for_cv):
    X_fold_train = X_for_cv.iloc[train_idx].copy()
    X_fold_val = X_for_cv.iloc[val_idx].copy()
    y_fold_train = y_full.iloc[train_idx]
    y_fold_val = y_full.iloc[val_idx]

    if 'City' in categorical_features:
        # Per-city mean of the (log) target, from this fold's training rows only.
        fold_city_encoding = y_fold_train.groupby(X_fold_train['City']).mean()
        X_fold_train['City_encoded'] = X_fold_train['City'].map(fold_city_encoding)
        # Cities absent from the fold's training rows fall back to its overall mean.
        X_fold_val['City_encoded'] = (
            X_fold_val['City'].map(fold_city_encoding).fillna(y_fold_train.mean())
        )
        # Drop the original 'City' column so only numeric inputs remain.
        X_fold_train = X_fold_train.drop(columns=['City'])
        X_fold_val = X_fold_val.drop(columns=['City'])

    fold_scaler = StandardScaler()
    cv_model = LinearRegression()
    cv_model.fit(fold_scaler.fit_transform(X_fold_train), y_fold_train)
    fold_predictions = cv_model.predict(fold_scaler.transform(X_fold_val))
    # R² on the log-price scale, matching scoring='r2' on the log target.
    fold_scores.append(r2_score(y_fold_val, fold_predictions))

cv_scores = np.array(fold_scores)

print("\n--- Robust Performance (5-Fold Cross-Validation) ---")
print(f"Scores for each fold : {cv_scores}")
print(f"Average R² Score     : {np.mean(cv_scores):.4f}")
print(f"Std Dev of Scores    : {np.std(cv_scores):.4f}")

--- Linear Regression Model Performance (on single test split) ---
R-squared (R²) : 0.9956
MAE            : 2.80 Lakhs

--- Model Coefficients ---
              Feature  Coefficient
2  Price_per_sqft_log     0.702126
0      Total_Area_log     0.658754
1           Baths_log     0.000465
3        City_encoded    -0.007748

--- Robust Performance (5-Fold Cross-Validation) ---
Scores for each fold : [0.99935385 0.97632018 0.96483247 0.99941352 0.99906866]
Average R² Score     : 0.9878
Std Dev of Scores    : 0.0145


In [None]:
# Actual vs. predicted prices on the held-out test set.
# Uses `y_test_original` and `y_pred` left in the kernel by the most recently
# run modelling cell.

# One row per test listing; the difference is actual minus predicted.
comparison_df = pd.DataFrame(
    {'Actual_Price_Lakhs': y_test_original, 'Predicted_Price_Lakhs': y_pred}
).assign(
    Difference_Lakhs=lambda frame: frame['Actual_Price_Lakhs'] - frame['Predicted_Price_Lakhs']
)

# Comma-grouped, two-decimal floats for every DataFrame printed from here on.
pd.set_option('display.float_format', '{:,.2f}'.format)

# Show the first 10 rows of the comparison.
print("\n--- Comparison of Actual vs. Predicted Prices (Test Set) ---")
print(comparison_df.iloc[:10])


--- Comparison of Actual vs. Predicted Prices (Test Set) ---
       Actual_Price_Lakhs  Predicted_Price_Lakhs  Difference_Lakhs
3379                80.00                  77.29              2.71
1984                45.00                  44.04              0.96
14512               30.00                  29.57              0.43
12165               25.00                  25.09             -0.09
608                 70.00                  69.86              0.14
1953                45.00                  44.16              0.84
14247               75.00                  73.64              1.36
5496               119.00                 120.92             -1.92
1010                95.00                  94.21              0.79
7719                18.00                  17.85              0.15
