<a href="https://colab.research.google.com/github/BenikkaRamakrishnan/house-price_prediction/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# End-to-end script: load the house-price CSV, preprocess the features,
# train a RandomForestRegressor, evaluate it on a hold-out split, and
# predict the price of one hand-written sample house.

# Load dataset
house_data = pd.read_csv('/content/House Price Prediction Dataset.csv')
print(house_data.head())
print(house_data.shape)
# DataFrame.info() prints its summary itself and returns None, so it must
# not be wrapped in print() (that would emit a stray "None" line).
house_data.info()

# Scaler shared by all numerical columns (fitted once, reused for the sample)
sc = StandardScaler()

# Separate features and target
x = house_data.drop("Price", axis=1)
y = house_data["Price"]

# Define numerical and categorical columns
numerical_features = ['Area', 'Bedrooms', 'Bathrooms', 'Floors']
categorical_features = ['Location', 'Condition', 'Garage']

# Apply StandardScaler to numerical features.
# The original index is kept so the later column-wise concat aligns rows by
# label even if the source frame does not carry a default RangeIndex.
x_numerical = pd.DataFrame(
    sc.fit_transform(x[numerical_features]),
    columns=numerical_features,
    index=x.index,
)

# Apply a *separate* LabelEncoder to every categorical feature.
# A LabelEncoder only remembers the classes of its most recent fit, so one
# shared instance would end up knowing only the last column's categories and
# could not be reused to encode the other columns at prediction time.
label_encoders = {}
x_categorical = x[categorical_features].copy()
for feature in categorical_features:
    le = LabelEncoder()
    x_categorical[feature] = le.fit_transform(x_categorical[feature])
    label_encoders[feature] = le
# NOTE: after the loop `le` is still bound to the encoder of the last
# categorical feature; use `label_encoders[feature]` for per-column access.

# Combine processed features
x_processed = pd.concat([x_numerical, x_categorical], axis=1)

# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    x_processed, y, test_size=0.2, random_state=42
)

# Handle NaN values in y_train (target variable):
# re-join features and target so that rows are dropped from both together.
train_data = pd.concat([X_train, y_train], axis=1)
train_data.dropna(subset=['Price'], inplace=True)
X_train = train_data.drop('Price', axis=1)
y_train = train_data['Price']

# Handle NaN values in y_test (target variable) the same way.
test_data = pd.concat([X_test, y_test], axis=1)
test_data.dropna(subset=['Price'], inplace=True)
X_test = test_data.drop('Price', axis=1)
y_test = test_data['Price']


# Initialize the RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2_score = model.score(X_test, y_test)  # regressor .score() is R^2

# Output the evaluation metrics
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2_score}")

# Visualizing the predictions vs actual values
plt.figure(figsize=(10,6))
plt.scatter(y_test, y_pred)
# Dashed diagonal marks where predicted == actual.
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Actual vs Predicted House Prices')
plt.show()

# If you want to make a new prediction (example)
sample_data = pd.DataFrame({
    'Area': [2500],
    'Bedrooms': [4],
    'Bathrooms': [3],
    'Floors': [2],             # Numeric, so no LabelEncoder is applied
    'Location': ['Suburban'],  # May be unseen in training; handled below
    'Condition': ['Good'],
    'Garage': ['Yes']
})

# Apply the same (already fitted) scaler as used for the training data
sample_data_numerical = pd.DataFrame(
    sc.transform(sample_data[numerical_features]),
    columns=numerical_features,
    index=sample_data.index,
)

# Encode each categorical feature with the encoder fitted on that feature,
# falling back to a default class for labels never seen during fitting.
sample_data_categorical = sample_data[categorical_features].copy()
for feature in categorical_features:
    encoder = label_encoders[feature]
    try:
        sample_data_categorical[feature] = encoder.transform(sample_data_categorical[feature])
    except ValueError:
        # Unseen category: map it to the last known class of this feature.
        # A scalar is assigned so it broadcasts to every sample row.
        fallback_label = encoder.classes_[-1]
        print(f"Unseen value for '{feature}'; falling back to '{fallback_label}'")
        sample_data_categorical[feature] = encoder.transform([fallback_label])[0]

# Concatenate processed features (same column order as the training frame)
sample_data_processed = pd.concat([sample_data_numerical, sample_data_categorical], axis=1)

# Make a prediction for the sample data
predicted_price = model.predict(sample_data_processed)
print(f"Predicted price for the sample house: {predicted_price[0]}")