In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pickle

# Read the raw housing records into a DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.
csv_path = "House Price India.csv"
HouseDF = pd.read_csv(csv_path)


In [None]:
# Count the null entries in every column and report them.
# (This is computed before de-duplication, so duplicated rows are included.)
missing_per_column = HouseDF.isnull().sum()
print("Missing Values:\n", missing_per_column)

# Discard rows that are exact copies of an earlier row.
HouseDF = HouseDF.drop_duplicates()

# Preview the first five rows as the cell's rich output.
HouseDF.head(5)

In [None]:
# Visualizations
# Pairwise scatter/histogram grid over the DataFrame's columns.
sns.pairplot(HouseDF)
plt.show()

# Correlation heatmap
# Restrict to numeric columns before computing correlations: on pandas >= 2.0
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# instead. select_dtypes works on every pandas version, unlike the
# numeric_only= keyword, which older releases do not accept.
numeric_columns = HouseDF.select_dtypes(include="number")
sns.heatmap(numeric_columns.corr(), annot=True, cmap="coolwarm")
plt.show()

In [None]:
# Separate the regression target from the predictors:
# `y` is the 'price' column, `x` is every remaining column.
y = HouseDF['price']
x = HouseDF.drop(columns='price')

# Hold out 40% of the rows for evaluation; the fixed random_state keeps the
# shuffle (and therefore the split) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=101,
)


In [None]:
# Fit a linear regression on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
lm = LinearRegression().fit(x_train, y_train)

# Predict the target for the held-out rows.
prediction = lm.predict(x_test)

# Score the predictions against the held-out targets.
r2 = r2_score(y_test, prediction)
mse = mean_squared_error(y_test, prediction)
print("R² Score:", r2)
print("Mean Squared Error:", mse)

In [None]:
# Scatter the held-out targets (x-axis) against the model's predictions
# (y-axis), drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.scatter(y_test, prediction)
ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.set_title("Actual vs Predicted Price")
plt.show()

In [None]:
# Persist the fitted estimator to disk.
# The file is opened in binary mode because pickle produces bytes; the
# context manager closes it even if serialization fails.
with open("model.pkl", "wb") as model_file:
    model_file.write(pickle.dumps(lm))