In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
import warnings

# Silence every warning category for the rest of the session so library
# notices do not clutter the cell outputs.
warnings.simplefilter('ignore')

# 2. Load Dataset
# Read the training data (path is relative to the notebook's working
# directory) and print a quick overview of its size and column names.
df = pd.read_csv("train.csv")
dataset_shape = df.shape
dataset_columns = df.columns
print("Shape:", dataset_shape)
print("Columns:\n", dataset_columns)

In [None]:
# 3. EDA - Target Variable
# Histogram (with a KDE overlay) of the sale prices.
price_fig, price_ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['SalePrice'], kde=True, ax=price_ax)
price_ax.set_title("Distribution of House Prices")
plt.show()

# 4. Correlation with target
# Correlation of every numeric column with SalePrice, strongest first.
numeric_corr_matrix = df.corr(numeric_only=True)
corr = numeric_corr_matrix['SalePrice'].sort_values(ascending=False)
print("\nTop correlated features with SalePrice:\n", corr.head(10))

# Visualize top features
# Position 0 of the sorted ranking is skipped as 'SalePrice' itself;
# the next five entries are plotted against the target.
top_features = corr.index[1:6]
pairplot_columns = [*top_features, 'SalePrice']
sns.pairplot(df[pairplot_columns])
plt.suptitle("Pairplot of Top Correlated Features", y=1.02)
plt.show()

# 5. Data Preprocessing
# Hand-picked numerical predictors used by the model.
selected_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF']
X = df[selected_features]
y = df['SalePrice']

# 6. Train-Test Split
# Split the raw features BEFORE imputing or scaling, so that no statistic
# computed from the held-out rows (column means, standard deviations) leaks
# into the data the model is trained on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fill missing values, if any, using the TRAINING-set column means only.
train_means = X_train_raw.mean()
X_train_filled = X_train_raw.fillna(train_means)
X_test_filled = X_test_raw.fillna(train_means)

# Feature Scaling: fit the scaler on the training rows, then apply the same
# fitted transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_filled)
X_test = scaler.transform(X_test_filled)

# Keep `X` (imputed) and `X_scaled` (full matrix) defined for any later cell
# that refers to them; both now use the train-fitted statistics.
X = X.fillna(train_means)
X_scaled = scaler.transform(X)

# 7. Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)

# 8. Evaluation
# Score the model on the held-out rows only.
y_pred = lr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation:")
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# 9. Coefficients
# Inputs are standardized, so each coefficient is the change in predicted
# SalePrice per one training-set standard deviation of that feature.
coeff_df = pd.DataFrame(lr.coef_, index=selected_features, columns=['Coefficient'])
print("\nModel Coefficients:\n", coeff_df)

# 10. Plot: Actual vs Predicted
# Scatter of held-out actual prices against the model's predictions.
fit_fig, fit_ax = plt.subplots(figsize=(6, 6))
fit_ax.scatter(y_test, y_pred, alpha=0.7)
fit_ax.set_xlabel("Actual Price")
fit_ax.set_ylabel("Predicted Price")
fit_ax.set_title("Actual vs Predicted House Prices")

# Red diagonal (y = x) spanning the range of actual prices: points on this
# line are perfect predictions.
price_lo = min(y_test)
price_hi = max(y_test)
diagonal = [price_lo, price_hi]
fit_ax.plot(diagonal, diagonal, color='red')  # reference line
fit_ax.grid(True)
plt.show()