# In [None]:
# House Prices Prediction Project - Detailed Notebook

# 1. Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 2. Load Dataset
# Reads the training data from the current working directory.
df = pd.read_csv('train.csv')  # Assuming train.csv from Kaggle dataset

# 3. Data Inspection
print(df.shape)  # (number of rows, number of columns)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())  # summary statistics
print(df.head())  # first rows of the table

# 4. Data Cleaning and Preprocessing
# Checking missing values: count NaNs per column and keep only the columns
# that actually have some.
missing = df.isnull().sum()
missing = missing[missing > 0]
print(missing)

# Drop columns with too many missing values
drop_cols = ['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu']
df = df.drop(columns=drop_cols)

# Fill missing numerical features with median
df = df.fillna(df.median(numeric_only=True))

# Fill missing categorical features with mode.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on df[col]: that chained in-place call operates on a
# temporary selection, which triggers a FutureWarning on pandas 2.x and does
# not modify df at all once Copy-on-Write is active.
for col in df.select_dtypes(include=['object']).columns:
    most_common = df[col].mode()[0]  # mode() skips NaN; [0] picks the first mode
    df[col] = df[col].fillna(most_common)

# Encode categorical features (one-hot; drop_first removes one level per
# feature to avoid perfectly collinear dummy columns)
df = pd.get_dummies(df, drop_first=True)

# 5. EDA - Statistical Summary
# Descriptive statistics of the target column.
price_summary = df['SalePrice'].describe()
print(price_summary)

# Correlation with SalePrice
# Build the full correlation matrix, take the SalePrice column and order it
# from the most positive coefficient downwards.
corr_matrix = df.corr()
correlations = corr_matrix['SalePrice'].sort_values(ascending=False)
top_correlations = correlations.head(10)
print(top_correlations)

# 6. Visualizations
# Matplotlib
# Histogram of the target, drawn through an explicit Axes object.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(df['SalePrice'], bins=30, color='skyblue')
ax.set_title('Distribution of Sale Prices')
ax.set_xlabel('Sale Price')
ax.set_ylabel('Number of Houses')
plt.show()

# Scatter of living area against price; alpha softens overlapping points.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df['GrLivArea'], df['SalePrice'], alpha=0.5)
ax.set_title('Living Area vs Sale Price')
ax.set_xlabel('Ground Living Area')
ax.set_ylabel('Sale Price')
plt.show()

# Seaborn
# Price distribution per overall-quality level.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='OverallQual', y='SalePrice', ax=ax)
ax.set_title('Sale Price vs Overall Quality')
plt.show()

# Correlation matrix of every column rendered as a heatmap.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df.corr(), cmap='coolwarm', ax=ax)
ax.set_title('Heatmap of Features')
plt.show()

# Extra visualization
# Pairwise relationships between the target and three area features.
pair_cols = ['SalePrice', 'GrLivArea', 'GarageArea', 'TotalBsmtSF']
sns.pairplot(df[pair_cols])
plt.show()

# 7. Modeling
# Features and Target
y = df['SalePrice']
# Everything except the target is a candidate feature. The 'Id' column is
# also excluded: it is presumably the dataset's row identifier (assumed from
# the Kaggle file named at load time - confirm), which carries no information
# about the house and would only add noise to the fit. errors='ignore' makes
# this a no-op if the column is not present.
X = df.drop(columns='SalePrice').drop(columns='Id', errors='ignore')

# Split data: 70% train / 30% test, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the held-out rows
y_pred = model.predict(X_test)

# Evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as SalePrice, unlike the squared error
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R^2 Score: {r2}")

# 8. Conclusion
# This model captures major trends but can be improved with feature engineering and advanced models.
