In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Display the first few rows of the training data.
# The preview is printed explicitly: a bare expression is only echoed by a
# notebook when it is the last statement of a cell, and is never shown when
# the code runs as a script.
print(train_data.head())

# Data Preprocessing
# Count the missing values per column and report only the columns that have
# at least one. The result is printed explicitly because a bare expression
# in the middle of a cell (or in a script) is silently discarded.
missing_values = train_data.isnull().sum()
print(missing_values[missing_values > 0])

# Handle missing values (example strategies):
# LotFrontage is filled with its column mean; the other listed columns get a
# placeholder label that stands for "feature not present".
train_data.fillna({
    'LotFrontage': train_data['LotFrontage'].mean(),
    'Alley': 'No Alley',
    'BsmtQual': 'No Basement',
    'GarageType': 'No Garage',
    'PoolQC': 'No Pool',
    # Add more as needed
}, inplace=True)

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each encoded column.
train_data = pd.get_dummies(train_data, drop_first=True)

# Example predictor columns and the regression target.
features = [
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    'GarageArea',
]
y = train_data['SalePrice']
X = train_data[features]

# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion (fit returns the
# fitted estimator itself, so the call can be chained).
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out rows and report the mean squared error.
val_predictions = model.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=val_predictions)
print(f'Mean Squared Error: {mse}')

# Horizontal bar chart of the fitted forest's importance score per feature.
importances = model.feature_importances_
sns.barplot(
    x=importances,
    y=features,
).set_title('Feature Importance')
plt.show()

# Exploratory Data Analysis (EDA)
# Correlation heatmap
# train_data was one-hot encoded earlier in this script, so correlating and
# annotating every column would produce a very large grid that is slow to
# draw and unreadable at this figure size. Only the numeric columns most
# strongly correlated with SalePrice are plotted instead.
# select_dtypes(include='number') keeps numeric columns; boolean dummy
# columns, where get_dummies produces them, are excluded.
corr_matrix = train_data.select_dtypes(include='number').corr()
# nlargest(11) keeps SalePrice itself (|r| = 1) plus up to ten of its
# strongest correlates; NaN correlations are ignored by nlargest.
top_corr_cols = corr_matrix['SalePrice'].abs().nlargest(11).index
plt.figure(figsize=(12, 8))
sns.heatmap(
    corr_matrix.loc[top_corr_cols, top_corr_cols],
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
)
plt.title('Correlation Heatmap')
plt.show()

# Histogram of the target (30 bins) with a kernel-density overlay.
plt.figure(figsize=(10, 6))
sns.histplot(
    train_data['SalePrice'],
    bins=30,
    kde=True,
).set(
    title='SalePrice Distribution',
    xlabel='SalePrice',
    ylabel='Frequency',
)
plt.show()