# Import necessary libraries

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Read the raw listings from a CSV resolved relative to the working directory.
DATA_PATH = 'house_prices.csv'
df = pd.read_csv(DATA_PATH)

# Print the column/dtype summary, then let the notebook render the leading
# rows (`head()` is the cell's final expression, so its value is displayed).
df.info()
df.head()


## Feature Engineering

In [None]:
# Feature Engineering

# Years elapsed between each row's `date_added` and the current calendar year.
# NOTE(review): this is derived from the date the row was added, not from a
# construction date — confirm that `house_age` is the intended name.
current_year = pd.to_datetime('today').year
df['house_age'] = current_year - pd.to_datetime(df['date_added']).dt.year

# Bedrooms per unit of area. A zero area is mapped to NaN before dividing so
# the ratio is NaN rather than +/-inf; an inf in this column would make the
# mean/std used by the z-score filter in the outlier cell undefined.
# NOTE(review): the previous comment referred to 'Area Size' (assumed to be in
# square feet) while the code divides by 'area' — confirm which column is meant.
df['bedrooms_per_area'] = df['bedrooms'] / df['area'].replace(0, np.nan)

# One-hot encode the categorical columns; every other column is carried over
# unchanged.
categorical_cols = [
    'property_type',
    'location',
    'city',
    'province_name',
    'Area Type',
    'Area Size',
    'Area Category',
]
df_encoded = pd.get_dummies(df, columns=categorical_cols)

# Drop identifier / free-text columns that are not used as model features.
df_encoded.drop(
    ['property_id', 'location_id', 'page_url', 'date_added', 'agency', 'agent'],
    axis=1,
    inplace=True,
)

# Show the transformed DataFrame
df_encoded.head()


## Outlier Analysis

In [None]:
# Outlier Analysis

# Calculate Z-scores for numeric columns to identify outliers
from scipy import stats

numeric_cols = ['price', 'baths', 'area', 'bedrooms', 'house_age', 'bedrooms_per_area']

# Treat +/-inf (e.g. produced by a division by zero) as missing, and have
# zscore skip missing values when computing each column's mean and std.
# With the default nan_policy='propagate', one NaN/inf turns the whole
# column's z-scores into NaN, the `< 3` test is then False for every row,
# and the filter below would empty the DataFrame.
numeric_values = df_encoded[numeric_cols].replace([np.inf, -np.inf], np.nan)
z_scores = np.abs(
    stats.zscore(numeric_values.to_numpy(dtype=float), axis=0, nan_policy='omit')
)

# Keep rows whose |z| is below 3 in every numeric column. A row with a missing
# numeric value has a NaN z-score, compares False, and is therefore dropped.
within_bounds = (z_scores < 3).all(axis=1)
df_encoded = df_encoded[within_bounds]

# Check for outliers in 'price'
plt.figure(figsize=(10, 6))
# Pass the series by keyword so the plotted axis does not depend on how the
# installed seaborn version interprets a positional argument.
sns.boxplot(x=df_encoded['price'])
plt.title('Boxplot of House Prices')
plt.show()

# Investigate outliers: rows priced above the 95th percentile of the
# already-filtered data.
price_cutoff = df_encoded['price'].quantile(0.95)
outliers = df_encoded[df_encoded['price'] > price_cutoff]
print("Outliers:")
print(outliers)


## Predictive Modeling

In [None]:
# Separate the target column (`price`) from the predictor columns.
y = df_encoded['price']
X = df_encoded.drop(columns='price')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate regressors, keyed by the label used in the printed output.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
}

# Per-model metrics, filled in as each estimator is evaluated.
results = {}

for name, model in models.items():
    # Fit on the training split, then score predictions on the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'MSE': mse, 'R2': r2}

    # Three report lines followed by one blank separator line.
    print(
        f"Model: {name}",
        f"Mean Squared Error: {mse}",
        f"R-squared: {r2}",
        "",
        sep="\n",
    )

# One row per model, one column per metric.
results_df = pd.DataFrame(results).transpose()
print(results_df)


## Future Price Prediction

In [None]:

# Define a function for future price prediction
def predict_price(model, features):
    """Return the model's prediction for a single observation.

    Args:
        model: Any object exposing ``predict(X)`` that returns an indexable
            sequence of predictions (one per row of ``X``).
        features: The observation to score. A ``pandas.Series`` is turned into
            a one-row DataFrame whose columns are the Series index; a
            ``pandas.DataFrame`` is passed through unchanged; anything else
            (for example a plain list of values) is wrapped in a list to form
            a single-row 2D input.

    Returns:
        The first element of ``model.predict(...)``.
    """
    if isinstance(features, pd.DataFrame):
        row = features
    elif isinstance(features, pd.Series):
        # Keep the column labels: wrapping the Series in a bare list would
        # discard them, leaving the model to match features by position only.
        row = pd.DataFrame([features])
    else:
        row = [features]
    return model.predict(row)[0]

# Example hypothetical scenario
example_features = {
    'baths': 2,
    'area': 1500,
    'bedrooms': 3,
    'house_age': 5,
    'bedrooms_per_area': 0.002
}

# Convert the example features to the same format as the training data
example_features_df = pd.DataFrame([example_features])

# One-hot encode only the categorical columns the example actually provides.
# Naming columns in `columns=` that are absent from the frame raises KeyError,
# and this example supplies numeric features only.
categorical_cols = [
    'property_type',
    'location',
    'city',
    'province_name',
    'Area Type',
    'Area Size',
    'Area Category',
]
present_categoricals = [
    col for col in categorical_cols if col in example_features_df.columns
]
if present_categoricals:
    example_features_encoded = pd.get_dummies(
        example_features_df, columns=present_categoricals
    )
else:
    example_features_encoded = example_features_df

# Align with the training matrix: same columns in the same order. Any training
# column the example does not supply (including every dummy column of an
# omitted categorical) is filled with 0.
example_features_encoded = example_features_encoded.reindex(columns=X.columns, fill_value=0)

# Predict using the best model (e.g., Random Forest)
best_model = models['Random Forest']
predicted_price = predict_price(best_model, example_features_encoded.iloc[0])
print(f"Predicted Price: {predicted_price}")


### Prepare a report summarizing the findings

In [1]:
# Build and render the written summary of the analysis.

from IPython.display import Markdown, display

# Markdown source of the summary. The text is a fixed literal: nothing in it
# is computed from `results_df` or any other variable in the notebook.
report = """
# House Price Analysis and Prediction Report

## 1. Data Exploration
- Data was successfully loaded and initial inspection showed columns including property features, location, and pricing.

## 2. Feature Engineering
- New features created include `house_age` and `bedrooms_per_area`.
- Categorical features were encoded using one-hot encoding.

## 3. Outlier Analysis
- Outliers in `price` were identified using Z-scores and boxplots.
- Significant outliers were examined and further investigation suggested potential reasons for their deviations.

## 4. Predictive Modeling
- Models trained: Linear Regression, Random Forest, Gradient Boosting.
- Random Forest achieved the best performance with the lowest Mean Squared Error (MSE) and highest R-squared (R2) score.

## 5. Future Price Prediction
- Example predictions for hypothetical scenarios using the Random Forest model were demonstrated.
- Predictions can be made for houses with specific features based on the trained model.

## 6. Recommendations
- Further data collection on property features and market conditions could improve model accuracy.
- Exploration of additional machine learning algorithms and feature engineering techniques may yield better results.
"""

# Wrap the text so the notebook renders it as formatted Markdown, not as a
# plain string.
rendered_report = Markdown(report)
display(rendered_report)



# House Price Analysis and Prediction Report

## 1. Data Exploration
- Data was successfully loaded and initial inspection showed columns including property features, location, and pricing.

## 2. Feature Engineering
- New features created include `house_age` and `bedrooms_per_area`.
- Categorical features were encoded using one-hot encoding.

## 3. Outlier Analysis
- Outliers in `price` were identified using Z-scores and boxplots.
- Significant outliers were examined and further investigation suggested potential reasons for their deviations.

## 4. Predictive Modeling
- Models trained: Linear Regression, Random Forest, Gradient Boosting.
- Random Forest achieved the best performance with the lowest Mean Squared Error (MSE) and highest R-squared (R2) score.

## 5. Future Price Prediction
- Example predictions for hypothetical scenarios using the Random Forest model were demonstrated.
- Predictions can be made for houses with specific features based on the trained model.

## 6. Recommendations
- Further data collection on property features and market conditions could improve model accuracy.
- Exploration of additional machine learning algorithms and feature engineering techniques may yield better results.
