In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

def clone_repo(repo_url):
    """Clone a git repository into the current directory unless it exists.

    The local directory name is the last path component of ``repo_url``
    with a trailing ``.git`` suffix removed. If a file or directory of that
    name already exists in the current working directory, nothing is cloned.

    Args:
        repo_url: URL (or path) of the repository to clone.

    Returns:
        The name of the local directory the repository lives in.

    Raises:
        ValueError: If no directory name can be derived from ``repo_url``.
        subprocess.CalledProcessError: If ``git clone`` exits with a
            non-zero status.
        OSError: If the ``git`` executable cannot be started.
    """
    import subprocess  # local import: only this function needs it

    # Tolerate a trailing slash, which would otherwise yield an empty name.
    repo_name = repo_url.rstrip('/').split('/')[-1]
    # Strip only a *trailing* ".git"; str.replace would also mangle names
    # that merely contain it, e.g. "my.github.repo".
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    if not repo_name:
        raise ValueError(f"Cannot derive a repository name from {repo_url!r}")

    if not os.path.exists(repo_name):
        print(f"Cloning repository from {repo_url}...")
        # Argument list (no shell) so the URL cannot inject shell commands;
        # "--" stops git from interpreting the URL as an option.
        # check=True surfaces a failed clone instead of silently continuing.
        subprocess.run(["git", "clone", "--", repo_url], check=True)
    else:
        print(f"Repository {repo_name} already cloned.")
    return repo_name

# Clone the Boston House Prices Regression repository (nothing is cloned
# when a directory of that name already exists in the working directory).
repo_url = "https://github.com/ine-rmotr-projects/boston-house-prices-regression.git"
repo_name = clone_repo(repo_url)

# Load the dataset; fail early with a clear message if the CSV is missing.
data_file = os.path.join(repo_name, "boston.csv")
if not os.path.exists(data_file):
    raise FileNotFoundError(f"Data file not found: {data_file}")

data = pd.read_csv(data_file)

# Preview the dataset
print("Preview of the dataset:")
print(data.head())

# Basic information.
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
print("\nDataset information:")
data.info()

# Check for missing values (per-column count of nulls)
print("\nMissing values:")
print(data.isnull().sum())

# Split the data into features and target
X = data.drop("MEDV", axis=1)  # Features: every column except the target
y = data["MEDV"]  # Target (house prices)

# Split into training and testing sets (80% / 20%, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training split only
# and then applied to the test split, so no test statistics leak into training.
# Note: both splits become NumPy arrays here; `X` keeps the column names.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define a Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)

# Define hyperparameters for tuning (3 * 4 * 3 = 36 combinations)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10]
}

# Use GridSearchCV for hyperparameter tuning: 5-fold cross-validation on the
# training split, selecting the combination with the best negated MSE.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(X_train, y_train)

# Best model after tuning
best_rf_model = grid_search.best_estimator_

# Make predictions on the held-out test split
y_pred = best_rf_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation:")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

# Feature importance, labelled with the original column names from `X`
# and plotted from most to least important.
feature_importances = pd.Series(best_rf_model.feature_importances_, index=X.columns)
feature_importances.sort_values(ascending=False).plot(kind='bar', figsize=(10, 6), title="Feature Importances")
plt.show()

# Save processed data (optional)
# NOTE(review): `data` is never modified above, so this writes the dataset
# exactly as it was loaded (not the standardized features) -- confirm that
# this is the intended output.
output_file = os.path.join(repo_name, "processed_boston_data.csv")
data.to_csv(output_file, index=False)
print(f"Processed data saved to {output_file}")
