# In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, export_text, plot_tree
from sklearn.metrics import mean_squared_error, r2_score

def clone_repo(repo_url):
    """Clone ``repo_url`` into the current directory unless it already exists.

    The local directory name is taken from the last path component of the
    URL, with trailing slashes ignored and a trailing ``.git`` suffix removed.

    Args:
        repo_url: URL (or path) of the Git repository to clone.

    Returns:
        The name of the local directory expected to hold the repository.

    Raises:
        ValueError: If no directory name can be derived from ``repo_url``.
        subprocess.CalledProcessError: If ``git clone`` exits with an error.
        FileNotFoundError: If the ``git`` executable cannot be found.
    """
    import subprocess  # local import: only needed by this helper

    # Ignore trailing slashes so "https://host/user/repo/" still yields "repo".
    repo_name = repo_url.rstrip('/').split('/')[-1]
    # Strip only a trailing ".git"; str.replace would also mangle names that
    # contain ".git" elsewhere (e.g. "user.github.io.git").
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    if not repo_name:
        raise ValueError(f"Cannot derive a repository name from {repo_url!r}")

    if os.path.exists(repo_name):
        print(f"Repository {repo_name} already cloned.")
        return repo_name

    print(f"Cloning repository from {repo_url}...")
    # An argument list (no shell) keeps the URL from being interpreted as
    # shell syntax, and "--" stops git from parsing it as an option.
    # check=True surfaces a failed clone instead of silently continuing.
    subprocess.run(["git", "clone", "--", repo_url], check=True)
    return repo_name

# Fetch the Boston regression-tree visualization repository (no-op if present)
repo_url = "https://github.com/ine-rmotr-projects/visualizing-the-boston-regression-tree.git"
repo_name = clone_repo(repo_url)

# Candidate locations of the CSV, relative to the repository root, in the
# order they are tried.
possible_files = ["boston.csv", "data/boston.csv", "datasets/boston.csv"]

# Take the first candidate that exists on disk; None when there is no match.
candidate_paths = (os.path.join(repo_name, relative) for relative in possible_files)
data_file = next((path for path in candidate_paths if os.path.exists(path)), None)

if data_file is None:
    raise FileNotFoundError("Data file not found in the repository. Please check the repository structure.")

# Load the dataset
data = pd.read_csv(data_file)

# Preview the dataset (first rows)
print("Preview of the dataset:")
print(data.head())

# Basic information
print("\nDataset information:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
data.info()

# Check for missing values (count of nulls per column)
print("\nMissing values:")
print(data.isnull().sum())

# Separate the predictors from the response column
X = data.drop(columns="MEDV")  # everything except the target is a feature
y = data["MEDV"]  # target (house prices)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a depth-limited regression tree on the training split
# (fit() returns the estimator itself, so construction and fitting chain).
dt_model = DecisionTreeRegressor(max_depth=4, random_state=42).fit(X_train, y_train)

# Score the model on the held-out split
y_pred = dt_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics, one per line, under a blank-line-separated heading
print(
    "\nModel Evaluation:",
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (R2): {r2}",
    sep="\n",
)

# Use one plain list of column labels for both renderings below. export_text
# was already given a list; passing the raw pandas Index to plot_tree was
# inconsistent with that and is presumably rejected by some scikit-learn
# releases that validate feature_names strictly (TODO confirm which versions).
feature_names = list(X.columns)

# Visualize the Decision Tree
plt.figure(figsize=(20, 10))
plot_tree(dt_model, feature_names=feature_names, filled=True, rounded=True, fontsize=10)
plt.title("Decision Tree Visualization")
plt.show()

# Print textual representation of the tree
print("\nTextual representation of the decision tree:")
print(export_text(dt_model, feature_names=feature_names))

# Optionally persist actual vs. predicted values inside the cloned repository
# directory for later validation.
predictions_file = os.path.join(repo_name, "boston_predictions.csv")
predictions = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
predictions.to_csv(predictions_file, index=False)  # index=False: no index column in the CSV
print(f"Predictions saved to {predictions_file}")
