# Import Required Libraries
Import necessary libraries for data manipulation, visualization, and machine learning (pandas, numpy, matplotlib, seaborn, scikit-learn, etc.).

In [None]:
# Import necessary libraries for data manipulation, visualization, and machine learning
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load and Prepare Dataset
Load a dataset for machine learning, either from a local file or an online source like scikit-learn datasets.

In [None]:
# Load and Prepare Dataset

# Column order of the 13 Boston housing features.
BOSTON_FEATURE_NAMES = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]
# Original source of the dataset, used when scikit-learn no longer ships it.
BOSTON_DATA_URL = "http://lib.stat.cmu.edu/datasets/boston"

try:
    # scikit-learn < 1.2 still provides the loader.
    from sklearn.datasets import load_boston
except ImportError:
    # load_boston was removed in scikit-learn 1.2. Rebuild the same table from
    # the original file: after 22 header lines, each record spans two physical
    # lines (11 values, then 2 more features followed by the target).
    # NOTE(review): this branch needs network access — confirm that is acceptable.
    raw_df = pd.read_csv(BOSTON_DATA_URL, sep=r"\s+", skiprows=22, header=None)
    boston_features = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    boston_target = raw_df.values[1::2, 2]

    # Create a DataFrame from the dataset
    df = pd.DataFrame(boston_features, columns=BOSTON_FEATURE_NAMES)
    df['PRICE'] = boston_target
else:
    # Load the Boston housing dataset
    boston = load_boston()

    # Create a DataFrame from the dataset
    df = pd.DataFrame(boston.data, columns=boston.feature_names)
    df['PRICE'] = boston.target

# Display the first few rows of the DataFrame (printed explicitly: a bare
# expression in the middle of a cell is not rendered)
print(df.head())

# Split the dataset into features (X) and target (y)
X = df.drop('PRICE', axis=1)
y = df['PRICE']

# Split the dataset into training and testing sets (80/20, fixed seed so the
# partition is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Exploratory Data Analysis
Analyze the dataset to understand its structure, check for missing values, and visualize distributions of features.

In [None]:
# Exploratory Data Analysis

# Count null entries per column and report them
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Descriptive statistics for every column
summary_stats = df.describe()
print("Summary statistics of the dataset:\n", summary_stats)

# Histogram of the target variable (PRICE) with a KDE overlay
price_fig, price_ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['PRICE'], kde=True, bins=30, ax=price_ax)
price_ax.set_title('Distribution of House Prices')
price_ax.set_xlabel('Price')
price_ax.set_ylabel('Frequency')
plt.show()

# Annotated heatmap of the pairwise correlations between all columns
correlation_matrix = df.corr()
corr_fig, corr_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=corr_ax)
corr_ax.set_title('Correlation Matrix')
plt.show()

# Scatter-plot grid over every pair of columns
sns.pairplot(df)
plt.show()

# Data Preprocessing
Clean the data, handle missing values, encode categorical variables, and scale numerical features as needed.

In [None]:
# Missing values: none are handled here; the notebook treats the dataset as
# complete, so this step is skipped.

# Categorical encoding: skipped as well, since the notebook treats every
# Boston housing feature as numerical.

# Standardise the numerical features
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, so nothing about
# the test split leaks into the transformation
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the fitted scaler to both splits and wrap the resulting arrays back
# into DataFrames that carry the original column names
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Preview the scaled training data
X_train_scaled.head()

# Split Data into Training and Testing Sets
Divide the dataset into training and testing sets to evaluate model performance.

In [None]:
# Split Data into Training and Testing Sets

# 80/20 split with a fixed seed. The arguments are identical to the split done
# right after loading the data, so the same partition is reproduced.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Shapes of the four resulting pieces
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

# Implement Linear Models
Fit linear regression models (ordinary least-squares Linear Regression, Ridge, and Lasso) to the scaled training data and compare their test-set MSE and R2.

In [None]:
# Implement Linear Models

# Estimators and metrics used in this cell
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Build each estimator and fit it on the scaled training data
# (fit() returns the estimator itself, so construction and fitting are chained)
linear_reg = LinearRegression().fit(X_train_scaled, y_train)
ridge_reg = Ridge(alpha=1.0).fit(X_train_scaled, y_train)
lasso_reg = Lasso(alpha=0.1).fit(X_train_scaled, y_train)

# Test-set predictions
y_pred_linear = linear_reg.predict(X_test_scaled)
y_pred_ridge = ridge_reg.predict(X_test_scaled)
y_pred_lasso = lasso_reg.predict(X_test_scaled)

# Test-set error (MSE) and goodness of fit (R2) per model
mse_linear, r2_linear = mean_squared_error(y_test, y_pred_linear), r2_score(y_test, y_pred_linear)
mse_ridge, r2_ridge = mean_squared_error(y_test, y_pred_ridge), r2_score(y_test, y_pred_ridge)
mse_lasso, r2_lasso = mean_squared_error(y_test, y_pred_lasso), r2_score(y_test, y_pred_lasso)

# Report the metrics
print(f"Linear Regression - MSE: {mse_linear:.2f}, R2: {r2_linear:.2f}")
print(f"Ridge Regression - MSE: {mse_ridge:.2f}, R2: {r2_ridge:.2f}")
print(f"Lasso Regression - MSE: {mse_lasso:.2f}, R2: {r2_lasso:.2f}")

# Parallel lists (same order) that later cells extend and plot
models = ['Linear Regression', 'Ridge Regression', 'Lasso Regression']
mse_values = [mse_linear, mse_ridge, mse_lasso]
r2_values = [r2_linear, r2_ridge, r2_lasso]

# Side-by-side bar charts: MSE on the left, R2 on the right
linear_bar_colors = ['blue', 'green', 'red']
linear_fig, (linear_mse_ax, linear_r2_ax) = plt.subplots(1, 2, figsize=(14, 6))

linear_mse_ax.bar(models, mse_values, color=linear_bar_colors)
linear_mse_ax.set_title('Mean Squared Error of Linear Models')
linear_mse_ax.set_ylabel('MSE')

linear_r2_ax.bar(models, r2_values, color=linear_bar_colors)
linear_r2_ax.set_title('R2 Score of Linear Models')
linear_r2_ax.set_ylabel('R2 Score')

plt.tight_layout()
plt.show()

# Implement Tree-Based Models
Implement decision tree-based models like Random Forest and Gradient Boosting.

In [None]:
# Implement Tree-Based Models

# Import necessary libraries for tree-based models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Initialize the models (fixed seed for reproducible ensembles)
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
gradient_boosting = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Fit the models on the training data
random_forest.fit(X_train_scaled, y_train)
gradient_boosting.fit(X_train_scaled, y_train)

# Predict on the test data
y_pred_rf = random_forest.predict(X_test_scaled)
y_pred_gb = gradient_boosting.predict(X_test_scaled)

# Evaluate the models
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

# Print the performance metrics
print(f"Random Forest - MSE: {mse_rf:.2f}, R2: {r2_rf:.2f}")
print(f"Gradient Boosting - MSE: {mse_gb:.2f}, R2: {r2_gb:.2f}")

# Rebuild the comparison lists from the individual results instead of
# extending the lists in place: in-place extension appended duplicate entries
# every time this cell was re-run.
models = ['Linear Regression', 'Ridge Regression', 'Lasso Regression',
          'Random Forest', 'Gradient Boosting']
mse_values = [mse_linear, mse_ridge, mse_lasso, mse_rf, mse_gb]
r2_values = [r2_linear, r2_ridge, r2_lasso, r2_rf, r2_gb]

# Plot the performance of the models
plt.figure(figsize=(14, 6))

# Plot MSE values
plt.subplot(1, 2, 1)
plt.bar(models, mse_values, color=['blue', 'green', 'red', 'purple', 'orange'])
plt.title('Mean Squared Error of Models')
plt.ylabel('MSE')

# Plot R2 values
plt.subplot(1, 2, 2)
plt.bar(models, r2_values, color=['blue', 'green', 'red', 'purple', 'orange'])
plt.title('R2 Score of Models')
plt.ylabel('R2 Score')

plt.tight_layout()
plt.show()

# Implement Neural Network
Create and train a simple neural network regressor: a small PyTorch LSTM followed by a linear output layer, with each sample fed as a sequence of length one.

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Implement Neural Network
import torch.nn as nn
import torch.optim as optim

# Turn the scaled feature frames and the target series into float32 tensors.
# Targets are reshaped into a single column so they line up with the model's
# one-value-per-sample output.
X_train_tensor = torch.tensor(X_train_scaled.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

# Mini-batch iterator over the training pairs, reshuffled on every pass
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# Define the LSTM model
class LSTMRegressor(nn.Module):
    """Stacked LSTM followed by a linear head applied to the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the hidden state of each LSTM layer.
        num_layers: number of stacked LSTM layers.
        output_size: number of values produced per sample.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # batch_first=True: inputs are laid out as (batch, sequence, features)
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Run ``x`` through the LSTM and map the final step to the output.

        Args:
            x: tensor laid out as (batch, sequence, features).

        Returns:
            Tensor of shape (batch, output_size).
        """
        # Explicit all-zero hidden and cell states, one slice per layer,
        # created on the same device as the input
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        hidden_state = torch.zeros(*state_shape).to(x.device)
        cell_state = torch.zeros(*state_shape).to(x.device)

        sequence_out, _ = self.lstm(x, (hidden_state, cell_state))
        # Only the last time step feeds the regression head
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

# Model hyperparameters: one input feature per scaled column, a single
# regression output
input_size = X_train_scaled.shape[1]
hidden_size, num_layers, output_size = 50, 2, 1

# Network, mean-squared-error objective, and Adam optimiser
model = LSTMRegressor(input_size, hidden_size, num_layers, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Mini-batch training
num_epochs = 100
model.train()
for epoch in range(num_epochs):
    for batch_features, batch_targets in train_loader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()
        # Insert a sequence axis of length one so the input is (batch, 1, features)
        batch_predictions = model(batch_features.unsqueeze(1))
        loss = criterion(batch_predictions, batch_targets)
        loss.backward()
        optimizer.step()
    # Every tenth epoch, report the loss of the last mini-batch of that epoch
    epoch_number = epoch + 1
    if epoch_number % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate the model (eval mode, no gradient tracking)
model.eval()
with torch.no_grad():
    X_test_tensor = X_test_tensor.unsqueeze(1)  # Add sequence dimension
    y_pred_tensor = model(X_test_tensor)
    y_pred_mlp = y_pred_tensor.cpu().numpy()

# Calculate performance metrics
mse_mlp = mean_squared_error(y_test, y_pred_mlp)
r2_mlp = r2_score(y_test, y_pred_mlp)

# Print the performance metrics
print(f"LSTM Neural Network - MSE: {mse_mlp:.2f}, R2: {r2_mlp:.2f}")

# Rebuild the comparison lists from the individual results instead of
# appending in place: appending added a duplicate entry every time the cell
# was re-run.
models = ['Linear Regression', 'Ridge Regression', 'Lasso Regression',
          'Random Forest', 'Gradient Boosting', 'LSTM Neural Network']
mse_values = [mse_linear, mse_ridge, mse_lasso, mse_rf, mse_gb, mse_mlp]
r2_values = [r2_linear, r2_ridge, r2_lasso, r2_rf, r2_gb, r2_mlp]

# Plot the performance of the models including the neural network
plt.figure(figsize=(14, 6))

# Plot MSE values
plt.subplot(1, 2, 1)
plt.bar(models, mse_values, color=['blue', 'green', 'red', 'purple', 'orange', 'cyan'])
plt.title('Mean Squared Error of Models')
plt.ylabel('MSE')

# Plot R2 values
plt.subplot(1, 2, 2)
plt.bar(models, r2_values, color=['blue', 'green', 'red', 'purple', 'orange', 'cyan'])
plt.title('R2 Score of Models')
plt.ylabel('R2 Score')

plt.tight_layout()
plt.show()

# Evaluate Model Performance
Calculate regression performance metrics (mean squared error and R2) for every model. Classification metrics such as accuracy, precision, recall, and F1-score do not apply to this regression problem.

In [None]:
# Evaluate Model Performance

# Calculate performance metrics for each model
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score

# Function to calculate and print performance metrics
def print_performance_metrics(y_true, y_pred, model_name):
    """Print the MSE and R2 of ``y_pred`` against ``y_true``.

    Args:
        y_true: ground-truth target values.
        y_pred: predicted target values.
        model_name: label placed at the start of the printed line.
    """
    # Both scorers take the same (truth, prediction) pair
    mse, r2 = (metric(y_true, y_pred) for metric in (mean_squared_error, r2_score))
    print(f"{model_name} - MSE: {mse:.2f}, R2: {r2:.2f}")

# Test-set predictions of every fitted model, in reporting order
model_predictions = [
    ("Linear Regression", y_pred_linear),
    ("Ridge Regression", y_pred_ridge),
    ("Lasso Regression", y_pred_lasso),
    ("Random Forest", y_pred_rf),
    ("Gradient Boosting", y_pred_gb),
    ("Neural Network", y_pred_mlp),
]

# Print MSE and R2 for each model
for model_label, model_pred in model_predictions:
    print_performance_metrics(y_test, model_pred, model_label)

# Parallel lists (same order) used for plotting and for the comparison table
models = [model_label for model_label, _ in model_predictions]
mse_values = [mean_squared_error(y_test, model_pred) for _, model_pred in model_predictions]
r2_values = [r2_score(y_test, model_pred) for _, model_pred in model_predictions]

# Side-by-side bar charts: MSE on the left, R2 on the right
all_bar_colors = ['blue', 'green', 'red', 'purple', 'orange', 'cyan']
all_fig, (all_mse_ax, all_r2_ax) = plt.subplots(1, 2, figsize=(14, 6))

all_mse_ax.bar(models, mse_values, color=all_bar_colors)
all_mse_ax.set_title('Mean Squared Error of Models')
all_mse_ax.set_ylabel('MSE')

all_r2_ax.bar(models, r2_values, color=all_bar_colors)
all_r2_ax.set_title('R2 Score of Models')
all_r2_ax.set_ylabel('R2 Score')

plt.tight_layout()
plt.show()

# Visualize Performance Metrics
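Create visualizations of model performance using bar charts of MSE and R2. ROC curves and confusion matrices apply only to classification and are not drawn for this regression task, and learning curves are not included.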

In [None]:
# Visualize Performance Metrics

# Import necessary libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns
# plot_confusion_matrix is deliberately not imported: it was removed in
# scikit-learn 1.2 (ConfusionMatrixDisplay replaces it) and importing it makes
# this cell fail with ImportError. Nothing in this cell used it.
from sklearn.metrics import roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay

# Create a bar chart for Mean Squared Error (MSE) of all models
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.bar(models, mse_values, color=['blue', 'green', 'red', 'purple', 'orange', 'cyan'])
plt.title('Mean Squared Error of Models')
plt.ylabel('MSE')

# Create a bar chart for R2 Score of all models
plt.subplot(1, 2, 2)
plt.bar(models, r2_values, color=['blue', 'green', 'red', 'purple', 'orange', 'cyan'])
plt.title('R2 Score of Models')
plt.ylabel('R2 Score')

plt.tight_layout()
plt.show()

# Plot ROC curves for classification models (if applicable)
# Note: The Boston housing dataset is for regression, so ROC curves are not applicable here

# Plot confusion matrices for classification models (if applicable)
# Note: The Boston housing dataset is for regression, so confusion matrices are not applicable here

# Plot learning curves for models (if applicable)
# Note: Learning curves are not included in the current implementation

# Since the dataset is for regression, we will not plot ROC curves or confusion matrices
# Instead, we will focus on the performance metrics already visualized above

# Compare Models
Generate comparison tables and plots to highlight differences in performance metrics across all implemented models.

In [None]:
# Compare Models

# Create a DataFrame to store the performance metrics of all models
# (one row per model; the three source lists share the same order)
performance_df = pd.DataFrame({
    'Model': models,
    'Mean Squared Error': mse_values,
    'R2 Score': r2_values
})

# Display the performance metrics table. It is printed explicitly because a
# bare expression in the middle of a cell is never rendered.
print(performance_df)

# Plot the performance metrics for comparison
plt.figure(figsize=(14, 6))

# Plot Mean Squared Error (MSE) values
plt.subplot(1, 2, 1)
sns.barplot(x='Model', y='Mean Squared Error', data=performance_df, palette='viridis')
plt.title('Mean Squared Error of Models')
plt.xticks(rotation=45)  # tilt the model names so they do not overlap
plt.ylabel('MSE')

# Plot R2 Score values
plt.subplot(1, 2, 2)
sns.barplot(x='Model', y='R2 Score', data=performance_df, palette='viridis')
plt.title('R2 Score of Models')
plt.xticks(rotation=45)  # tilt the model names so they do not overlap
plt.ylabel('R2 Score')

plt.tight_layout()
plt.show()