<a href="https://colab.research.google.com/github/A-Burnhard/Boston-Housing-/blob/main/Boston_Housing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the housing CSV (path is relative to the notebook's working directory)
# and preview the first rows as the cell output.
housing_csv_path = 'housing.csv'
regression_data = pd.read_csv(housing_csv_path)
regression_data.head()

**Pre-processing**

In [None]:
# Count the missing (NaN/None) entries in every column of the housing frame.
# `isna` is the same check as `isnull`; the result is a Series indexed by column.
missing_values = regression_data.isna().sum()
print("Missing values:\n", missing_values)

In [None]:
# Impute missing values with the per-column mean.
# `numeric_only=True` restricts the means to numeric columns: on recent pandas
# versions `DataFrame.mean()` raises if the frame contains text columns, and a
# mean is not a meaningful fill value for them anyway, so they are left as-is.
# The result is assigned back instead of using `inplace=True`, which keeps the
# cell safe to re-run and makes the data lineage explicit.
regression_data = regression_data.fillna(regression_data.mean(numeric_only=True))

**Identifying Duplicates**

In [None]:
# Flag rows that are exact repeats of an earlier row.
# `duplicates` is a boolean mask aligned with `regression_data` (True = repeat).
duplicates = regression_data.duplicated()

# Report how many duplicates exist and show only the offending rows, rather
# than dumping one True/False value for every row of the dataset.
print("Number of duplicate rows:", int(duplicates.sum()))
print("Duplicates instances: \n", regression_data[duplicates])

**Outlier detection**

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

# Work on the frame loaded at the top of the notebook. Only numeric columns can
# be screened for outliers, and the copy keeps `regression_data` itself intact.
data = regression_data.select_dtypes(include="number").copy()

# Visualize the distribution of each feature using box plots
plt.figure(figsize=(10, 6))
sns.boxplot(data=data)
plt.xticks(rotation=90)
plt.title('Boxplot of Features')
plt.show()

# --- Z-score method -------------------------------------------------------
# A row is flagged when any of its features lies more than `outlier_threshold`
# standard deviations from that feature's mean.
outlier_threshold = 3  # Adjust the threshold as per your preference
z_scores = zscore(data.to_numpy(dtype=float), axis=0)
outliers_zscore = pd.Series(
    (abs(z_scores) > outlier_threshold).any(axis=1), index=data.index
)

# --- IQR method -----------------------------------------------------------
# A value is an outlier when it falls outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1

# Columns whose IQR is zero (for example near-constant or 0/1 indicator
# columns) would have every deviating value flagged, so they get infinite
# fences and are effectively excluded from the IQR screening.
has_spread = IQR > 0
lower_fence = (Q1 - 1.5 * IQR).where(has_spread, float("-inf"))
upper_fence = (Q3 + 1.5 * IQR).where(has_spread, float("inf"))
outliers = ((data < lower_fence) | (data > upper_fence)).any(axis=1)

# Count the number of outlier rows found by each method. The two masks are kept
# under separate names so neither result overwrites the other; the IQR mask
# (`outliers`) is the one used for the handling step below.
num_outliers_zscore = outliers_zscore.sum()
num_outliers = outliers.sum()
print(f"Number of outliers (z-score): {num_outliers_zscore}")
print(f"Number of outliers: {num_outliers}")

# Decide whether to remove outliers or transform them
remove_outliers = False

if remove_outliers:
    # Remove outlier rows from the dataset
    data = data[~outliers]
    print("Outliers removed.")
else:
    # Cap only the offending values at the nearest IQR fence of their own
    # column. Values inside the fences, and the other features of the same
    # row, are left unchanged.
    data = data.clip(lower=lower_fence, upper=upper_fence, axis=1)
    print("Outliers transformed.")

# Updated visualization after handling outliers
plt.figure(figsize=(10, 6))
sns.boxplot(data=data)
plt.xticks(rotation=90)
plt.title('Boxplot of Features (After Outlier Handling)')
plt.show()

**Defining X and Y values**

In [None]:
# NOTE(review): this cell loads the iris classification data even though the
# notebook starts from housing.csv -- confirm which dataset the following
# sections are meant to analyse.

# `iris.data` is read with explicit column names; the list is defined here
# because no earlier cell provides it.
# Assumes the standard layout of four measurements followed by the class
# label -- TODO confirm against the file.
cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
iris_data = pd.read_csv("iris.data", names=cols)

# Separate the target variable (class) from the features
X = iris_data.drop(columns='class')

# Convert the target variable to numeric labels.
# Any label missing from the mapping becomes NaN after `.map`.
label_mapping = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
y = iris_data['class'].map(label_mapping)

**Influential data point detection using leverage and Cook's distance**

In [None]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence

# Design matrix = features plus a constant column for the intercept.
# It gets its own name so that `X` remains the plain feature matrix for the
# cells that follow and re-running this cell does not depend on earlier runs.
X_design = sm.add_constant(X)

# n_params counts every fitted coefficient, i.e. it already includes the
# intercept column added above.
n_obs, n_params = X_design.shape

# Fit the linear regression model
model = sm.OLS(y, X_design)
results = model.fit()

# Get the influence statistics
influence = OLSInfluence(results)

# Leverage: diagonal of the hat matrix, one value per observation
leverage = influence.hat_matrix_diag

# Cook's distance: tuple whose first element holds the distances
cooks_distance = influence.cooks_distance

# Rule-of-thumb cut-offs. Both are expressed with the number of fitted
# parameters p (intercept included): leverage > 2p/n and Cook's D > 4/(n - p).
# Because the intercept is already part of n_params, no extra "+ 1" / "- 1"
# is applied here.
leverage_threshold = 2 * n_params / n_obs
cooks_threshold = 4 / (n_obs - n_params)

influential_points_leverage = leverage > leverage_threshold
influential_points_cooks = cooks_distance[0] > cooks_threshold

# Print the influential data points
print("Influential points based on leverage:")
print(X_design[influential_points_leverage])

print("\nInfluential points based on Cook's distance:")
print(X_design[influential_points_cooks])


**Checking the normality of the features using the Shapiro-Wilk test**

In [None]:
import pandas as pd
from scipy.stats import shapiro

alpha = 0.05  # Significance level

# Select the features to check for normality.
# Columns holding a single repeated value (such as a regression intercept
# column) are skipped: they have zero range, so the test result for them is
# not meaningful and would be reported as "normal".
features = X.loc[:, X.nunique() > 1]

# Perform Shapiro-Wilk test for each feature
for column in features.columns:
    stat, p_value = shapiro(features[column])

    print(f"Feature: {column}")
    print(f"Shapiro-Wilk test statistic: {stat}")
    print(f"P-value: {p_value}")

    # A p-value above alpha means normality cannot be rejected at that level.
    if p_value > alpha:
        print("Feature appears to be normally distributed.")
    else:
        print("Feature does not appear to be normally distributed.")

    print()

**Data Transformation**

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Normalization: Min-Max scaling maps each feature onto the [0, 1] range.
minmax_scaler = MinMaxScaler()
X_normalized = minmax_scaler.fit_transform(X)

# Standardization: each feature is shifted to zero mean and scaled to unit
# variance.
standard_scaler = StandardScaler()
X_standardized = standard_scaler.fit_transform(X)

# `scaler` previously ended up bound to the StandardScaler; keep that name
# available while giving each scaler its own descriptive variable.
scaler = standard_scaler

# Show a preview of the transformed data (the scalers return plain arrays, so
# they are wrapped in DataFrames to display the column names). `X` itself is
# not modified by either transformation.
print("Min-Max normalized features (first rows):")
print(pd.DataFrame(X_normalized, columns=X.columns).head())
print("\nStandardized features (first rows):")
print(pd.DataFrame(X_standardized, columns=X.columns).head())

**Feature Selection**

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Create a Random Forest regressor.
# The seed is fixed (same value as the other models in this notebook) so the
# importance ranking is identical on every run.
rf = RandomForestRegressor(random_state=42)

# Fit the Random Forest model
rf.fit(X, y)

# Get the feature importances (one value per column of X)
feature_importances = rf.feature_importances_

# Pair each feature name with its importance and sort from most to least
# important.
feature_importances_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Print the feature importances
print(feature_importances_df)

**Oversampling with the Synthetic Minority Over-sampling Technique (SMOTE) to balance the class distribution**

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE

# Class counts before resampling, for comparison with the result below
print("Class distribution before SMOTE:")
print(y.value_counts())

# Create a SMOTE object.
# SMOTE generates synthetic samples at random, so the seed is fixed to make
# the resampled dataset reproducible.
smote = SMOTE(random_state=42)

# Perform oversampling
X_resampled, y_resampled = smote.fit_resample(X, y)

# Print the balanced class distribution
print("Class distribution after SMOTE:")
print(y_resampled.value_counts())

**Selecting Appropriate Learners for Training and Validation (Decision Trees and Gradient Boosting)**

In [None]:
import numpy as np
from sklearn.model_selection import KFold, LeaveOneOut, train_test_split
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# `load_boston` was removed from scikit-learn in version 1.2; importing it
# there raises ImportError. Guard the import so the cell runs on both old and
# current releases.
try:
    from sklearn.datasets import load_boston
except ImportError:
    load_boston = None

# Load the Boston Housing dataset
if load_boston is not None:
    data = load_boston()
    X, y = data.data, data.target
else:
    # Fallback taken from scikit-learn's deprecation notice: read the original
    # file (requires network access). Each record spans two physical lines, so
    # the even rows and the first two fields of the odd rows are stitched into
    # the feature matrix, and the third field of the odd rows is the target.
    import pandas as pd

    boston_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(boston_url, sep=r"\s+", skiprows=22, header=None)
    X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    y = raw_df.values[1::2, 2]

# Define the models (fixed seeds for reproducible results)
decision_tree_model = DecisionTreeRegressor(random_state=42)
xgboost_model = XGBRegressor(random_state=42)

# Shared loop behind the K-fold and Leave-One-Out helpers below
def _cross_validated_mse(model, X, y, splitter):
    """Return the mean of the per-split validation MSEs.

    For every (train, validation) index pair produced by ``splitter.split``,
    ``model`` is fitted on the training rows and scored on the validation rows.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        Fitted in place; after the call it holds the fit from the last split.
    X, y : array-like
        Features and targets. Converted with ``np.asarray`` so that NumPy
        arrays, DataFrames/Series and lists are all indexed by row position.
    splitter : object with a ``split(X)`` method yielding index arrays.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    mse_scores = []
    for train_index, val_index in splitter.split(X):
        model.fit(X[train_index], y[train_index])
        y_pred = model.predict(X[val_index])
        mse_scores.append(mean_squared_error(y[val_index], y_pred))

    return np.mean(mse_scores)


# Function to perform K-fold cross-validation
def k_fold_cross_validation(model, X, y, k):
    """Mean validation MSE over ``k`` shuffled folds (shuffle seed fixed to 42).

    ``model`` is fitted in place on each fold's training part.
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    return _cross_validated_mse(model, X, y, kf)


# Function to perform Leave-One-Out cross-validation
def leave_one_out_cross_validation(model, X, y):
    """Mean validation MSE when each sample is held out once.

    The model is refitted once per sample, so the cost grows with the number
    of rows in ``X``. ``model`` is fitted in place.
    """
    return _cross_validated_mse(model, X, y, LeaveOneOut())

# Function to perform Percentage Split validation
def percentage_split_validation(model, X, y, test_size=0.3):
    """Fit ``model`` on a random train split and return the MSE on the rest.

    ``test_size`` is the fraction of samples held out for validation; the
    split is seeded with 42. ``model`` is fitted in place.
    """
    train_features, holdout_features, train_targets, holdout_targets = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    model.fit(train_features, train_targets)
    holdout_predictions = model.predict(holdout_features)

    return mean_squared_error(holdout_targets, holdout_predictions)

# Run every validation scheme for both learners.
# The calls are kept in this exact order (tree first, then XGBoost, scheme by
# scheme) because each call refits the shared model objects in place.
k = 5  # K-fold cross-validation: number of folds

mse_dt_kfold = k_fold_cross_validation(decision_tree_model, X, y, k)
mse_xgboost_kfold = k_fold_cross_validation(xgboost_model, X, y, k)

mse_dt_leave_one_out = leave_one_out_cross_validation(decision_tree_model, X, y)
mse_xgboost_leave_one_out = leave_one_out_cross_validation(xgboost_model, X, y)

mse_dt_percentage_split = percentage_split_validation(decision_tree_model, X, y, test_size=0.3)
mse_xgboost_percentage_split = percentage_split_validation(xgboost_model, X, y, test_size=0.3)

# Report the mean squared error of each (learner, scheme) pair, one per line
mse_report = [
    ("Mean Squared Error (Decision Tree) - K-fold Cross-Validation:", mse_dt_kfold),
    ("Mean Squared Error (XGBoost) - K-fold Cross-Validation:", mse_xgboost_kfold),
    ("Mean Squared Error (Decision Tree) - Leave-One-Out Cross-Validation:", mse_dt_leave_one_out),
    ("Mean Squared Error (XGBoost) - Leave-One-Out Cross-Validation:", mse_xgboost_leave_one_out),
    ("Mean Squared Error (Decision Tree) - Percentage Split Validation:", mse_dt_percentage_split),
    ("Mean Squared Error (XGBoost) - Percentage Split Validation:", mse_xgboost_percentage_split),
]
for label, value in mse_report:
    print(label, value)


**Calculating RMSE and R-squared**

In [None]:
from sklearn.base import clone
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, LeaveOneOut, cross_val_predict, train_test_split


def _cross_validated_r2(model, cv):
    """R-squared of the pooled held-out predictions for splitter ``cv``.

    ``cross_val_predict`` returns, for every sample, the prediction made while
    that sample was in the validation part, so the score is out-of-sample.
    The pooled form is used because a per-split R-squared is undefined when a
    split holds a single sample (Leave-One-Out).
    """
    return r2_score(y, cross_val_predict(model, X, y, cv=cv))


def _holdout_r2(model, test_size=0.3):
    """R-squared on the held-out part of a seeded percentage split.

    A clone of ``model`` is fitted so the shared model object is not refitted.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=42
    )
    fitted = clone(model).fit(X_train, y_train)
    return r2_score(y_val, fitted.predict(X_val))


# Calculate RMSE (square root of the MSEs computed in the previous cell)
rmse_dt_kfold = np.sqrt(mse_dt_kfold)
rmse_xgboost_kfold = np.sqrt(mse_xgboost_kfold)

rmse_dt_leave_one_out = np.sqrt(mse_dt_leave_one_out)
rmse_xgboost_leave_one_out = np.sqrt(mse_xgboost_leave_one_out)

rmse_dt_percentage_split = np.sqrt(mse_dt_percentage_split)
rmse_xgboost_percentage_split = np.sqrt(mse_xgboost_percentage_split)

# Calculate R-squared separately for each validation scheme, always on data
# the model was not trained on. Scoring the last-fitted model on the full
# dataset would give the same (optimistic) number for all three schemes.
# The splitters use the same settings as the MSE computations.
# Note: the Leave-One-Out rows refit each model once per sample and are slow.
kfold_splitter = KFold(n_splits=k, shuffle=True, random_state=42)

r2_dt_kfold = _cross_validated_r2(decision_tree_model, kfold_splitter)
r2_xgboost_kfold = _cross_validated_r2(xgboost_model, kfold_splitter)

r2_dt_leave_one_out = _cross_validated_r2(decision_tree_model, LeaveOneOut())
r2_xgboost_leave_one_out = _cross_validated_r2(xgboost_model, LeaveOneOut())

r2_dt_percentage_split = _holdout_r2(decision_tree_model, test_size=0.3)
r2_xgboost_percentage_split = _holdout_r2(xgboost_model, test_size=0.3)

# Print the evaluation results for each model and validation method
print("Evaluation results for Decision Tree:")
print("K-fold Cross-Validation - MSE:", mse_dt_kfold, "RMSE:", rmse_dt_kfold, "R-squared:", r2_dt_kfold)
print("Leave-One-Out Cross-Validation - MSE:", mse_dt_leave_one_out, "RMSE:", rmse_dt_leave_one_out, "R-squared:", r2_dt_leave_one_out)
print("Percentage Split Validation - MSE:", mse_dt_percentage_split, "RMSE:", rmse_dt_percentage_split, "R-squared:", r2_dt_percentage_split)

print("\nEvaluation results for XGBoost:")
print("K-fold Cross-Validation - MSE:", mse_xgboost_kfold, "RMSE:", rmse_xgboost_kfold, "R-squared:", r2_xgboost_kfold)
print("Leave-One-Out Cross-Validation - MSE:", mse_xgboost_leave_one_out, "RMSE:", rmse_xgboost_leave_one_out, "R-squared:", r2_xgboost_leave_one_out)
print("Percentage Split Validation - MSE:", mse_xgboost_percentage_split, "RMSE:", rmse_xgboost_percentage_split, "R-squared:", r2_xgboost_percentage_split)
