### Importing libraries

    This section imports all libraries utilised within the programme

In [None]:
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sklearn

# Importing necessary libraries for regression models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# Importing necessary libraries for splitting data and calculating metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split

### File importing and preprocessing data
    This section loads the dataset from file and runs basic preprocessing checks

In [None]:
# Import the data from the CSV file. read_csv already returns a DataFrame,
# so no further conversion is required.
dataset = pd.read_csv('Concrete_Data_Yeh_final.csv')

# Data preprocessing checks.
# Each check is printed explicitly: a bare expression in the middle of a cell
# (or script) is evaluated and then discarded, so nothing would be shown.

# Check for null values in each column
print("Null values per column:")
print(dataset.isnull().sum())

# Check for duplicated rows
print("Duplicate rows:", dataset.duplicated().sum())

# Check the data type of each column
print("Column data types:")
print(dataset.dtypes)

# Preview the first five rows
print(dataset.head(5))

In [None]:
# Split into features and target, then into training and test sets.
# Features are every column except the last; the target is the last column.
# The target is selected with -1 (not a hard-coded position) so that it always
# matches the `:-1` feature slice, whatever the number of columns.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print(X_train)

### Regression functions

    This section defines functions for various regression models, which are later compared to choose the most effective model

The regression models included are:

1. **Linear Regression (`linear_regression`)**: This model assumes a linear relationship between the independent and dependent variables. It is simple and provides interpretable results.

2. **Decision Tree Regression (`decision_tree_regression`)**: This model uses a decision tree to predict the dependent variable based on the independent variables. It is a non-parametric method and can capture complex relationships.

3. **Random Forest Regression (`random_forest_regression`)**: This model uses a collection of decision trees to make predictions. It is robust to overfitting and can handle large datasets with many variables.

Each function splits the data into a training set and a test set, fits the model to the training data, makes predictions on the test data, and calculates several metrics to evaluate the performance of the model. These metrics include the R2 score, the mean squared error, and the mean absolute error.

In [None]:
def linear_regression(X, y):
    """
    Fit a Linear Regression model on a training split and score it on a test split.

    Parameters:
    X (numpy.ndarray or pandas.DataFrame): The independent variables (model inputs).
    y (numpy.ndarray or pandas.Series): The dependent variable (model output).

    Returns:
    numpy.ndarray: The predictions made for the held-out test rows.

    Prints:
    R2 Score, Mean Squared Error and Mean Absolute Error, each computed from
    the held-out test targets and the model's predictions.
    """
    # Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
    features_train, features_test, target_train, target_test = train_test_split(
        X, y, test_size=0.25, random_state=0
    )

    # Train the model, then predict the held-out rows
    model = LinearRegression()
    model.fit(features_train, target_train)
    predictions = model.predict(features_test)

    # Report every metric as "<label> <value>", in a fixed order
    metric_table = (
        ("R2 Score:", r2_score),
        ("Mean Squared Error:", mean_squared_error),
        ("Mean Absolute Error:", mean_absolute_error),
    )
    for label, metric in metric_table:
        print(label, metric(target_test, predictions))

    return predictions

In [None]:
def decision_tree_regression(X, y):
    """
    This function applies the Decision Tree Regression model to the given dataset.

    Parameters:
    X (numpy.ndarray or pandas.DataFrame): The independent variables, i.e., the input for the model.
    y (numpy.ndarray or pandas.Series): The dependent variable, i.e., the output for the model.

    Returns:
    numpy.ndarray: The predicted values of the dependent variable for the test set.

    Prints:
    R2 Score: The coefficient of determination, a statistical measure of how well the regression predictions approximate the real data points.
    Mean Squared Error: The average squared difference between the estimated values and the actual value.
    Mean Absolute Error: The average absolute difference between the estimated values and the actual value.
    """
    # Splitting the data (25% held out for testing, fixed seed for a repeatable split)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

    # Creating the Decision Tree regressor.
    # random_state is fixed, as in the other model functions, so that repeated
    # runs build the same tree and report the same metrics; without it, ties
    # between equally good splits are broken randomly.
    regressor = DecisionTreeRegressor(random_state=0)

    # Fitting the data
    regressor.fit(X_train, y_train)

    # Predicting the data
    y_pred = regressor.predict(X_test)

    # Calculating the r2 score
    print("R2 Score:", r2_score(y_test, y_pred))

    # Calculating the mean squared error
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

    # Calculating the mean absolute error
    print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))

    return y_pred

In [None]:
from sklearn.ensemble import RandomForestRegressor

def random_forest_regression(X, y):
    """
    Fit a Random Forest Regression model on a training split and score it on a test split.

    Parameters:
    X (numpy.ndarray or pandas.DataFrame): The independent variables (model inputs).
    y (numpy.ndarray or pandas.Series): The dependent variable (model output).

    Returns:
    numpy.ndarray: The predictions made for the held-out test rows.

    Prints:
    R2 Score, Mean Squared Error and Mean Absolute Error, each computed from
    the held-out test targets and the model's predictions.
    """
    # 75/25 train/test partition with a fixed seed
    split = train_test_split(X, y, test_size=0.25, random_state=0)
    train_inputs, test_inputs, train_targets, test_targets = split

    # Forest of 100 trees, seeded so repeated runs give the same model
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(train_inputs, train_targets)

    # Predictions for the held-out rows
    forecast = forest.predict(test_inputs)

    # Evaluate the three metrics in reporting order, then print them
    r2 = r2_score(test_targets, forecast)
    print("R2 Score:", r2)
    mse = mean_squared_error(test_targets, forecast)
    print("Mean Squared Error:", mse)
    mae = mean_absolute_error(test_targets, forecast)
    print("Mean Absolute Error:", mae)

    return forecast

In [None]:
def elastic_net_regression(X, y):
    """
    Fit an Elastic Net Regression model on a training split and score it on a test split.

    Parameters:
    X (numpy.ndarray or pandas.DataFrame): The independent variables (model inputs).
    y (numpy.ndarray or pandas.Series): The dependent variable (model output).

    Returns:
    numpy.ndarray: The predictions made for the held-out test rows.

    Prints:
    R2 Score, Mean Squared Error and Mean Absolute Error, each computed from
    the held-out test targets and the model's predictions.
    """
    # Hold out a quarter of the rows for testing; seed fixed for repeatability
    features_train, features_test, target_train, target_test = train_test_split(
        X, y, test_size=0.25, random_state=0
    )

    # Elastic Net with its default settings and a fixed seed
    model = ElasticNet(random_state=0)
    model.fit(features_train, target_train)

    predictions = model.predict(features_test)

    # Print each metric as "<label> <value>", in a fixed order
    for label, metric in (
        ("R2 Score:", r2_score),
        ("Mean Squared Error:", mean_squared_error),
        ("Mean Absolute Error:", mean_absolute_error),
    ):
        print(label, metric(target_test, predictions))

    return predictions

In [None]:
def ridge_regression(X, y):
    """
    Fit a Ridge Regression model on a training split and score it on a test split.

    Parameters:
    X (numpy.ndarray or pandas.DataFrame): The independent variables (model inputs).
    y (numpy.ndarray or pandas.Series): The dependent variable (model output).

    Returns:
    numpy.ndarray: The predictions made for the held-out test rows.

    Prints:
    R2 Score, Mean Squared Error and Mean Absolute Error, each computed from
    the held-out test targets and the model's predictions.
    """
    # 75/25 train/test partition with a fixed seed
    partition = train_test_split(X, y, test_size=0.25, random_state=0)
    train_inputs, test_inputs, train_targets, test_targets = partition

    # Ridge with its default settings and a fixed seed
    ridge = Ridge(random_state=0)
    ridge.fit(train_inputs, train_targets)

    # Predictions for the held-out rows
    estimates = ridge.predict(test_inputs)

    # Report the evaluation metrics in a fixed order
    print("R2 Score:", r2_score(test_targets, estimates))
    print("Mean Squared Error:", mean_squared_error(test_targets, estimates))
    print("Mean Absolute Error:", mean_absolute_error(test_targets, estimates))

    return estimates

In [None]:
# Create a KNN Regression model (KNeighborsRegressor comes from sklearn.neighbors)
regressor = KNeighborsRegressor(n_neighbors=5)

# Fit the model.
# NOTE: this cell uses the module-level X_train / X_test split (test_size=0.2),
# whereas the regression functions above each make their own 75/25 split.
regressor.fit(X_train, y_train)

# Make predictions
y_pred = regressor.predict(X_test)

# Print the same three metrics that are reported for the other models
print(f"R2 Score: {r2_score(y_test, y_pred)}")
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred)}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred)}")

### Model Evaluation

This section analyses the performance of each regression model to determine which technique most accurately predicts compressive strength
    
    
    

In [None]:
def test_each_column_with_models(dataset):
    """
    Evaluate several regression models using one feature column at a time.

    For every model, each column except the last is used in turn as the only
    independent variable, with the last column of the dataset as the
    dependent variable. The model is fitted on a training split and scored
    on a test split.

    Parameters:
    dataset (pandas.DataFrame): The data to evaluate. The last column is the
        target; every other column is tried as a single predictor.

    Prints:
    The name of each model, followed by the R2 Score, Mean Squared Error and
    Mean Absolute Error obtained with each individual column.
    """

    # Define the models. Each value must be an estimator instance exposing
    # fit/predict, not one of the helper functions defined earlier.
    # The tree-based models are seeded so that repeated runs are comparable.
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree Regression': DecisionTreeRegressor(random_state=0),
        'Random Forest Regression': RandomForestRegressor(n_estimators=100, random_state=0),
        'Ridge': Ridge(),
        'KNN Regression': KNeighborsRegressor(n_neighbors=5),
        'Lasso': Lasso(),
        'Elastic Net': ElasticNet(),
    }

    # Every column except the last is a candidate feature
    column_names = dataset.columns[:-1]

    # The target is the same for every model and column, so extract it once
    y = dataset.iloc[:, -1].values

    # Loop over each model
    for model_name, model in models.items():
        print(f"Testing {model_name}")

        # Loop over each column
        for column in column_names:
            # Double brackets keep X two-dimensional, as the estimators require
            X = dataset[[column]].values

            # Split the data into training and testing sets
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

            # Fit the model
            model.fit(X_train, y_train)

            # Make predictions
            y_pred = model.predict(X_test)

            # Print the R2 score
            print(f"R2 Score for {column}: {r2_score(y_test, y_pred)}")

            # Print the Mean Squared Error
            print(f"Mean Squared Error for {column}: {mean_squared_error(y_test, y_pred)}")

            # Print the Mean Absolute Error
            print(f"Mean Absolute Error for {column}: {mean_absolute_error(y_test, y_pred)}")

# Call the function
test_each_column_with_models(dataset)

V2

In [None]:
# Work on DataFrame slices of the data so that column names are available.
# The X / y variables created earlier are plain NumPy arrays (built with
# `.values`) and therefore have no `.columns` attribute.
feature_frame = dataset.iloc[:, :-1]
target = dataset.iloc[:, -1]

# List of regression models to test
models = [LinearRegression(), ElasticNet(), Ridge(), DecisionTreeRegressor(), RandomForestRegressor()]

# Generate all combinations of variables, from single columns up to all of them
for r in range(1, len(feature_frame.columns) + 1):
    for variables in combinations(feature_frame.columns, r):
        X_subset = feature_frame[list(variables)]

        # Score each model on this subset with 5-fold cross-validation.
        # No `scoring` argument is passed, so each estimator's default scorer
        # is used (R2 for scikit-learn regressors, per the library docs).
        for model in models:
            scores = cross_val_score(model, X_subset, target, cv=5)
            print(f'Model: {model.__class__.__name__}, Variables: {variables}, Score: {np.mean(scores)}')

### Model Predictions
    
    

In [None]:
#use the chosen regression method to predict compressive strength based on the values given
