### Importing libraries

    This section imports all libraries utilised within the programme

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import scipy

### File importing and preprocessing data
    This section imports all libraries utilised within the programme

In [3]:
#import data from the files
dataset = pd.read_csv('Concrete_Data_Yeh_final.csv')

#Data Preprocessing
#format as a dataframe
dataset = pd.DataFrame(dataset)#
#check for null values
dataset.isnull().sum()
#check for duplicates
dataset.duplicated().sum()
#check for data types
dataset.dtypes

print(dataset.head(5))

   cement   slag  flyash  water  superplasticizer  coarseaggregate  \
0   540.0    0.0     0.0  162.0               2.5           1040.0   
1   540.0    0.0     0.0  162.0               2.5           1055.0   
2   332.5  142.5     0.0  228.0               0.0            932.0   
3   332.5  142.5     0.0  228.0               0.0            932.0   
4   198.6  132.4     0.0  192.0               0.0            978.4   

   fineaggregate    age  csMPa  
0          676.0   28.0  79.99  
1          676.0   28.0  61.89  
2          594.0  270.0  40.27  
3          594.0    NaN  41.05  
4          825.5  360.0  44.30  


In [5]:
#split into training and test sets
from sklearn.model_selection import train_test_split
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 8].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print(X_train)

[[ 480.     0.     0.  ...  936.   721.    28. ]
 [ 375.     0.     0.  ... 1038.   758.    28. ]
 [ 303.6  139.9    0.  ...  895.5  722.5   28. ]
 ...
 [ 144.     0.   175.  ...  943.   844.    28. ]
 [ 239.6  359.4    0.  ...  941.6  664.3   28. ]
 [ 192.   288.     0.  ...  929.8  716.1   90. ]]


### Regression functions

    This section defines functions for various regression functions which are later compared to choose the most effective model

In [9]:
# Build a regressor using sci-kit learn functionalities, including the rational choice of a regression algorithm, data splitting, training, testing, and analysis of the hyper-parameter choices. Justify the choice of algorithm and parameters with an analysis of their effect.
def linear_regression(X, y):
    # Splitting the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

    # Creating the Linear Regression model
    regressor = LinearRegression()

    # Fitting the data
    regressor.fit(X_train, y_train)

    # Predicting the data
    y_pred = regressor.predict(X_test)

    # Calculating the r2 score
    print("R2 Score:", r2_score(y_test, y_pred))

    # Calculating the mean squared error
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

    # Calculating the mean absolute error
    print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))

    return y_pred

In [11]:
def decision_tree_regression(X, y):
    # Splitting the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

    # Creating the Decision Tree regressor
    regressor = DecisionTreeRegressor()

    # Fitting the data
    regressor.fit(X_train, y_train)

    # Predicting the data
    y_pred = regressor.predict(X_test)

    # Calculating the r2 score
    print("R2 Score:", r2_score(y_test, y_pred))

    # Calculating the mean squared error
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

    # Calculating the mean absolute error
    print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))

    return y_pred

In [12]:
from sklearn.ensemble import RandomForestRegressor

def random_forest_regression(X, y):
    # Splitting the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

    # Creating the Random Forest Regressor
    regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)

    # Fitting the data
    regressor.fit(X_train, y_train)

    # Predicting the data
    y_pred = regressor.predict(X_test)

    # Calculating the r2 score
    print("R2 Score:", r2_score(y_test, y_pred))

    # Calculating the mean squared error
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

    # Calculating the mean absolute error
    print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))

    return y_pred

In [14]:
from sklearn.neighbors import KNeighborsRegressor

# Create a KNN Regression model
regressor = KNeighborsRegressor(n_neighbors=5)

# Fit the model
regressor.fit(X_train, y_train)

# Make predictions
y_pred = regressor.predict(X_test)

# Print the R2 score
print(f"R2 Score: {r2_score(y_test, y_pred)}")

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

### Model Evaluation

    This section analyses the performance of each regression model to determine which technique most accuratley predicts compressive strength
 
This section analyses the performance of each regression model to determine which technique most accuratley predicts compressive strength
    
    
    

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet

def test_each_column_with_models(dataset):
    # Define the models
    models = {
        'Linear Regression': LinearRegression(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet()
    }

    # Get the column names
    column_names = dataset.columns[:-1]

    # Loop over each model
    for model_name, model in models.items():
        print(f"Testing {model_name}")

        # Loop over each column
        for column in column_names:
            # Create the features (X) and target (y)
            X = dataset[[column]].values
            y = dataset.iloc[:, -1].values

            # Split the data into training and testing sets
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

            # Fit the model
            model.fit(X_train, y_train)

            # Make predictions
            y_pred = model.predict(X_test)

            # Print the R2 score
            print(f"R2 Score for {column}: {r2_score(y_test, y_pred)}")

            # Print the Mean Squared Error
            print(f"Mean Squared Error for {column}: {mean_squared_error(y_test, y_pred)}")

            # Print the Mean Absolute Error
            print(f"Mean Absolute Error for {column}: {mean_absolute_error(y_test, y_pred)}")

# Call the function
test_each_column_with_models(dataset)

### Model Predictions
    
    

In [None]:
#use the chosen regression method to predict compresive strngth based on the values given
