In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error

data = pd.read_csv('datasets/result.csv')

# old 
data.head()

Unnamed: 0,perimeter,area,height,width,shape_index,compactness,weight
0,351.663995,8141.5,90,114,126.666667,0.827291,58.0
1,362.977703,8445.5,86,124,144.186047,0.805518,56.0
2,359.078207,8310.0,88,116,131.818182,0.809903,54.0
3,362.592927,8421.0,88,118,134.090909,0.804887,58.0
4,356.936072,8322.0,90,115,127.777778,0.820837,58.0


In [14]:
def calculateLength(width_pixel, height_pixel):
    ref_x_min, ref_x_max = 417, 473  # Replace with the reference bounding box in pixels
    real_length_of_reference = 3  # Example: 10 cm
    ref_length_pixel = ref_x_max - ref_x_min
    conversion_factor = real_length_of_reference / ref_length_pixel

    real_width = width_pixel * conversion_factor
    real_height = height_pixel * conversion_factor
    
    return real_width, real_height

def computeFeatures(width, height, area, perimeter):
    shape_index = (width / height) * 100 if height > 0 else 0
    compactness = (4 * np.pi * area) / (perimeter ** 2) if perimeter > 0 else 0
    return shape_index, compactness

def calculatePerimeterArea(perimeter, area):
    ref_x_min, ref_x_max = 417, 473  # Replace with the reference bounding box in pixels
    real_length_of_reference = 3  # Example: 10 cm
    ref_length_pixel = ref_x_max - ref_x_min
    conversion_factor = real_length_of_reference / ref_length_pixel

    real_perimeter = perimeter * conversion_factor
    real_area = area * conversion_factor

    return real_perimeter, real_area


In [15]:
# Apply the calculateLength and computeFeatures functions to the data
data[['real_width', 'real_height']] = data.apply(lambda row: calculateLength(row['width'], row['height']), axis=1, result_type='expand')
data[['real_perimeter', 'real_area']] = data.apply(lambda row: calculatePerimeterArea(row['perimeter'], row['area']), axis=1, result_type='expand')
data[['new_shape_index', 'new_compactness']] = data.apply(lambda row: computeFeatures(row['real_width'], row['real_height'], row['real_area'], row['real_perimeter']), axis=1, result_type='expand')

# #  Drop the old columns and rename the new ones
data = data.drop(columns=['perimeter','area','width', 'height', 'shape_index', 'compactness'])
data = data.rename(columns={'real_perimeter':'perimeter','real_area':'area','real_width': 'width', 'real_height': 'height', 'new_shape_index': 'shape_index', 'new_compactness': 'compactness'})
X = data[['perimeter', 'area', 'height', 'width', 'shape_index', 'compactness']] 
y = data[['weight']]

data = data[['perimeter', 'area', 'height', 'width', 'shape_index', 'compactness', 'weight']]
data.head()

Unnamed: 0,perimeter,area,height,width,shape_index,compactness,weight
0,18.839143,436.151786,4.821429,6.107143,126.666667,15.442772,58.0
1,19.445234,452.4375,4.607143,6.642857,144.186047,15.036339,56.0
2,19.236333,445.178571,4.714286,6.214286,131.818182,15.118182,54.0
3,19.424621,451.125,4.714286,6.321429,134.090909,15.024556,58.0
4,19.121575,445.821429,4.821429,6.160714,127.777778,15.322283,58.0


In [16]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)
correlations = []
mse_list = []
fold = 1

for train_index, test_index in kf.split(X):
    # Split the data.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Create and train the linear regression model.
    model = LinearRegression()
    
    model.fit(X_train, y_train)
    
    # Predict on the test set.
    y_pred = model.predict(X_test)
    
    # Calculate Mean Squared Error.
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)
    
    # Calculate the Pearson correlation coefficient.
    corr, _ = pearsonr(y_test, y_pred)
    correlations.append(corr)
    
    print(f"Fold {fold}: MSE = {mse:.4f}, Correlation coefficient = {corr[0]:.4f}")
    fold += 1

avg_mse = np.mean(mse_list)
avg_corr = np.mean(correlations)

print("\n--- 10-Fold Cross-Validation Results ---")
print(f"Average Mean Squared Error: {avg_mse:.4f}")
print(f"Average Correlation Coefficient (r): {avg_corr:.4f}")

Fold 1: MSE = 0.9126, Correlation coefficient = 0.9592
Fold 2: MSE = 1.2128, Correlation coefficient = 0.9633
Fold 3: MSE = 0.7164, Correlation coefficient = 0.9733
Fold 4: MSE = 1.2960, Correlation coefficient = 0.9641
Fold 5: MSE = 27.9473, Correlation coefficient = 0.3767
Fold 6: MSE = 0.7631, Correlation coefficient = 0.9749
Fold 7: MSE = 1.6017, Correlation coefficient = 0.9508
Fold 8: MSE = 3.6831, Correlation coefficient = 0.9112
Fold 9: MSE = 1.8484, Correlation coefficient = 0.9282
Fold 10: MSE = 1.4003, Correlation coefficient = 0.9170

--- 10-Fold Cross-Validation Results ---
Average Mean Squared Error: 4.1382
Average Correlation Coefficient (r): 0.8919


In [18]:
data

Unnamed: 0,perimeter,area,height,width,shape_index,compactness,weight
0,18.839143,436.151786,4.821429,6.107143,126.666667,15.442772,58.0
1,19.445234,452.437500,4.607143,6.642857,144.186047,15.036339,56.0
2,19.236333,445.178571,4.714286,6.214286,131.818182,15.118182,54.0
3,19.424621,451.125000,4.714286,6.321429,134.090909,15.024556,58.0
4,19.121575,445.821429,4.821429,6.160714,127.777778,15.322283,58.0
...,...,...,...,...,...,...,...
315,18.888907,436.392857,4.714286,6.321429,134.090909,15.369999,58.0
316,18.693004,421.232143,4.660714,6.107143,131.034483,15.148624,54.0
317,19.090194,436.205357,4.607143,6.375000,138.372093,15.041120,58.0
318,19.097808,447.776786,4.767857,6.214286,130.337079,15.427814,60.0


In [None]:
# Example data point (replace with actual values)
new_data_point = pd.DataFrame([[18.839143, 436.151786	, 4.821429	, 6.107143, 126.666667,15.442772	 ]], columns=X.columns)

# Predict the type for the new data point
prediction = model.predict(new_data_point)
prediction

array([[1171.08967671]])