In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error

# Read the seeds table; the path is relative to the notebook's working directory.
data = pd.read_csv('datasets/seeds.csv')

# Predictors: the seven kernel measurement columns, selected explicitly by name.
X = data.loc[
    :,
    [
        'Area',
        'Perimeter',
        'Compactness',
        'Kernel.Length',
        'Kernel.Width',
        'Asymmetry.Coeff',
        'Kernel.Groove',
    ],
]

# Target: kept as a single-column DataFrame (2-D) rather than a Series, because
# downstream cells index the model output as a 2-D array.
y = data.loc[:, ['Type']]

# Last expression of the cell: rich display of the first rows as a sanity check.
data.head()

Unnamed: 0,Area,Perimeter,Compactness,Kernel.Length,Kernel.Width,Asymmetry.Coeff,Kernel.Groove,Type
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1


In [None]:
# 10-fold cross-validation of an ordinary least-squares model on (X, y).
# Shuffling with a fixed seed makes the fold assignment reproducible.
# NOTE(review): `Type` looks like a class label (1..3); a regression on it treats
# the labels as ordered numbers -- confirm this is intended.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
correlations = []  # per-fold Pearson r between true and predicted values
mse_list = []      # per-fold mean squared error

for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    # Positional split of predictors and target for this fold.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and train the linear regression model.
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict on the held-out fold.
    y_pred = model.predict(X_test)

    # Flatten to 1-D before computing metrics. `y` is a single-column DataFrame,
    # so y_test/y_pred are (n, 1); pearsonr is defined for 1-D samples and only
    # accepts column vectors on recent SciPy releases (returning a length-1 array).
    y_true_flat = np.ravel(y_test)
    y_pred_flat = np.ravel(y_pred)

    # Calculate Mean Squared Error.
    mse = mean_squared_error(y_true_flat, y_pred_flat)
    mse_list.append(mse)

    # Calculate the Pearson correlation coefficient (scalar for 1-D inputs).
    corr, _ = pearsonr(y_true_flat, y_pred_flat)
    correlations.append(corr)

    print(f"Fold {fold}: MSE = {mse:.4f}, Correlation coefficient = {corr:.4f}")

# Unweighted averages over the folds.
avg_mse = np.mean(mse_list)
avg_corr = np.mean(correlations)

print("\n--- 10-Fold Cross-Validation Results ---")
print(f"Average Mean Squared Error: {avg_mse:.4f}")
print(f"Average Correlation Coefficient (r): {avg_corr:.4f}")

Fold 1: MSE = 0.1560, Correlation coefficient = 0.8613
Fold 2: MSE = 0.2215, Correlation coefficient = 0.7571
Fold 3: MSE = 0.1545, Correlation coefficient = 0.8881
Fold 4: MSE = 0.1402, Correlation coefficient = 0.8922
Fold 5: MSE = 0.2199, Correlation coefficient = 0.8447
Fold 6: MSE = 0.1154, Correlation coefficient = 0.9267
Fold 7: MSE = 0.1257, Correlation coefficient = 0.9027
Fold 8: MSE = 0.1263, Correlation coefficient = 0.8982
Fold 9: MSE = 0.2825, Correlation coefficient = 0.8174
Fold 10: MSE = 0.1341, Correlation coefficient = 0.8251

--- 10-Fold Cross-Validation Results ---
Average Mean Squared Error: 0.1676
Average Correlation Coefficient (r): 0.8614


In [12]:
# Example data point (replace with actual values); columns follow X's order.
new_data_point = pd.DataFrame([[15.0, 14.5, 0.87, 5.5, 3.3, 2.2, 5.0]], columns=X.columns)

# Fit a dedicated model on the full dataset instead of reusing `model`, which is
# merely whatever the last cross-validation fold left behind (trained on 9/10 of
# the rows and dependent on hidden loop state).
final_model = LinearRegression()
final_model.fit(X, y)

# Predict the type for the new data point.
prediction = final_model.predict(new_data_point)

# np.ravel makes the extraction independent of whether the target was fitted as
# a 1-D Series or a single-column DataFrame (1-D vs (1, 1) output).
raw_score = float(np.ravel(prediction)[0])

# Round the continuous regression output to the nearest integer and clamp it to
# the label range 1..3 used by this notebook.
predicted_class = int(np.clip(np.round(raw_score), 1, 3))

print(f"Predicted type: {predicted_class}")

Predicted type: 2
