In [15]:
import pandas as pd
import numpy as np
import hyperopt as hp
from sklearn.linear_model import LassoCV, ElasticNetCV
from sklearn.model_selection import train_test_split


In [18]:
spoilage_data = pd.read_csv(r"fully_combined_project_data.csv")

# Create the mapping
type_map = {'pork': 0, 'poultry': 1}
spoilage_data['meat_type'] = spoilage_data['EnvType'].map(type_map)


# Drop columns that contain nan values
spoilage_data = spoilage_data.dropna(axis=1)

# Select Columns that are numeric
num_spoilage_data = spoilage_data.select_dtypes(include=np.number)

num_spoilage_data['earlyvlatespoilage'] = (num_spoilage_data['Total mesophilic aerobic flora (log10 CFU.g-1)'] >= 7).astype(int)

display(num_spoilage_data[['Total mesophilic aerobic flora (log10 CFU.g-1)', 'earlyvlatespoilage']])

X =  num_spoilage_data.drop('earlyvlatespoilage', axis=1)
Y = num_spoilage_data['earlyvlatespoilage']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25 , random_state=100, shuffle=True)

Unnamed: 0,Total mesophilic aerobic flora (log10 CFU.g-1),earlyvlatespoilage
0,6.00,0
1,5.90,0
2,6.11,0
3,8.44,1
4,8.44,1
...,...,...
430,8.11,1
431,8.32,1
432,8.81,1
433,8.64,1


### Hyperparameter tuning code

In [19]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

import numpy as np

# Define the objective function to minimize
def objective(params):
    model = ElasticNetCV(cv=10, l1_ratio=params['l1_ratio'], max_iter=int(params['max_iter']), random_state=0 ).fit(X, Y) #Using X Y because we are using cross validation


    # Calculate the loss as the minimum of the mean MSE across all alpha values
    loss = np.min(np.mean(model.mse_path_, axis=1))

    return {'loss': loss, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    'l1_ratio': hp.uniform('l1_ratio', 0.0, 1.0),  # Uniform distribution for x between -10 and 10
    'max_iter': hp.quniform('max_iter', 1000, 10000, 1000)
}

# Run the optimization
trials = Trials()  # Store results of the evaluations
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,  # Tree-structured Parzen Estimator algorithm
    max_evals=100,      # Number of evaluations to perform
    trials=trials
)

print("Best hyperparameters found:", best)
print("Minimum loss achieved:", trials.best_trial['result']['loss'])

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 100/100 [01:22<00:00,  1.21trial/s, best loss: 0.10111924754905999]
Best hyperparameters found: {'l1_ratio': np.float64(2.6124437766479082e-05), 'max_iter': np.float64(3000.0)}
Minimum loss achieved: 0.10111924754905999


In [20]:
reg = ElasticNetCV(cv=10, l1_ratio=best['l1_ratio'], max_iter=int(best['max_iter']), random_state=0).fit(X, Y)

print(f"Optimal alpha: {reg.alpha_}")
print(f"R-squared score on training data: {reg.score(X, Y):.3f}")


coefficients_df = pd.DataFrame({'Feature': X.columns, 'Coefficient': reg.coef_})

Optimal alpha: 465773.66174480127
R-squared score on training data: 0.305


In [24]:
coefficients_df = pd.DataFrame({'Feature': X.columns, 'Coefficient': reg.coef_})

display(coefficients_df.sort_values(by='Coefficient', ascending=False))
# coefficients_df.to_csv("lassoCV_coefs.csv")

Unnamed: 0,Feature,Coefficient
352,Shewanella,0.000016
209,Lactococcus,0.000014
393,Vagococcus,0.000012
207,Lactiplantibacillus,0.000011
205,Lacticaseibacillus,0.000011
...,...,...
250,Morganella,-0.000009
345,Salinivibrio,-0.000010
288,Peptostreptococcus,-0.000014
54,Bacillus,-0.000017
