<a href="https://colab.research.google.com/github/FoodPredict/Buckwheat2/blob/main/Buckwheat2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import pandas as pd
from IPython.display import display

# Load the shelf-life dataset into `df`.
# The file is read as latin-1 so the non-ASCII degree sign in the
# 'Storage Temp (°C)' header decodes to the text used for column selection.
# NOTE(review): latin-1 accepts any byte sequence, so a UTF-8 encoded file
# would load without error but with a garbled header -- confirm the file's
# actual encoding if column lookups start failing.
try:
    df = pd.read_csv('dataset.csv', encoding='latin-1')
except FileNotFoundError:
    print("Error: 'dataset.csv' not found. Please upload the file to your Colab environment.")
    # Re-raise: every later cell needs `df`, so carrying on would only trade
    # this clear error for a confusing NameError further down the notebook.
    raise
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")
    # Keep the friendly message but do not hide the failure or its traceback.
    raise
else:
    print("Dataset loaded successfully with latin-1 encoding.")
    # Display the first 5 rows to verify
    display(df.head())

Dataset loaded successfully with latin-1 encoding.


Unnamed: 0,Moisture (%),Storage Temp (°C),RH (%),Initial FFA (%),Initial Microbial Count (log CFU/g),Shelf life (days)
0,11.2,25,65,0.35,2.1,120
1,12.5,30,70,0.42,2.5,90
2,13.0,35,80,0.4,2.7,60
3,10.8,20,60,0.3,2.0,150
4,11.5,27,75,0.38,2.2,110


In [31]:
# Structural overview: column names, non-null counts and dtypes.
# `DataFrame.info()` writes its report directly to stdout.
print("\nDataset Info:")
df.info()

# Per-column count of missing entries.
print("\nMissing Values:", df.isna().sum(), sep="\n")

# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
print("\nDescriptive Statistics:", df.describe(), sep="\n")


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Moisture (%)                         20 non-null     float64
 1   Storage Temp (°C)                    20 non-null     int64  
 2   RH (%)                               20 non-null     int64  
 3   Initial FFA (%)                      20 non-null     float64
 4   Initial Microbial Count (log CFU/g)  20 non-null     float64
 5   Shelf life (days)                    20 non-null     int64  
dtypes: float64(3), int64(3)
memory usage: 1.1 KB

Missing Values:
Moisture (%)                           0
Storage Temp (°C)                      0
RH (%)                                 0
Initial FFA (%)                        0
Initial Microbial Count (log CFU/g)    0
Shelf life (days)                      0
dtype: int64

Descriptive Statistics:
    

In [32]:
!pip install scikit-learn joblib



In [33]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib # Import joblib to save the scaler

# Feature matrix and target vector.
# The column labels below must match the dataset header exactly; their order
# is also the order the fitted scaler will expect at prediction time.
X = df[[
    'Moisture (%)',
    'Storage Temp (°C)',
    'RH (%)',
    'Initial FFA (%)',
    'Initial Microbial Count (log CFU/g)',
]]
y = df['Shelf life (days)']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardize the features. The scaler learns its statistics from the
# training rows only and is then applied unchanged to the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler so the same transformation can be reused later.
joblib.dump(scaler, 'scaler.pkl')

# Report the resulting array shapes and confirm the scaler was written.
print(
    "\nData splitting and scaling complete.",
    f"Shape of X_train_scaled: {X_train_scaled.shape}",
    f"Shape of X_test_scaled: {X_test_scaled.shape}",
    f"Shape of y_train: {y_train.shape}",
    f"Shape of y_test: {y_test.shape}",
    "\nScaler saved as 'scaler.pkl'",
    sep="\n",
)


Data splitting and scaling complete.
Shape of X_train_scaled: (16, 5)
Shape of X_test_scaled: (4, 5)
Shape of y_train: (16,)
Shape of y_test: (4,)

Scaler saved as 'scaler.pkl'


In [35]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib # Import joblib to save the model
import numpy as np # Import numpy for sqrt for RMSE

# Random forest regressor: an ensemble of 100 trees, seeded so that repeated
# runs build the same forest.
model = RandomForestRegressor(random_state=42, n_estimators=100)

print("\nStarting model training...")
# Fit on the standardized training features.
model.fit(X_train_scaled, y_train)
print("Random Forest Regressor model training complete.")

# Score the fitted model on the held-out split.
print("\nEvaluating model on test data...")
predictions = model.predict(X_test_scaled)

# Performance metrics: MSE, its square root (RMSE), and R-squared.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print(
    f"\nMean Squared Error on Test Data: {mse:.4f}",
    f"Root Mean Squared Error on Test Data: {rmse:.4f}",
    f"R-squared Score on Test Data: {r2:.4f}",
    sep="\n",
)

# Show at most the first ten predictions to keep the output short.
print("\nPredictions on Test Data (first 10):", predictions[:10], sep="\n")

# Persist the trained model to disk.
joblib.dump(model, 'shelf_life_prediction_model_rf.pkl')

print("\nModel saved as 'shelf_life_prediction_model_rf.pkl'")


Starting model training...
Random Forest Regressor model training complete.

Evaluating model on test data...

Mean Squared Error on Test Data: 79.3194
Root Mean Squared Error on Test Data: 8.9061
R-squared Score on Test Data: 0.9029

Predictions on Test Data (first 10):
[119.7  139.85 163.95  82.25]

Model saved as 'shelf_life_prediction_model_rf.pkl'


In [36]:
import os
# List the working directory to confirm the saved artifacts are present.
print("Files and folders in the current directory:")
directory_entries = os.listdir()
print(directory_entries)

Files and folders in the current directory:
['.config', 'shelf_life_prediction_model_rf.pkl', 'dataset.csv', 'scaler.pkl', '.ipynb_checkpoints', 'sample_data']


In [37]:
import joblib
import pandas as pd
import numpy as np # Needed if your 'new_data' is a numpy array and needs reshaping

# File names of the persisted artifacts.
model_filename = 'shelf_life_prediction_model_rf.pkl'
scaler_filename = 'scaler.pkl'

# Load the saved model and scaler into `loaded_model` / `loaded_scaler`.
# SECURITY: joblib files are pickles, and unpickling can execute arbitrary
# code -- only load artifacts that you created yourself or otherwise trust.
try:
    loaded_model = joblib.load(model_filename)
    loaded_scaler = joblib.load(scaler_filename)
except FileNotFoundError:
    print(f"Error: Either '{model_filename}' or '{scaler_filename}' not found. Please ensure they are in the correct directory.")
    # Re-raise: the prediction step needs both objects, so continuing would
    # only surface later as a NameError unrelated to the real cause.
    raise
except Exception as e:
    print(f"An error occurred while loading the model or scaler: {e}")
    # Keep the friendly message but do not hide the failure or its traceback.
    raise
else:
    print(f"Model '{model_filename}' and Scaler '{scaler_filename}' loaded successfully.")

Model 'shelf_life_prediction_model_rf.pkl' and Scaler 'scaler.pkl' loaded successfully.


In [38]:
# Example input for a single prediction.
# The scaler and model need a 2D, table-like input whose feature columns have
# the same names and order as the training data, so the sample is built as a
# one-row DataFrame.
# NOTE(review): these example values appear to lie well outside the training
# range (the scaled 'Initial FFA (%)' comes out around 13 standard deviations
# from the training mean). Tree ensembles do not extrapolate, so treat the
# resulting prediction as illustrative only.
new_data = pd.DataFrame([
    {
        'Moisture (%)': 15.0,
        'Storage Temp (°C)': 25.0,
        'RH (%)': 60.0,
        'Initial FFA (%)': 1.2,
        'Initial Microbial Count (log CFU/g)': 3.5,
    }
])

print("\nOriginal new data for prediction:", new_data, sep="\n")

# Apply the same standardization that was fitted on the training data.
scaled_new_data = loaded_scaler.transform(new_data)

print("\nScaled new data for prediction:", scaled_new_data, sep="\n")

# Predict with the reloaded model; the result is an array with one entry per
# input row.
predicted_shelf_life = loaded_model.predict(scaled_new_data)

print(f"\nPredicted Shelf life (days) for the new data: {predicted_shelf_life[0]:.4f}")


Original new data for prediction:
   Moisture (%)  Storage Temp (°C)  RH (%)  Initial FFA (%)  \
0          15.0               25.0    60.0              1.2   

   Initial Microbial Count (log CFU/g)  
0                                  3.5  

Scaled new data for prediction:
[[ 2.85592024 -0.37736761 -1.05614643 13.08        2.70338684]]

Predicted Shelf life (days) for the new data: 83.7500
