In [11]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
# Importing required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import joblib
import requests
import numpy as np


## Step 2: Load Dataset

In [13]:
# Load the dataset (replace with actual file path)
data = pd.read_csv("df_arabica_clean.csv")

# Show a preview of the data (uncomment if needed)
data.columns


Index(['Unnamed: 0', 'ID', 'Country of Origin', 'Farm Name', 'Lot Number',
       'Mill', 'ICO Number', 'Company', 'Altitude', 'Region', 'Producer',
       'Number of Bags', 'Bag Weight', 'In-Country Partner', 'Harvest Year',
       'Grading Date', 'Owner', 'Variety', 'Status', 'Processing Method',
       'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance',
       'Uniformity', 'Clean Cup', 'Sweetness', 'Overall', 'Defects',
       'Total Cup Points', 'Moisture Percentage', 'Category One Defects',
       'Quakers', 'Color', 'Category Two Defects', 'Expiration',
       'Certification Body', 'Certification Address', 'Certification Contact'],
      dtype='object')

## Step 3: Define Features and Target Variable

In [14]:
# Define the features (independent variables) and target (dependent variable)
# data.columns = data.columns.str.lower().str.replace(' ', '_')

features = ["Aroma", "Flavor", "Aftertaste", "Acidity", "Body", "Balance", "Uniformity", "Clean Cup", "Sweetness"]
target = "Total Cup Points"  

# Keep only relevant columns in the dataset
data = data[features + [target]]
data


Unnamed: 0,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean Cup,Sweetness,Total Cup Points
0,8.58,8.50,8.42,8.58,8.25,8.42,10.0,10.0,10.0,89.33
1,8.50,8.50,7.92,8.00,7.92,8.25,10.0,10.0,10.0,87.58
2,8.33,8.42,8.08,8.17,7.92,8.17,10.0,10.0,10.0,87.42
3,8.08,8.17,8.17,8.25,8.17,8.08,10.0,10.0,10.0,87.17
4,8.33,8.33,8.08,8.25,7.92,7.92,10.0,10.0,10.0,87.08
...,...,...,...,...,...,...,...,...,...,...
202,7.17,7.17,6.92,7.17,7.42,7.17,10.0,10.0,10.0,80.08
203,7.33,7.08,6.75,7.17,7.42,7.17,10.0,10.0,10.0,80.00
204,7.25,7.17,7.08,7.00,7.08,7.08,10.0,10.0,10.0,79.67
205,6.50,6.75,6.75,7.17,7.08,7.00,10.0,10.0,10.0,78.08


### Step 4: Handle Missing Values

In [15]:
data.fillna(data.mean(), inplace=True)



### Step 5: Split Data into Features and Target

In [16]:
# Split data into features X, and target y

X = data[features]
y = data[target]


### Step 6: Normalize the Data

In [17]:
# Normalize data using StandardScaler 
scaler = StandardScaler()
X_scaler = scaler.fit_transform(X)

### Step 7: Split the Data into Training and Testing Sets

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_scaler, y, test_size=0.2, random_state=42)

In [19]:
import numpy as np

# Check for NaN values in X_train
print(np.isnan(X_train).sum())  

# Check for NaN values in y_train
print(np.isnan(y_train).sum())  


0
0


### Step 8: Train the Model with Hyperparameter Tuning

In [20]:
# Initialise the XGBoost model
xgb_model  = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# Set hyperparameters for tuning
param_grid = {
    'n_estimators': [100],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 6, 9],
    'subsample': [0.7, 1.0]
}

# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(xgb_model, param_grid, cv=2, scoring="neg_mean_squared_error", n_jobs=2, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best model after tuning
best_xgb = grid_search.best_estimator_

Fitting 2 folds for each of 18 candidates, totalling 36 fits


### Step 9: Evaluate the Model and Save It

In [21]:
# Evaluate the model on the test data
y_pred = best_xgb.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Optimized XGBoost Model MSE: {mse}")

# Save the trained model and the scaler for future use
joblib.dump(best_xgb, "coffee_xgboost_model.pkl")
joblib.dump(scaler, "scaler.pkl")
print("Model and Scaler saved successfully!")


Optimized XGBoost Model MSE: 0.1099621414331278
Model and Scaler saved successfully!


In [22]:
# import os
# os.remove("coffee_xgboost_model.pkl")
# os.remove("scaler.pkl")

## Step 10: Fetch Data from API and Make Predictions


### Step 10.1: Load the Trained Model and Scaler

In [23]:
# load trained model and scaler from the disk
model = joblib.load("coffee_xgboost_model.pkl")
scaler = joblib.load('scaler.pkl')

### Step 10.2: Fetch the Latest Data from API

In [24]:
# API URL to fetch the latest data (replace with actual API endpoint)
API_URL = "https://api-endpoint-99oz.onrender.com/api/coffees/latest/"

# Send a request to fetch the latest data
response = requests.get(API_URL)

# Check if the request was successful
if response.status_code == 200:
    latest_data = response.json()  # Convert API response to JSON
    print(latest_data)
else:
    print("Failed to fetch data")
    exit()  # Exit the program if data retrieval fails


{'producer_id': 1, 'harvest_year': 2022, 'grading_date': '2022-09-21', 'variety': 'Gesha', 'processing_method': 'Washed / Wet', 'aroma': 8.08, 'flavor': 8.17, 'aftertaste': 8.17, 'acidity': 8.25, 'body': 8.17, 'balance': 8.08, 'uniformity': 10.0, 'clean_cup': 10.0, 'sweetness': 10.0, 'moisture_percentage': 11.8, 'category': 'Arabica', 'quakers': 0, 'color': 'green', 'coffee_id': 1, 'total_cup_points': 78.92, 'quality_classification': 'Premium', 'created_at': '2025-03-13T19:39:48.424299', 'updated_at': '2025-03-13T19:39:48.424299'}


### Step 10.3: Convert API Response into a DataFrame

In [25]:
import pandas as pd

# Print the structure of `latest_data` to check if it contains the correct feature names
print(latest_data)

# Convert the API response into a pandas DataFrame
df = pd.DataFrame([latest_data])  # Assuming latest_data is a list of dictionaries

# Print column names to verify if the features exist
print(df.columns)

# Ensure that the 'features' list matches the columns of the DataFrame
features = ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance', 'Uniformity', 'Clean Cup', 'Sweetness']

# Select only the relevant features for prediction
# You can also add a check here to see if all features exist in the DataFrame
missing_features = [f for f in features if f not in df.columns]
if missing_features:
    print(f"Warning: The following features are missing in the DataFrame: {missing_features}")

# Select only available features
df = df[[f for f in features if f in df.columns]]

print(df)


{'producer_id': 1, 'harvest_year': 2022, 'grading_date': '2022-09-21', 'variety': 'Gesha', 'processing_method': 'Washed / Wet', 'aroma': 8.08, 'flavor': 8.17, 'aftertaste': 8.17, 'acidity': 8.25, 'body': 8.17, 'balance': 8.08, 'uniformity': 10.0, 'clean_cup': 10.0, 'sweetness': 10.0, 'moisture_percentage': 11.8, 'category': 'Arabica', 'quakers': 0, 'color': 'green', 'coffee_id': 1, 'total_cup_points': 78.92, 'quality_classification': 'Premium', 'created_at': '2025-03-13T19:39:48.424299', 'updated_at': '2025-03-13T19:39:48.424299'}
Index(['producer_id', 'harvest_year', 'grading_date', 'variety',
       'processing_method', 'aroma', 'flavor', 'aftertaste', 'acidity', 'body',
       'balance', 'uniformity', 'clean_cup', 'sweetness',
       'moisture_percentage', 'category', 'quakers', 'color', 'coffee_id',
       'total_cup_points', 'quality_classification', 'created_at',
       'updated_at'],
      dtype='object')
Empty DataFrame
Columns: []
Index: [0]


### Step 10.4: Handle Missing Values

In [26]:
# Fill missing values with the mean of each column
df.fillna(df.mean(), inplace=True)


Step 10.5: Scale the Input Features

In [27]:
# Standardize column names
df.columns = df.columns.str.title().str.replace("_", " ")

# Ensure all required features exist
for col in scaler.feature_names_in_:
    if col not in df:
        df[col] = 0

# Reorder and transform
df = df[scaler.feature_names_in_].astype(float)
df_scaled = scaler.transform(df)


### Step 10.6: Make a Prediction

In [28]:
# Use the trained model to make a prediction
prediction = model.predict(df_scaled)

# Display the predicted coffee quality score
print(f"Predicted Coffee Quality Score: {prediction[0]}")


Predicted Coffee Quality Score: 78.05490112304688
