# Import Libraries


In [107]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the Dataset
Dataset source: https://www.kaggle.com/datasets/fatihb/coffee-quality-data-cqi

# Data Exploration and Preprocessing

In [119]:

# Read the Arabica CSV (path is relative to the notebook's working directory)
# and show its (rows, columns) shape.
csv_path = "Dataset/df_arabica_clean.csv"
coffee_data = pd.read_csv(csv_path)
coffee_data.shape

(207, 41)

In [120]:
# Preview the first five rows of the raw frame.
coffee_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,ID,Country of Origin,Farm Name,Lot Number,Mill,ICO Number,Company,Altitude,Region,Producer,Number of Bags,Bag Weight,In-Country Partner,Harvest Year,Grading Date,Owner,Variety,Status,Processing Method,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean Cup,Sweetness,Overall,Defects,Total Cup Points,Moisture Percentage,Category One Defects,Quakers,Color,Category Two Defects,Expiration,Certification Body,Certification Address,Certification Contact
0,0,0,Colombia,Finca El Paraiso,CQU2022015,Finca El Paraiso,,Coffee Quality Union,1700-1930,"Piendamo,Cauca",Diego Samuel Bermudez,1,35 kg,Japan Coffee Exchange,2021 / 2022,"September 21st, 2022",Coffee Quality Union,Castillo,Completed,Double Anaerobic Washed,8.58,8.5,8.42,8.58,8.25,8.42,10.0,10.0,10.0,8.58,0.0,89.33,11.8,0,0,green,3,"September 21st, 2023",Japan Coffee Exchange,"〒413-0002 静岡県熱海市伊豆山１１７３−５８ 1173-58 Izusan, Ata...",松澤　宏樹　Koju Matsuzawa - +81(0)9085642901
1,1,1,Taiwan,Royal Bean Geisha Estate,"The 2022 Pacific Rim Coffee Summit,T037",Royal Bean Geisha Estate,,Taiwan Coffee Laboratory,1200,Chiayi,曾福森,1,80 kg,Taiwan Coffee Laboratory 台灣咖啡研究室,2021 / 2022,"November 15th, 2022",Taiwan Coffee Laboratory 台灣咖啡研究室,Gesha,Completed,Washed / Wet,8.5,8.5,7.92,8.0,7.92,8.25,10.0,10.0,10.0,8.5,0.0,87.58,10.5,0,0,blue-green,0,"November 15th, 2023",Taiwan Coffee Laboratory 台灣咖啡研究室,"QAHWAH CO., LTD 4F, No. 225, Sec. 3, Beixin Rd...","Lin, Jen-An Neil 林仁安 - 886-289116612"
2,2,2,Laos,OKLAO coffee farms,"The 2022 Pacific Rim Coffee Summit,LA01",oklao coffee processing plant,,Taiwan Coffee Laboratory,1300,Laos Borofen Plateau,WU TAO CHI,19,25 kg,Taiwan Coffee Laboratory 台灣咖啡研究室,2021 / 2022,"November 15th, 2022",Taiwan Coffee Laboratory 台灣咖啡研究室,Java,Completed,Semi Washed,8.33,8.42,8.08,8.17,7.92,8.17,10.0,10.0,10.0,8.33,0.0,87.42,10.4,0,0,yellowish,2,"November 15th, 2023",Taiwan Coffee Laboratory 台灣咖啡研究室,"QAHWAH CO., LTD 4F, No. 225, Sec. 3, Beixin Rd...","Lin, Jen-An Neil 林仁安 - 886-289116612"
3,3,3,Costa Rica,La Cumbre,CQU2022017,La Montana Tarrazu MIll,,Coffee Quality Union,1900,"Los Santos,Tarrazu",Santa Maria de Dota,1,22 kg,Japan Coffee Exchange,2022,"September 21st, 2022",Coffee Quality Union,Gesha,Completed,Washed / Wet,8.08,8.17,8.17,8.25,8.17,8.08,10.0,10.0,10.0,8.25,0.0,87.17,11.8,0,0,green,0,"September 21st, 2023",Japan Coffee Exchange,"〒413-0002 静岡県熱海市伊豆山１１７３−５８ 1173-58 Izusan, Ata...",松澤　宏樹　Koju Matsuzawa - +81(0)9085642901
4,4,4,Colombia,Finca Santuario,CQU2023002,Finca Santuario,,Coffee Quality Union,1850-2100,"Popayan,Cauca",Camilo Merizalde,2,24 kg,Japan Coffee Exchange,2022,"March 6th, 2023",Coffee Quality Union,Red Bourbon,Completed,"Honey,Mossto",8.33,8.33,8.08,8.25,7.92,7.92,10.0,10.0,10.0,8.25,0.0,87.08,11.6,0,2,yellow-green,2,"March 5th, 2024",Japan Coffee Exchange,"〒413-0002 静岡県熱海市伊豆山１１７３−５８ 1173-58 Izusan, Ata...",松澤　宏樹　Koju Matsuzawa - +81(0)9085642901


In [121]:
# Count the missing entries in every column of the raw frame.
coffee_data.isna().sum()

Unnamed: 0                 0
ID                         0
Country of Origin          0
Farm Name                  2
Lot Number                 1
Mill                       3
ICO Number               132
Company                    0
Altitude                   1
Region                     2
Producer                   1
Number of Bags             0
Bag Weight                 0
In-Country Partner         0
Harvest Year               0
Grading Date               0
Owner                      0
Variety                    6
Status                     0
Processing Method          5
Aroma                      0
Flavor                     0
Aftertaste                 0
Acidity                    0
Body                       0
Balance                    0
Uniformity                 0
Clean Cup                  0
Sweetness                  0
Overall                    0
Defects                    0
Total Cup Points           0
Moisture Percentage        0
Category One Defects       0
Quakers       

In [123]:
# Drop unnecessary columns.
# The columns are listed by name rather than by position so the cell does not
# depend on the CSV's column order. These names are the columns that sat at
# positions [0, 1, 4, 6, 7, 8, 14, 18, 26, 27, 28, 34] of the loaded frame.
COLUMNS_TO_DROP = [
    'Unnamed: 0',
    'ID',
    'Lot Number',
    'ICO Number',
    'Company',
    'Altitude',
    'Harvest Year',
    'Status',
    'Uniformity',
    'Clean Cup',
    'Sweetness',
    'Quakers',
]
# errors='ignore' makes re-running the cell a no-op instead of dropping a
# further set of columns (positional drop) or raising KeyError (by-name drop).
# Trade-off: a misspelled name in the list is skipped silently.
coffee_data.drop(columns=COLUMNS_TO_DROP, errors='ignore', inplace=True)

In [124]:
# Replace missing entries in the text columns with the placeholder 'none'.
for column in ('Farm Name', 'Mill', 'Region', 'Producer', 'Variety', 'Processing Method'):
    coffee_data[column] = coffee_data[column].fillna('none')

# Re-count missing values per column to confirm the fill.
print(coffee_data.isna().sum())

Country of Origin        0
Farm Name                0
Mill                     0
Region                   0
Producer                 0
Number of Bags           0
Bag Weight               0
In-Country Partner       0
Grading Date             0
Owner                    0
Variety                  0
Processing Method        0
Aroma                    0
Flavor                   0
Aftertaste               0
Acidity                  0
Body                     0
Balance                  0
Overall                  0
Defects                  0
Total Cup Points         0
Moisture Percentage      0
Category One Defects     0
Color                    0
Category Two Defects     0
Expiration               0
Certification Body       0
Certification Address    0
Certification Contact    0
dtype: int64


In [125]:
# Print a plain-text preview (first five rows) of the cleaned dataset.
print(coffee_data.head(n=5))

  Country of Origin  ...                    Certification Contact
0          Colombia  ...  松澤　宏樹　Koju Matsuzawa - +81(0)9085642901
1            Taiwan  ...     Lin, Jen-An Neil 林仁安 - 886-289116612
2              Laos  ...     Lin, Jen-An Neil 林仁安 - 886-289116612
3        Costa Rica  ...  松澤　宏樹　Koju Matsuzawa - +81(0)9085642901
4          Colombia  ...  松澤　宏樹　Koju Matsuzawa - +81(0)9085642901

[5 rows x 29 columns]


In [126]:
# Remove, in place, any row that still has at least one missing value,
# then show the per-column missing counts as the cell result.
coffee_data.dropna(axis=0, how='any', inplace=True)
coffee_data.isna().sum()

Country of Origin        0
Farm Name                0
Mill                     0
Region                   0
Producer                 0
Number of Bags           0
Bag Weight               0
In-Country Partner       0
Grading Date             0
Owner                    0
Variety                  0
Processing Method        0
Aroma                    0
Flavor                   0
Aftertaste               0
Acidity                  0
Body                     0
Balance                  0
Overall                  0
Defects                  0
Total Cup Points         0
Moisture Percentage      0
Category One Defects     0
Color                    0
Category Two Defects     0
Expiration               0
Certification Body       0
Certification Address    0
Certification Contact    0
dtype: int64

In [127]:
# Convert 'Bag Weight' to numeric by removing ' kg' and converting to float.
# - astype(str) first keeps the cell re-runnable: once the column is float,
#   the .str accessor would otherwise raise AttributeError on a second run.
# - regex=False makes the replacement a literal substring match, independent
#   of the pandas version's default for the `regex` argument.
# A value that is not a number after stripping ' kg' still raises on the
# float cast, as before.
bag_weight_text = coffee_data['Bag Weight'].astype(str)
coffee_data['Bag Weight'] = bag_weight_text.str.replace(' kg', '', regex=False).astype('float')

In [128]:
# Preview the first five rows after the 'Bag Weight' conversion.
coffee_data.head(n=5)

Unnamed: 0,Country of Origin,Farm Name,Mill,Region,Producer,Number of Bags,Bag Weight,In-Country Partner,Grading Date,Owner,Variety,Processing Method,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Overall,Defects,Total Cup Points,Moisture Percentage,Category One Defects,Color,Category Two Defects,Expiration,Certification Body,Certification Address,Certification Contact
0,Colombia,Finca El Paraiso,Finca El Paraiso,"Piendamo,Cauca",Diego Samuel Bermudez,1,35.0,Japan Coffee Exchange,"September 21st, 2022",Coffee Quality Union,Castillo,Double Anaerobic Washed,8.58,8.5,8.42,8.58,8.25,8.42,8.58,0.0,89.33,11.8,0,green,3,"September 21st, 2023",Japan Coffee Exchange,"〒413-0002 静岡県熱海市伊豆山１１７３−５８ 1173-58 Izusan, Ata...",松澤　宏樹　Koju Matsuzawa - +81(0)9085642901
1,Taiwan,Royal Bean Geisha Estate,Royal Bean Geisha Estate,Chiayi,曾福森,1,80.0,Taiwan Coffee Laboratory 台灣咖啡研究室,"November 15th, 2022",Taiwan Coffee Laboratory 台灣咖啡研究室,Gesha,Washed / Wet,8.5,8.5,7.92,8.0,7.92,8.25,8.5,0.0,87.58,10.5,0,blue-green,0,"November 15th, 2023",Taiwan Coffee Laboratory 台灣咖啡研究室,"QAHWAH CO., LTD 4F, No. 225, Sec. 3, Beixin Rd...","Lin, Jen-An Neil 林仁安 - 886-289116612"
2,Laos,OKLAO coffee farms,oklao coffee processing plant,Laos Borofen Plateau,WU TAO CHI,19,25.0,Taiwan Coffee Laboratory 台灣咖啡研究室,"November 15th, 2022",Taiwan Coffee Laboratory 台灣咖啡研究室,Java,Semi Washed,8.33,8.42,8.08,8.17,7.92,8.17,8.33,0.0,87.42,10.4,0,yellowish,2,"November 15th, 2023",Taiwan Coffee Laboratory 台灣咖啡研究室,"QAHWAH CO., LTD 4F, No. 225, Sec. 3, Beixin Rd...","Lin, Jen-An Neil 林仁安 - 886-289116612"
3,Costa Rica,La Cumbre,La Montana Tarrazu MIll,"Los Santos,Tarrazu",Santa Maria de Dota,1,22.0,Japan Coffee Exchange,"September 21st, 2022",Coffee Quality Union,Gesha,Washed / Wet,8.08,8.17,8.17,8.25,8.17,8.08,8.25,0.0,87.17,11.8,0,green,0,"September 21st, 2023",Japan Coffee Exchange,"〒413-0002 静岡県熱海市伊豆山１１７３−５８ 1173-58 Izusan, Ata...",松澤　宏樹　Koju Matsuzawa - +81(0)9085642901
4,Colombia,Finca Santuario,Finca Santuario,"Popayan,Cauca",Camilo Merizalde,2,24.0,Japan Coffee Exchange,"March 6th, 2023",Coffee Quality Union,Red Bourbon,"Honey,Mossto",8.33,8.33,8.08,8.25,7.92,7.92,8.25,0.0,87.08,11.6,0,yellow-green,2,"March 5th, 2024",Japan Coffee Exchange,"〒413-0002 静岡県熱海市伊豆山１１７３−５８ 1173-58 Izusan, Ata...",松澤　宏樹　Koju Matsuzawa - +81(0)9085642901


# Feature Engineering
Prepare the data for modeling by encoding categorical variables and splitting the data into features and target.

In [129]:
# Encode categorical variables as integer category codes and convert the two
# date columns to numeric timestamps (seconds since 1970-01-01).
CATEGORICAL_COLUMNS = [
    'Country of Origin',
    'Farm Name',
    'Mill',
    'Region',
    'Producer',
    'In-Country Partner',
    'Owner',
    'Variety',
    'Processing Method',
    'Color',
    'Certification Body',
    'Certification Address',
    'Certification Contact',
]
DATE_COLUMNS = ['Grading Date', 'Expiration']


def to_epoch_seconds(values):
    """Convert a Series of date strings to float seconds since the Unix epoch.

    Parameters
    ----------
    values : pandas.Series
        Date strings to parse, or an already-numeric Series.

    Returns
    -------
    pandas.Series
        Float seconds since 1970-01-01. Entries that cannot be parsed become
        NaN (parsing uses errors='coerce'). An already-numeric input is
        returned unchanged so that re-running the cell does not reinterpret
        the seconds as a different unit.
    """
    if pd.api.types.is_numeric_dtype(values):
        return values
    parsed = pd.to_datetime(values, errors='coerce')
    # Subtract the epoch and divide by one second instead of casting to int:
    # the result does not depend on the platform's default integer width or
    # on the datetime resolution, and NaT becomes NaN rather than a huge
    # negative integer sentinel.
    return (parsed - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)


# The integer codes are labels only; their numeric order has no meaning.
for column in CATEGORICAL_COLUMNS:
    coffee_data[column] = coffee_data[column].astype('category').cat.codes

for column in DATE_COLUMNS:
    coffee_data[column] = to_epoch_seconds(coffee_data[column])

# Define Features and Target

In [130]:
# Separate the predictors (every column except the cup score) from the score.
target_column = 'Total Cup Points'
X = coffee_data.drop(columns=[target_column])
y = coffee_data[target_column]
# NOTE(review): X still contains the individual sensory scores (Aroma, Flavor,
# Aftertaste, ...). If 'Total Cup Points' is derived from them, the label is
# nearly recoverable from the features -- confirm this is intended.

# Turn the score into a binary label: 1 when the score is at least 85
# (treated here as "high quality"), otherwise 0.
y = y.ge(85).astype(int)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train the Model

In [131]:
# Build a 100-tree random forest with a fixed seed.
forest_params = {"n_estimators": 100, "random_state": 42}
rf = RandomForestClassifier(**forest_params)

# Fit on the training split; the fitted estimator is the cell's result.
rf.fit(X_train, y_train)


In [142]:
# Score the model on the rows it was trained on.
y_predict = rf.predict(X_train)
accuracy_train = accuracy_score(y_train, y_predict)
train_confusion = confusion_matrix(y_train, y_predict)
train_report = classification_report(y_train, y_predict)

# Report accuracy, confusion matrix, and per-class metrics for the training split.
print(f"Accuracy_train: {accuracy_train * 100:.2f}%")
print("Confusion Matrix:\n", train_confusion)
print("Classification Report:\n", train_report)

Accuracy_train: 100.00%
Confusion Matrix:
 [[124   0]
 [  0  41]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       124
           1       1.00      1.00      1.00        41

    accuracy                           1.00       165
   macro avg       1.00      1.00      1.00       165
weighted avg       1.00      1.00      1.00       165



In [143]:
# Score the model on the held-out test split.
# confusion_matrix, accuracy_score and classification_report come from the
# notebook's import cell, so no import is repeated here.
y_predict = rf.predict(X_test)
accuracy_test = accuracy_score(y_test, y_predict)
conf_matrix = confusion_matrix(y_test, y_predict)

# Report the test metrics. The accuracy line is labelled "test" so it cannot
# be mistaken for the training accuracy printed by the previous cell.
print(f"Accuracy_test: {accuracy_test * 100:.2f}%")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_report(y_test, y_predict))

Accuracy_train: 100.00%
Confusion Matrix:
 [[35  0]
 [ 0  7]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      1.00      1.00         7

    accuracy                           1.00        42
   macro avg       1.00      1.00      1.00        42
weighted avg       1.00      1.00      1.00        42

