In [1]:
# Wine Cultivar Origin Prediction System
## Model Development
### 1. Import Required Libraries

# Data manipulation and analysis
import pandas as pd
import numpy as np

# Machine learning
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

# Model persistence
import joblib

# Warnings
import warnings
warnings.filterwarnings('ignore')

print("All libraries imported successfully!")

All libraries imported successfully!


In [2]:
### 2. Load the Wine Dataset
# Load the wine dataset bundled with scikit-learn. The returned object exposes
# the .data, .feature_names and .target attributes used below.
wine_data = load_wine()

# Wrap the feature matrix in a DataFrame with named columns and append the
# class label as the 'cultivar' column.
df = pd.DataFrame(data=wine_data.data, columns=wine_data.feature_names)
df['cultivar'] = wine_data.target

print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\nTarget Distribution:")
print(df['cultivar'].value_counts().sort_index())

Dataset Shape: (178, 14)

First 5 rows:
   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_dilu

In [4]:
### 3. Data Preprocessing

#### 3.1 Check for Missing Values

# Count nulls per column once, then reuse those counts for the grand total
missing_per_column = df.isna().sum()

print("Missing values per column:")
print(missing_per_column)
print("\nTotal missing values:", missing_per_column.sum())

Missing values per column:
alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
cultivar                        0
dtype: int64

Total missing values: 0


In [5]:
#### 3.2 Feature Selection
# The six input columns the model is trained on (6 features as required)
selected_features = [
    'alcohol', 'malic_acid', 'total_phenols',
    'flavanoids', 'color_intensity', 'proline',
]

# Feature matrix restricted to the chosen columns, and the class-label target
X = df[selected_features]
y = df['cultivar']

print("Selected Features:")
print(selected_features)
print(f"\nFeature Matrix Shape: {X.shape}")
print(f"Target Vector Shape: {y.shape}")
print("\nFeature Statistics:")
print(X.describe())

Selected Features:
['alcohol', 'malic_acid', 'total_phenols', 'flavanoids', 'color_intensity', 'proline']

Feature Matrix Shape: (178, 6)
Target Vector Shape: (178,)

Feature Statistics:
          alcohol  malic_acid  total_phenols  flavanoids  color_intensity  \
count  178.000000  178.000000     178.000000  178.000000       178.000000   
mean    13.000618    2.336348       2.295112    2.029270         5.058090   
std      0.811827    1.117146       0.625851    0.998859         2.318286   
min     11.030000    0.740000       0.980000    0.340000         1.280000   
25%     12.362500    1.602500       1.742500    1.205000         3.220000   
50%     13.050000    1.865000       2.355000    2.135000         4.690000   
75%     13.677500    3.082500       2.800000    2.875000         6.200000   
max     14.830000    5.800000       3.880000    5.080000        13.000000   

           proline  
count   178.000000  
mean    746.893258  
std     314.907474  
min     278.000000  
25%     500.50

In [6]:
#### 3.3 Train-Test Split
# Hold out 20% of the rows for testing. stratify=y is passed so the split is
# stratified on the class label; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

# Class counts for each split, ordered by class label
for split_name, split_target in (("Training", y_train), ("Testing", y_test)):
    print(f"\n{split_name} set class distribution:")
    print(split_target.value_counts().sort_index())

Training set size: (142, 6)
Testing set size: (36, 6)

Training set class distribution:
cultivar
0    47
1    57
2    38
Name: count, dtype: int64

Testing set class distribution:
cultivar
0    12
1    14
2    10
Name: count, dtype: int64


In [7]:
#### 3.4 Feature Scaling (Mandatory)
scaler = StandardScaler()

# The scaler is fitted on the training split only; the test split is merely
# transformed with the statistics learned from the training data.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Per-feature summary of the scaled training matrix, used as a sanity check
scaled_train_mean = X_train_scaled.mean(axis=0)
scaled_train_std = X_train_scaled.std(axis=0)

print("Feature scaling completed!")
print("\nScaled training data (first 5 rows):")
print(X_train_scaled[:5])
print("\nMean of scaled features (should be ~0):")
print(scaled_train_mean)
print("\nStd of scaled features (should be ~1):")
print(scaled_train_std)


Feature scaling completed!

Scaled training data (first 5 rows):
[[ 0.38580089 -0.63787118  0.52686525  0.73229212 -0.16746725  0.46772474]
 [ 0.94851892 -0.76544542  1.17279546  1.33318146  0.30530313  1.81576773]
 [ 0.52335419 -0.51940939  0.93057163  1.006382   -0.081509    1.51620262]
 [ 0.97352861 -0.55585917  0.52686525  0.81662747  0.262324    1.93226527]
 [ 0.43582027  0.82012009 -0.55506784 -1.29175618  1.47433535 -0.29783054]]

Mean of scaled features (should be ~0):
[ 1.09781120e-15 -2.57227729e-16 -2.89283464e-16  2.00152883e-16
  4.45066519e-16 -2.07971355e-16]

Std of scaled features (should be ~1):
[1. 1. 1. 1. 1. 1.]


In [8]:
### 4. Model Training
# Random Forest hyperparameters, collected in one place
rf_params = {
    'n_estimators': 100,      # number of trees in the forest
    'max_depth': 10,          # maximum depth of each tree
    'min_samples_split': 5,   # minimum samples required to split a node
    'min_samples_leaf': 2,    # minimum samples required at a leaf node
    'random_state': 42,       # for reproducibility
    'n_jobs': -1,             # use all available processors
}
model = RandomForestClassifier(**rf_params)

# Fit on the scaled training features
print("Training the Random Forest Classifier...")
model.fit(X_train_scaled, y_train)
print("Model training completed!")

Training the Random Forest Classifier...
Model training completed!


In [9]:
### 5. Model Evaluation
# Predict on both splits so training and testing accuracy can be compared
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

# Fraction of correctly classified samples per split
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

banner = "=" * 60
print(banner)
print("MODEL PERFORMANCE METRICS")
print(banner)
print(f"\nTraining Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Testing Accuracy:  {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print("\n" + banner)

MODEL PERFORMANCE METRICS

Training Accuracy: 1.0000 (100.00%)
Testing Accuracy:  1.0000 (100.00%)



In [10]:
# Precision, Recall, and F1-Score (Weighted)
# Evaluate the three scorers on the test predictions with average='weighted'
precision_weighted, recall_weighted, f1_weighted = (
    scorer(y_test, y_test_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print("\nWeighted Metrics:")
print(f"Precision: {precision_weighted:.4f}")
print(f"Recall:    {recall_weighted:.4f}")
print(f"F1-Score:  {f1_weighted:.4f}")


Weighted Metrics:
Precision: 1.0000
Recall:    1.0000
F1-Score:  1.0000


In [11]:
# Precision, Recall, and F1-Score (Macro)
# Evaluate the three scorers on the test predictions with average='macro'
precision_macro, recall_macro, f1_macro = (
    scorer(y_test, y_test_pred, average='macro')
    for scorer in (precision_score, recall_score, f1_score)
)

print("\nMacro Metrics:")
print(f"Precision: {precision_macro:.4f}")
print(f"Recall:    {recall_macro:.4f}")
print(f"F1-Score:  {f1_macro:.4f}")


Macro Metrics:
Precision: 1.0000
Recall:    1.0000
F1-Score:  1.0000


In [12]:
# Classification Report
# Build the per-class report first, then print it under a banner
banner = "=" * 60
cultivar_labels = ['Cultivar 0', 'Cultivar 1', 'Cultivar 2']
report = classification_report(y_test, y_test_pred, target_names=cultivar_labels)

print("\n" + banner)
print("CLASSIFICATION REPORT")
print(banner)
print("\n", report)


CLASSIFICATION REPORT

               precision    recall  f1-score   support

  Cultivar 0       1.00      1.00      1.00        12
  Cultivar 1       1.00      1.00      1.00        14
  Cultivar 2       1.00      1.00      1.00        10

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



In [13]:
# Confusion Matrix
# Counts of test samples by (actual, predicted) class
banner = "=" * 60
cm = confusion_matrix(y_test, y_test_pred)

for header_line in ("\n" + banner, "CONFUSION MATRIX", banner):
    print(header_line)
print("\n", cm)
print("\nRows represent actual classes, columns represent predicted classes")


CONFUSION MATRIX

 [[12  0  0]
 [ 0 14  0]
 [ 0  0 10]]

Rows represent actual classes, columns represent predicted classes


In [14]:
# Feature Importance
# Pair each selected feature with the fitted model's importance score and
# list them from most to least important.
importance_table = pd.DataFrame(
    {'feature': selected_features, 'importance': model.feature_importances_}
)
feature_importance = importance_table.sort_values(by='importance', ascending=False)

banner = "=" * 60
print("\n" + banner)
print("FEATURE IMPORTANCE")
print(banner)
print("\n", feature_importance)


FEATURE IMPORTANCE

            feature  importance
4  color_intensity    0.267241
3       flavanoids    0.252725
5          proline    0.189506
0          alcohol    0.136530
2    total_phenols    0.095098
1       malic_acid    0.058900


In [15]:
### 6. Save the Model and Scaler
# Single source of truth for the artifact location, so the path written to
# disk and the path reported below cannot drift apart.
MODEL_PATH = 'wine_cultivar_model.pkl'

# Bundle everything needed at inference time: the fitted classifier, the
# fitted scaler, the ordered feature list, plus descriptive metadata.
model_package = {
    'model': model,
    'scaler': scaler,
    'features': selected_features,
    'accuracy': test_accuracy,
    'algorithm': 'Random Forest Classifier'
}

# Persist the whole package as one joblib file
joblib.dump(model_package, MODEL_PATH)

# Report from the saved package itself, so the summary always matches what
# was actually persisted.
banner = "=" * 60
print("\n" + banner)
print("MODEL SAVED SUCCESSFULLY!")
print(banner)
print(f"\nModel saved as: {MODEL_PATH}")
print(f"Test Accuracy: {model_package['accuracy']:.4f}")
print(f"Algorithm: {model_package['algorithm']}")
print(f"Features used: {model_package['features']}")


MODEL SAVED SUCCESSFULLY!

Model saved as: wine_cultivar_model.pkl
Test Accuracy: 1.0000
Algorithm: Random Forest Classifier
Features used: ['alcohol', 'malic_acid', 'total_phenols', 'flavanoids', 'color_intensity', 'proline']


In [16]:
### 7. Test the Saved Model
# Read the package back from disk.
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code. That is fine for the file this notebook just wrote, but do
# not point this at untrusted artifacts.
loaded_package = joblib.load('wine_cultivar_model.pkl')

# Pull the inference components out of the package
loaded_model, loaded_scaler, loaded_features = (
    loaded_package[key] for key in ('model', 'scaler', 'features')
)

print("Model loaded successfully!")
print(f"\nFeatures: {loaded_features}")
print(f"Algorithm: {loaded_package['algorithm']}")
print(f"Saved Accuracy: {loaded_package['accuracy']:.4f}")

Model loaded successfully!

Features: ['alcohol', 'malic_acid', 'total_phenols', 'flavanoids', 'color_intensity', 'proline']
Algorithm: Random Forest Classifier
Saved Accuracy: 1.0000


In [17]:
# Test with a sample prediction
# Take the first test row, selecting columns through the feature list stored
# in the package. This exercises the persisted list, so a wrong or reordered
# list would surface here instead of passing silently.
sample_data = X_test[loaded_features].iloc[0:1]

# Run the same scale -> predict path that inference code would use
sample_scaled = loaded_scaler.transform(sample_data)
prediction = loaded_model.predict(sample_scaled)
actual = y_test.iloc[0]

verdict = '✓ Correct' if prediction[0] == actual else '✗ Incorrect'

banner = "=" * 60
print("\n" + banner)
print("SAMPLE PREDICTION TEST")
print(banner)
print("\nSample Input:")
print(sample_data)
print(f"\nPredicted Cultivar: {prediction[0]}")
print(f"Actual Cultivar: {actual}")
print(f"Match: {verdict}")


SAMPLE PREDICTION TEST

Sample Input:
    alcohol  malic_acid  total_phenols  flavanoids  color_intensity  proline
10     14.1        2.16           2.95        3.32             5.75   1510.0

Predicted Cultivar: 0
Actual Cultivar: 0
Match: ✓ Correct
