In [11]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from tqdm import tqdm


In [12]:
import pandas as pd
import os

csv_path = "C:\\Users\\Aagaaz Kapoor\\Desktop\\CT\\final project\\train.csv"
image_folder = r"C:\Users\Aagaaz Kapoor\Desktop\CT\images"  

df = pd.read_csv(csv_path)
print(df.head())


  image_id  healthy  multiple_diseases  rust  scab
0  Train_0        0                  0     0     1
1  Train_1        0                  1     0     0
2  Train_2        1                  0     0     0
3  Train_3        0                  0     1     0
4  Train_4        1                  0     0     0


In [13]:
image_size = (128, 128)
# Store image data
image_data = []

for img_id in df['image_id']:
    img_path = os.path.join(image_folder, f"{img_id}.jpg")
    
    # Read image using OpenCV
    img = cv2.imread(img_path)
    
    if img is not None:
        # Resize image
        img = cv2.resize(img, image_size)
        # Optionally convert to grayscale
        # img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        image_data.append(img)
    else:
        print(f"Warning: {img_path} not found or couldn't be read.")

In [15]:
from skimage.feature import hog
import cv2

In [16]:
def extract_features(img, use_hog=True, use_color_hist=True):
    features = []

    # Resize image to ensure consistency (already done before calling this)
    # Convert to grayscale for HOG
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    if use_hog:
        # HOG feature extraction
        hog_features = hog(
            gray,
            orientations=9,
            pixels_per_cell=(8, 8),
            cells_per_block=(2, 2),
            block_norm='L2-Hys',
            transform_sqrt=True,
            feature_vector=True
        )
        features.extend(hog_features)
    
    if use_color_hist:
        # Color Histogram (in RGB)
        chans = cv2.split(img)
        hist_features = []
        for chan in chans:
            hist = cv2.calcHist([chan], [0], None, [32], [0, 256])
            hist = cv2.normalize(hist, hist).flatten()
            hist_features.extend(hist)
        features.extend(hist_features)
    
    return features


In [17]:
# Assume `image_data` is already populated with resized images
X_features = []

for i, img in enumerate(image_data):
    feats = extract_features(img)
    X_features.append(feats)

X_features = np.array(X_features)
print("Feature matrix shape:", X_features.shape)


Feature matrix shape: (1821, 8196)


In [18]:
# One-hot columns: ['healthy', 'multiple_diseases', 'rust', 'scab']
# Convert to single label
labels = df[['healthy', 'multiple_diseases', 'rust', 'scab']].idxmax(axis=1)
labels = labels.map({
    'healthy': 0,
    'multiple_diseases': 1,
    'rust': 2,
    'scab': 3
})
y = labels.values


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [20]:
# X_features and y are assumed from the previous step
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y, test_size=0.2, random_state=42, stratify=y
)


In [21]:
scaler = StandardScaler()

# Fit only on training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform test data using the same scaler
X_test_scaled = scaler.transform(X_test)


In [23]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb  # Make sure to install it using: pip install xgboost


In [25]:
# SVM
svm_model = SVC(kernel='rbf', probability=True, random_state=42)
svm_model.fit(X_train_scaled, y_train)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [26]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)  # No scaling needed for Random Forest

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
# Gradient Boosting (XGBoost)
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [27]:
models = {
    'SVM': svm_model,
    'Random Forest': rf_model,
    'XGBoost': xgb_model
}

for name, model in models.items():
    print(f"\n=== {name} ===")
    
    # Select test set (scaled for SVM, raw for tree models)
    X_test_used = X_test_scaled if name == 'SVM' else X_test
    
    y_pred = model.predict(X_test_used)
    
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



=== SVM ===
Accuracy: 0.5013698630136987
Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.43      0.47       103
           1       0.00      0.00      0.00        18
           2       0.47      0.58      0.52       125
           3       0.52      0.55      0.54       119

    accuracy                           0.50       365
   macro avg       0.38      0.39      0.38       365
weighted avg       0.48      0.50      0.49       365

Confusion Matrix:
 [[44  0 36 23]
 [ 5  0  7  6]
 [20  0 73 32]
 [14  1 38 66]]

=== Random Forest ===
Accuracy: 0.5013698630136987
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.41      0.48       103
           1       0.00      0.00      0.00        18
           2       0.46      0.63      0.53       125
           3       0.51      0.52      0.52       119

    accuracy                           0.50       365
   macro avg     

In [28]:
import joblib

# Save the SVM model
joblib.dump(svm_model, 'svm_model.pkl')

# Also save the scaler used (important for preprocessing test data)
joblib.dump(scaler, 'svm_scaler.pkl')

print("✅ SVM model and scaler saved.")


✅ SVM model and scaler saved.


In [29]:
# Load the saved model and scaler
svm_model = joblib.load('svm_model.pkl')
scaler = joblib.load('svm_scaler.pkl')

print("✅ SVM model and scaler loaded.")


✅ SVM model and scaler loaded.


In [30]:
import cv2
import os
import numpy as np

def extract_features(img):
    features = []

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # HOG
    from skimage.feature import hog
    hog_features = hog(
        gray,
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        block_norm='L2-Hys',
        transform_sqrt=True,
        feature_vector=True
    )
    features.extend(hog_features)

    # Color Histogram (in RGB)
    chans = cv2.split(img)
    for chan in chans:
        hist = cv2.calcHist([chan], [0], None, [32], [0, 256])
        hist = cv2.normalize(hist, hist).flatten()
        features.extend(hist)

    return features


In [33]:
test_image_folder = r"C:\Users\Aagaaz Kapoor\Desktop\CT\images"
image_size = (128, 128)

# Prepare feature list and filenames
X_test_final = []
image_ids = []

for filename in os.listdir(test_image_folder):
    if filename.endswith(".jpg"):
        img_path = os.path.join(test_image_folder, filename)
        img = cv2.imread(img_path)

        if img is not None:
            img = cv2.resize(img, image_size)
            features = extract_features(img)
            X_test_final.append(features)
            image_ids.append(filename.split('.')[0])  # remove .jpg
        else:
            print(f"❌ Failed to read {filename}")

# Convert to NumPy and scale
X_test_final = scaler.transform(np.array(X_test_final))

# Predict
y_pred = svm_model.predict(X_test_final)

# Map numeric label back to category
label_map = {0: 'healthy', 1: 'multiple_diseases', 2: 'rust', 3: 'scab'}
predictions = [label_map[i] for i in y_pred]

# Show results
for img_id, pred in zip(image_ids, predictions):
    print(f"{img_id}: {pred}")


Test_0: scab
Test_1: rust
Test_10: rust
Test_100: rust
Test_1000: scab
Test_1001: healthy
Test_1002: rust
Test_1003: rust
Test_1004: rust
Test_1005: healthy
Test_1006: scab
Test_1007: scab
Test_1008: scab
Test_1009: healthy
Test_101: healthy
Test_1010: rust
Test_1011: healthy
Test_1012: rust
Test_1013: rust
Test_1014: rust
Test_1015: scab
Test_1016: scab
Test_1017: rust
Test_1018: scab
Test_1019: scab
Test_102: rust
Test_1020: healthy
Test_1021: healthy
Test_1022: rust
Test_1023: healthy
Test_1024: scab
Test_1025: rust
Test_1026: rust
Test_1027: scab
Test_1028: healthy
Test_1029: rust
Test_103: healthy
Test_1030: rust
Test_1031: scab
Test_1032: scab
Test_1033: healthy
Test_1034: healthy
Test_1035: rust
Test_1036: rust
Test_1037: healthy
Test_1038: scab
Test_1039: rust
Test_104: scab
Test_1040: scab
Test_1041: scab
Test_1042: rust
Test_1043: rust
Test_1044: rust
Test_1045: scab
Test_1046: scab
Test_1047: healthy
Test_1048: scab
Test_1049: rust
Test_105: rust
Test_1050: healthy
Test_1051

In [34]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

def save_confusion_matrix(y_true, y_pred, model_name):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['healthy', 'multiple_diseases', 'rust', 'scab'],
                yticklabels=['healthy', 'multiple_diseases', 'rust', 'scab'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.tight_layout()
    plt.savefig(f'{model_name}_confusion_matrix.png')
    plt.close()

# Example for SVM
y_pred_svm = svm_model.predict(X_test_scaled)
save_confusion_matrix(y_test, y_pred_svm, "SVM")


In [35]:
accuracies = {
    'SVM': accuracy_score(y_test, svm_model.predict(X_test_scaled)),
    'Random Forest': accuracy_score(y_test, rf_model.predict(X_test)),
    'XGBoost': accuracy_score(y_test, xgb_model.predict(X_test))
}

# Plot
plt.figure(figsize=(8, 5))
sns.barplot(x=list(accuracies.keys()), y=list(accuracies.values()), palette='viridis')
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
plt.tight_layout()
plt.savefig("model_accuracy_comparison.png")
plt.close()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=list(accuracies.keys()), y=list(accuracies.values()), palette='viridis')


In [36]:
from sklearn.metrics import classification_report

# SVM Example
report = classification_report(y_test, y_pred_svm, target_names=['healthy', 'multiple_diseases', 'rust', 'scab'])
with open("svm_classification_report.txt", "w") as f:
    f.write(report)


In [39]:
print("Length of image_ids:", len(image_ids))
print("Length of predicted_labels:", len(predicted_labels))


Length of image_ids: 3642
Length of predicted_labels: 365


In [40]:
image_ids = [f"test_img_{i}" for i in range(len(predicted_labels))]


In [42]:
# Only if you're using real test images (not internal sklearn test split)
# Make sure image_ids and y_pred are from the same test set
image_ids = [f"test_img_{i}" for i in range(len(predicted_labels))]  # if actual names are missing

results_df = pd.DataFrame({
    'image_id': image_ids,
    'predicted_label': predicted_labels
})

results_df.to_csv("svm_predictions.csv", index=False)
print("✅ Predictions saved to svm_predictions.csv")


✅ Predictions saved to svm_predictions.csv


In [43]:
import pandas as pd

# Map predicted labels
label_map = {0: 'healthy', 1: 'multiple_diseases', 2: 'rust', 3: 'scab'}
predicted_labels = [label_map[i] for i in y_pred_svm]

# Assuming you have image IDs for the test set
results_df = pd.DataFrame({
    'image_id': image_ids,  # Make sure this list is available from earlier
    'predicted_label': predicted_labels
})

results_df.to_csv("svm_predictions.csv", index=False)
