In [None]:
import os
# Work from the dataset root so that relative file names used by later cells
# (e.g. "tumor_features.xlsx") resolve inside this folder.
dataset_root = "C:\\COLON_CANCER DATASET"
os.chdir(dataset_root)

In [None]:
import os
import cv2
import albumentations as A
from tqdm import tqdm
import numpy as np

def augment_masks(input_mask_dir, output_mask_dir, num_augmented=5):
    """Write randomly augmented copies of every mask found in a directory.

    Each readable file in ``input_mask_dir`` is loaded as a single-channel
    image and passed ``num_augmented`` times through a random geometric
    pipeline (flips, rotations, shift/scale, elastic and grid distortion).
    Every result is saved to ``output_mask_dir`` as ``aug_<i>_<original name>``.

    Args:
        input_mask_dir: Directory containing the source mask images.
        output_mask_dir: Directory that receives the augmented masks; it is
            created when missing.
        num_augmented: Number of augmented variants to write per input mask.

    Files that OpenCV cannot decode are skipped silently.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_mask_dir, exist_ok=True)

    transform = A.Compose([
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.3),
        A.Rotate(limit=45, p=0.7),
        A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1, rotate_limit=30, p=0.5),
        A.ElasticTransform(alpha=1, sigma=50, p=0.3),
        A.GridDistortion(p=0.3)
    ])

    # Sorted so that the processing order does not depend on the file system.
    for mask_name in tqdm(sorted(os.listdir(input_mask_dir))):
        mask_path = os.path.join(input_mask_dir, mask_name)
        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)

        if mask is None:
            # Not an image (or unreadable): nothing to augment.
            continue

        for i in range(num_augmented):
            # Route the data through the `mask` target as well: Albumentations
            # resamples mask targets with nearest-neighbour interpolation, so
            # label values are not blended along the warped edges the way they
            # are for the `image` target.
            augmented = transform(image=mask, mask=mask)['mask']

            save_mask_path = os.path.join(output_mask_dir, f"aug_{i}_{mask_name}")
            cv2.imwrite(save_mask_path, augmented)

if __name__ == "__main__":
    # Source masks and the folder that receives their augmented copies.
    input_mask_folder = "C:\\COLON_CANCER DATASET\\PolypDB\\PolypDB_modality_wise\\COMBINED_PICS_MASK"
    output_mask_folder = "C:\\COLON_CANCER DATASET\\PolypDB\\PolypDB_modality_wise\\COMBINED_PICS_AUG_MASK"

    # Fifteen augmented variants are written per input mask.
    augment_masks(
        input_mask_dir=input_mask_folder,
        output_mask_dir=output_mask_folder,
        num_augmented=15,
    )


In [None]:
import os
import random
import shutil

def split_dataset(input_dir, output_base_dir, train_ratio=0.7, val_ratio=0.1, test_ratio=0.2, seed=None):
    """Randomly copy the files of a directory into train/val/test sub-folders.

    Args:
        input_dir: Directory whose regular files are to be split.
        output_base_dir: Directory that receives the ``train``, ``val`` and
            ``test`` sub-folders (created when missing).
        train_ratio: Fraction of files copied to ``train``.
        val_ratio: Fraction of files copied to ``val``.
        test_ratio: Fraction of files copied to ``test`` (it receives whatever
            remains after the first two cuts).
        seed: Optional seed for a reproducible shuffle. With ``None`` the
            module-level ``random`` generator is used, as before.

    Raises:
        AssertionError: If the three ratios do not sum to 1.

    Note:
        The split is made per file. When ``input_dir`` holds several augmented
        variants of one source image, variants of the same source can land in
        different splits.
    """
    import math

    # Compare with a tolerance: an exact float comparison rejects valid
    # inputs such as 0.6 + 0.3 + 0.1, which evaluates to 0.9999999999999999.
    assert math.isclose(train_ratio + val_ratio + test_ratio, 1.0, abs_tol=1e-9), \
        "train_ratio + val_ratio + test_ratio must sum to 1"

    train_dir = os.path.join(output_base_dir, "train")
    val_dir = os.path.join(output_base_dir, "val")
    test_dir = os.path.join(output_base_dir, "test")

    for folder in (train_dir, val_dir, test_dir):
        os.makedirs(folder, exist_ok=True)

    # Only regular files can be copied with shutil.copy; skip sub-directories.
    # Sorting first makes a seeded shuffle independent of listing order.
    image_files = sorted(
        name for name in os.listdir(input_dir)
        if os.path.isfile(os.path.join(input_dir, name))
    )
    if seed is None:
        random.shuffle(image_files)
    else:
        random.Random(seed).shuffle(image_files)

    train_split = int(len(image_files) * train_ratio)
    val_split = int(len(image_files) * (train_ratio + val_ratio))

    train_files = image_files[:train_split]
    val_files = image_files[train_split:val_split]
    test_files = image_files[val_split:]

    for destination, members in ((train_dir, train_files), (val_dir, val_files), (test_dir, test_files)):
        for file in members:
            shutil.copy(os.path.join(input_dir, file), os.path.join(destination, file))

    print(f"Dataset split complete: {len(train_files)} train, {len(val_files)} val, {len(test_files)} test")

if __name__ == "__main__":
    # Augmented masks are the input; the split folders are created under AUG_PICS_MASK.
    input_folder = "C:\\COLON_CANCER DATASET\\PolypDB\\PolypDB_modality_wise\\COMBINED_PICS_AUG_MASK"
    dataset_split_folder = "C:\\COLON_CANCER DATASET\\PolypDB\\PolypDB_modality_wise\\AUG_PICS_MASK"

    # Default ratios of split_dataset apply (70 % train, 10 % val, 20 % test).
    split_dataset(input_dir=input_folder, output_base_dir=dataset_split_folder)


In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def process_tumor_mask(mask_path):
    """Extract shape descriptors of the largest region in a mask image.

    The image is read as grayscale, binarised at 127, and the external
    contour with the largest area is measured.

    Args:
        mask_path: Path of the mask image.

    Returns:
        A tuple ``(file name, bounding_box, features)`` where ``bounding_box``
        is ``(x, y, w, h)`` and ``features`` is a NumPy array
        ``[area, perimeter, aspect_ratio, solidity, extent]``. When the image
        cannot be read or contains no contour, ``bounding_box`` is ``None``
        and ``features`` is all zeros.
    """
    image_name = os.path.basename(mask_path)

    mask = cv2.imread(mask_path, 0)
    if mask is None:
        # cv2.imread returns None for unreadable files; report it like an
        # empty mask instead of crashing in cv2.threshold.
        return image_name, None, np.zeros(5)

    _, binary_mask = cv2.threshold(mask, 127, 255, cv2.THRESH_BINARY)

    contours, _ = cv2.findContours(binary_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if not contours:
        return image_name, None, np.zeros(5)

    largest_contour = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest_contour)
    bounding_box = (x, y, w, h)

    area = cv2.contourArea(largest_contour)
    perimeter = cv2.arcLength(largest_contour, True)
    aspect_ratio = float(w) / h if h > 0 else 0
    convex_hull_area = cv2.contourArea(cv2.convexHull(largest_contour))
    # Solidity: contour area relative to the area of its convex hull.
    solidity = area / convex_hull_area if convex_hull_area > 0 else 0
    # Extent: contour area relative to the area of its bounding rectangle.
    box_area = w * h
    extent = area / box_area if box_area > 0 else 0
    features = np.array([area, perimeter, aspect_ratio, solidity, extent])

    return image_name, bounding_box, features

def process_directory(mask_dir, output_excel):
    """Compute shape features for every mask in a directory and save them to Excel.

    Args:
        mask_dir: Directory containing mask images (``.png``, ``.jpg``,
            ``.jpeg``; the extension is matched case-insensitively).
        output_excel: Path of the Excel workbook to write.

    The workbook holds one row per image with the columns ``Image Name``,
    ``Bounding Box``, ``Area``, ``Perimeter``, ``Aspect Ratio``, ``Solidity``
    and ``Extent``.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')

    results = []
    # Sorted so that the row order of the workbook is reproducible.
    for filename in sorted(os.listdir(mask_dir)):
        # Lower-case the name so that files such as "MASK.JPG" are not skipped.
        if not filename.lower().endswith(image_extensions):
            continue

        mask_path = os.path.join(mask_dir, filename)
        image_name, bounding_box, features = process_tumor_mask(mask_path)
        results.append([image_name, bounding_box, *features])

    df = pd.DataFrame(results, columns=['Image Name', 'Bounding Box', 'Area', 'Perimeter', 'Aspect Ratio', 'Solidity', 'Extent'])
    df.to_excel(output_excel, index=False)
    print(f"Results saved to {output_excel}")


# Measure every augmented mask; the workbook is written relative to the
# current working directory.
mask_directory = "C:\\COLON_CANCER DATASET\\PolypDB\\PolypDB_modality_wise\\COMBINED_PICS_AUG_MASK"
output_file = "tumor_features.xlsx"

process_directory(mask_dir=mask_directory, output_excel=output_file)


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation heatmap of the numeric shape features; the two descriptive
# columns are dropped before computing correlations.
df = pd.read_excel("tumor_features.xlsx").drop(columns=["Image Name", "Bounding Box"])

feature_correlations = df.corr()

plt.figure(figsize=(8,6))
sns.heatmap(feature_correlations, annot=True, cmap="coolwarm")
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# Histogram of the 'Area' column of the extracted feature table.
file_path = "tumor_features.xlsx"
df = pd.read_excel(file_path)

hist_style = dict(bins=20, color='skyblue', edgecolor='black', alpha=0.7)

plt.figure(figsize=(8, 5))
plt.hist(df['Area'], **hist_style)
plt.title('Distribution of Tumor Area')
plt.xlabel('Tumor Area')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# Histogram of the 'Perimeter' column of the extracted feature table.
file_path = "tumor_features.xlsx"
df = pd.read_excel(file_path)

hist_style = dict(bins=20, color='skyblue', edgecolor='black', alpha=0.7)

plt.figure(figsize=(8, 5))
plt.hist(df['Perimeter'], **hist_style)
plt.title('Distribution of Tumor perimeter')
plt.xlabel('Tumor perimeter')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# Histogram of the 'Solidity' column of the extracted feature table.
file_path = "tumor_features.xlsx"
df = pd.read_excel(file_path)

hist_style = dict(bins=20, color='skyblue', edgecolor='black', alpha=0.7)

plt.figure(figsize=(8, 5))
plt.hist(df['Solidity'], **hist_style)
plt.title('Distribution of Tumor Solidity')
plt.xlabel('Tumor Solidity')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# Histogram of the 'Aspect Ratio' column of the extracted feature table.
file_path = "tumor_features.xlsx"
df = pd.read_excel(file_path)

hist_style = dict(bins=20, color='skyblue', edgecolor='black', alpha=0.7)

plt.figure(figsize=(8, 5))
plt.hist(df['Aspect Ratio'], **hist_style)
plt.title('Distribution of Tumor Aspectratio')
plt.xlabel('Tumor Aspectratio')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# Histogram of the 'Extent' column of the extracted feature table.
file_path = "tumor_features.xlsx"
df = pd.read_excel(file_path)

hist_style = dict(bins=20, color='skyblue', edgecolor='black', alpha=0.7)

plt.figure(figsize=(8, 5))
plt.hist(df['Extent'], **hist_style)
plt.title('Distribution of Tumor extent')
plt.xlabel('Tumor extent')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture

def compute_thresholds(df):
    """Derive one cut-off value per shape feature from a two-component mixture.

    For each of the five feature columns a ``GaussianMixture`` with two
    components (``random_state=42``) is fitted to that single column, and the
    threshold is the mean of the two fitted component means.

    Args:
        df: DataFrame with the columns 'Area', 'Perimeter', 'Solidity',
            'Aspect Ratio' and 'Extent'.

    Returns:
        dict mapping each feature name to its threshold.
    """
    def _midpoint_of_components(column):
        # Fit on a one-column frame so the estimator sees shape (n_samples, 1).
        mixture = GaussianMixture(n_components=2, random_state=42)
        mixture.fit(df[[column]])
        return np.mean(mixture.means_.flatten())

    feature_names = ['Area', 'Perimeter','Solidity','Aspect Ratio','Extent']
    return {name: _midpoint_of_components(name) for name in feature_names}


# Thresholds are derived from the full feature table; the labelling cell
# reads the resulting `thresholds` dict.
df = pd.read_excel("tumor_features.xlsx")
thresholds = compute_thresholds(df)

print("Computed Thresholds:", thresholds)


In [None]:
import pandas as pd


# Label a row 1 when at least one feature exceeds its threshold, else 0.
# `thresholds` must already exist (it is produced by compute_thresholds).
df = pd.read_excel("tumor_features.xlsx").drop(columns=['Bounding Box'], errors='ignore')

label_features = ['Area', 'Perimeter', 'Extent', 'Aspect Ratio', 'Solidity']

exceeds_any = pd.Series(False, index=df.index)
for feature in label_features:
    exceeds_any = exceeds_any | (df[feature] > thresholds[feature])

df['Label'] = exceeds_any.astype(int)

df.to_excel("tumor_features_with_labels.xlsx", index=False)

print("Updated dataset saved with 'Label' column.")

In [None]:
import pandas as pd

# Class balance of the labelled table: rows with Label == 0 versus all others.
df = pd.read_excel("tumor_features_with_labels.xlsx")

is_zero = df['Label'] == 0
count_0 = int(is_zero.sum())
count_1 = int((~is_zero).sum())

print(count_0)
print(count_1)

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import pandas as pd
import numpy as np


# Rebalance the labelled feature table: SMOTE oversampling of the minority
# class followed by random undersampling of the majority class.
df = pd.read_excel("tumor_features_with_labels.xlsx")

filenames = df["Image Name"]

X = df.select_dtypes(include=[np.number]).drop(columns=["Label"])
y = df["Label"]

oversample = SMOTE(sampling_strategy=0.5, random_state=42)
undersample = RandomUnderSampler(sampling_strategy=0.75, random_state=42)

# The two samplers are applied one after the other (what the pipeline did),
# but separately, so that each output row can be traced back to its source.
X_over, y_over = oversample.fit_resample(X, y)

# SMOTE is expected to return the original rows first and append the
# synthetic ones; verify that before relying on it for the file names.
n_original = len(X)
assert np.allclose(np.asarray(X_over)[:n_original], X.to_numpy()), \
    "SMOTE output does not start with the original rows"

# Synthetic rows correspond to no image file, so they get a placeholder name
# that cannot match a real mask when rows are looked up by file name.
n_synthetic = len(X_over) - n_original
names_over = np.concatenate([
    filenames.to_numpy(dtype=object),
    np.array([f"synthetic_{i}" for i in range(n_synthetic)], dtype=object),
])

X_resampled, y_resampled = undersample.fit_resample(X_over, y_over)

# sample_indices_ holds, for every kept row, its position in the sampler's
# input. Using it keeps each file name attached to its own features; drawing
# names at random (as before) paired rows with unrelated images.
kept_positions = undersample.sample_indices_

df_resampled = pd.DataFrame(np.asarray(X_resampled), columns=X.columns)
df_resampled["Label"] = np.asarray(y_resampled)
df_resampled["filename"] = names_over[kept_positions]

df_resampled.to_excel("tumor_features_resampled.xlsx", index=False)

print("Resampled dataset saved with balanced 'Label' column and filenames.")


In [None]:
import pandas as pd

# Class balance after resampling: rows with Label == 0 versus all others.
df = pd.read_excel("tumor_features_resampled.xlsx")

is_zero = df['Label'] == 0
count_0 = int(is_zero.sum())
count_1 = int((~is_zero).sum())

print(count_0)
print(count_1)

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the three features with the highest ANOVA F-score.
# Uses the `X` and `y` left in the kernel by an earlier cell.
selector = SelectKBest(score_func=f_classif, k=3)
X_selected = selector.fit_transform(X, y)

kept_mask = selector.get_support()
selected_features = X.columns[kept_mask]

print("Selected Features:", list(selected_features))

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
import pandas as pd
import numpy as np


# Print the ANOVA F-score of every numeric feature against the label.
df = pd.read_excel("tumor_features_with_labels.xlsx")
filenames = df["Image Name"]

numeric_columns = df.select_dtypes(include=[np.number])
X = numeric_columns.drop(columns=["Label"])
y = df["Label"]

selector = SelectKBest(score_func=f_classif, k=3)
selector.fit(X, y)

scores_by_feature = zip(X.columns, selector.scores_)
for feature, score in scores_by_feature:
    print(f"{feature}: {score:.4f}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd


# 3D scatter of the three features chosen by SelectKBest, coloured by label.
# Column names and axis labels come from `selected_features`, so the plot
# is labelled with whatever the selector actually kept instead of a fixed
# area/solidity/extent guess.
axis_features = [str(name) for name in selected_features]
assert len(axis_features) == 3, "a 3D scatter needs exactly three selected features"

df_selected = pd.DataFrame(X_selected, columns=axis_features)
# Assign by position: X_selected is a plain array, so index alignment with
# `y` must not be allowed to reorder or drop labels.
df_selected['Label'] = pd.Series(y).to_numpy()

x_name, y_name, z_name = axis_features

fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

scatter = ax.scatter(df_selected[x_name], df_selected[y_name], df_selected[z_name],
                     c=df_selected["Label"], cmap='viridis', edgecolor='k', alpha=0.8)

ax.set_xlabel(x_name)
ax.set_ylabel(y_name)
ax.set_zlabel(z_name)
ax.set_title("3D Scatter Plot of Selected Features")

legend = fig.colorbar(scatter, ax=ax, pad=0.1)
legend.set_label("Class Labels")

plt.show()


In [None]:
import os
import pandas as pd
import numpy as np
import cv2
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt




# Build the training arrays: for every .jpg in the train folder, look up its
# Area / Solidity / Extent and Label in the resampled table by file name.
mask_folder = "C:\\COLON_CANCER DATASET\\PolypDB\\PolypDB_modality_wise\\AUG_PICS_MASK\\train"
excel_file = "tumor_features_resampled.xlsx"

df = pd.read_excel(excel_file)

features = []
labels = []

for filename in sorted(os.listdir(mask_folder)):
    if not filename.endswith(".jpg"):
        continue

    mask_path = os.path.join(mask_folder, filename)

    row = df[df["filename"] == filename]
    if row.empty:
        # No table entry for this file: it contributes nothing.
        continue

    # When several rows share the file name, only the first one is used.
    features.append([row[column].iloc[0] for column in ("Area", "Solidity", "Extent")])
    labels.append(row["Label"].iloc[0])

X_train = np.array(features)
y_train = np.array(labels)


In [None]:
# Report the training array shapes and persist both arrays as .npy files.
for array in (X_train, y_train):
    print(array.shape)

np.save("X_Train.npy", X_train)
np.save("y_train.npy", y_train)

In [None]:
import os
import pandas as pd
import numpy as np
import cv2
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt




# Build the test arrays: for every .jpg in the test folder, look up its
# Area / Solidity / Extent and Label in the resampled table by file name.
mask_folder = "C:\\COLON_CANCER DATASET\\PolypDB\\PolypDB_modality_wise\\AUG_PICS_MASK\\test"
excel_file = "tumor_features_resampled.xlsx"

df = pd.read_excel(excel_file)

features = []
labels = []

for filename in sorted(os.listdir(mask_folder)):
    if not filename.endswith(".jpg"):
        continue

    mask_path = os.path.join(mask_folder, filename)

    row = df[df["filename"] == filename]
    if row.empty:
        # No table entry for this file: it contributes nothing.
        continue

    # When several rows share the file name, only the first one is used.
    features.append([row[column].iloc[0] for column in ("Area", "Solidity", "Extent")])
    labels.append(row["Label"].iloc[0])

X_test = np.array(features)
y_test = np.array(labels)

for array in (X_test, y_test):
    print(array.shape)

np.save("X_test.npy", X_test)
np.save("y_test.npy", y_test)


In [None]:
import os
import pandas as pd
import numpy as np
import cv2
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt




# Build the validation arrays: for every .jpg in the val folder, look up its
# Area / Solidity / Extent and Label in the resampled table by file name.
mask_folder = "C:\\COLON_CANCER DATASET\\PolypDB\\PolypDB_modality_wise\\AUG_PICS_MASK\\val"
excel_file = "tumor_features_resampled.xlsx"

df = pd.read_excel(excel_file)

features = []
labels = []

for filename in sorted(os.listdir(mask_folder)):
    if not filename.endswith(".jpg"):
        continue

    mask_path = os.path.join(mask_folder, filename)

    row = df[df["filename"] == filename]
    if row.empty:
        # No table entry for this file: it contributes nothing.
        continue

    # When several rows share the file name, only the first one is used.
    features.append([row[column].iloc[0] for column in ("Area", "Solidity", "Extent")])
    labels.append(row["Label"].iloc[0])

X_val = np.array(features)
y_val = np.array(labels)

for array in (X_val, y_val):
    print(array.shape)

np.save("X_val.npy", X_val)
np.save("y_val.npy", y_val)


In [None]:
from sklearn.svm import SVC

# RBF-kernel SVM trained on the three selected features.
# NOTE(review): the features are passed unscaled although StandardScaler is
# imported elsewhere in the notebook — confirm whether scaling was intended.
svm_model = SVC(kernel='rbf', C=1, gamma=0.01)
svm_model.fit(X_train, y_train)

# Mean accuracy on each split, in the order train, test, validation.
train_acc, test_acc, val_acc = (
    svm_model.score(split_X, split_y)
    for split_X, split_y in ((X_train, y_train), (X_test, y_test), (X_val, y_val))
)

print(f"Train Accuracy: {train_acc:.2f}")
print(f"Test Accuracy: {test_acc:.2f}")
print(f"Validation Accuracy: {val_acc:.2f}")



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest (50 trees, depth 5) on the three selected features.
rf_settings = dict(n_estimators=50, random_state=42, max_depth=5)
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train, y_train)

rf_train_acc = rf_model.score(X_train, y_train)
print(f"Train Accuracy: {rf_train_acc:.2f}")
rf_test_acc = rf_model.score(X_test, y_test)
print(f"Test Accuracy: {rf_test_acc:.2f}")
rf_val_acc = rf_model.score(X_val, y_val)
print(f"Validation Accuracy: {rf_val_acc:.2f}")


In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees (100 rounds, learning rate 0.1) on the selected features.
xgb_settings = dict(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model = XGBClassifier(**xgb_settings)
xgb_model.fit(X_train, y_train)

xgb_train_acc = xgb_model.score(X_train, y_train)
print(f"Train Accuracy: {xgb_train_acc:.2f}")
xgb_test_acc = xgb_model.score(X_test, y_test)
print(f"Test Accuracy: {xgb_test_acc:.2f}")
xgb_val_acc = xgb_model.score(X_val, y_val)
print(f"Validation Accuracy: {xgb_val_acc:.2f}")




In [None]:
import pandas as pd


# Reload the resampled table and separate the feature columns from the target.
df_resampled = pd.read_excel("tumor_features_resampled.xlsx")

non_feature_columns = ['Label','filename']
X_resampled = df_resampled.drop(columns=non_feature_columns)
y_resampled = df_resampled['Label']

print("Loaded X_resampled and y_resampled from the Excel file.")
print("X_resampled shape:", X_resampled.shape)
print("y_resampled shape:", y_resampled.shape)


In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of the SVM configuration on the resampled table.
scores = cross_val_score(
    estimator=svm_model,
    X=X_resampled,
    y=y_resampled,
    scoring='accuracy',
    cv=10,
)
mean_accuracy = scores.mean()
print("Cross-validation Accuracy:", mean_accuracy)


In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of the random forest configuration on the resampled table.
scores = cross_val_score(
    estimator=rf_model,
    X=X_resampled,
    y=y_resampled,
    scoring='accuracy',
    cv=10,
)
mean_accuracy = scores.mean()
print("Cross-validation Accuracy:", mean_accuracy)


In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of the XGBoost configuration on the resampled table.
scores = cross_val_score(
    estimator=xgb_model,
    X=X_resampled,
    y=y_resampled,
    scoring='accuracy',
    cv=10,
)
mean_accuracy = scores.mean()
print("Cross-validation Accuracy:", mean_accuracy)


In [None]:
from sklearn.svm import SVC
import numpy as np

# Re-create and fit the SVM so this cell does not depend on an earlier fit.
svm_settings = dict(kernel='rbf', C=1, gamma=0.01)
svm_model = SVC(**svm_settings)
svm_model.fit(X_train, y_train)



def compute_iou(pred_mask, true_mask):
    """Intersection-over-union of two arrays interpreted as boolean masks.

    Non-zero entries count as foreground. Returns 1.0 when neither input has
    any foreground entry (empty union).
    """
    overlap = np.logical_and(pred_mask, true_mask).sum()
    combined = np.logical_or(pred_mask, true_mask).sum()
    if combined == 0:
        return 1.0
    return overlap / combined

def compute_dice(pred_mask, true_mask):
    """Dice coefficient of two arrays interpreted as boolean masks.

    Both inputs are converted to boolean first (non-zero means foreground),
    so the denominator counts foreground entries rather than summing raw
    values. This keeps the result correct for masks encoded as 0/255 and
    also accepts plain Python sequences.

    Returns 1.0 when neither input has any foreground entry.
    """
    pred = np.asarray(pred_mask).astype(bool)
    true = np.asarray(true_mask).astype(bool)

    intersection = np.logical_and(pred, true).sum()
    total = pred.sum() + true.sum()
    if total == 0:
        return 1.0
    return (2. * intersection) / total

# Overlap scores between the SVM's predicted test labels and the true labels.
# NOTE(review): IoU/Dice are applied to per-sample class labels here, not to
# pixel masks — confirm this is the intended metric.
y_pred_svm = svm_model.predict(X_test)

iou = compute_iou(pred_mask=y_pred_svm, true_mask=y_test)
dice = compute_dice(pred_mask=y_pred_svm, true_mask=y_test)

print(f"IoU: {iou:.4f}")
print(f"Dice Coefficient: {dice:.4f}")




In [None]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Re-create and fit the random forest so this cell does not depend on an earlier fit.
rf_settings = dict(n_estimators=50, random_state=42, max_depth=5)
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train, y_train)

def compute_iou(pred_mask, true_mask):
    """Intersection-over-union of two arrays interpreted as boolean masks.

    Non-zero entries count as foreground. Returns 1.0 when neither input has
    any foreground entry (empty union).
    """
    overlap = np.logical_and(pred_mask, true_mask).sum()
    combined = np.logical_or(pred_mask, true_mask).sum()
    if combined == 0:
        return 1.0
    return overlap / combined

def compute_dice(pred_mask, true_mask):
    """Dice coefficient of two arrays interpreted as boolean masks.

    Both inputs are converted to boolean first (non-zero means foreground),
    so the denominator counts foreground entries rather than summing raw
    values. This keeps the result correct for masks encoded as 0/255 and
    also accepts plain Python sequences.

    Returns 1.0 when neither input has any foreground entry.
    """
    pred = np.asarray(pred_mask).astype(bool)
    true = np.asarray(true_mask).astype(bool)

    intersection = np.logical_and(pred, true).sum()
    total = pred.sum() + true.sum()
    if total == 0:
        return 1.0
    return (2. * intersection) / total

# Overlap scores between the random forest's predicted test labels and the true labels.
# NOTE(review): IoU/Dice are applied to per-sample class labels here, not to
# pixel masks — confirm this is the intended metric.
y_pred_rf = rf_model.predict(X_test)

iou = compute_iou(pred_mask=y_pred_rf, true_mask=y_test)
dice = compute_dice(pred_mask=y_pred_rf, true_mask=y_test)

print(f"IoU: {iou:.4f}")
print(f"Dice Coefficient: {dice:.4f}")



In [None]:
from xgboost import XGBClassifier
import numpy as np

# Re-create and fit the XGBoost model so this cell does not depend on an earlier fit.
xgb_settings = dict(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model = XGBClassifier(**xgb_settings)
xgb_model.fit(X_train, y_train)

def compute_iou(pred_mask, true_mask):
    """Intersection-over-union of two arrays interpreted as boolean masks.

    Non-zero entries count as foreground. Returns 1.0 when neither input has
    any foreground entry (empty union).
    """
    overlap = np.logical_and(pred_mask, true_mask).sum()
    combined = np.logical_or(pred_mask, true_mask).sum()
    if combined == 0:
        return 1.0
    return overlap / combined

def compute_dice(pred_mask, true_mask):
    """Dice coefficient of two arrays interpreted as boolean masks.

    Both inputs are converted to boolean first (non-zero means foreground),
    so the denominator counts foreground entries rather than summing raw
    values. This keeps the result correct for masks encoded as 0/255 and
    also accepts plain Python sequences.

    Returns 1.0 when neither input has any foreground entry.
    """
    pred = np.asarray(pred_mask).astype(bool)
    true = np.asarray(true_mask).astype(bool)

    intersection = np.logical_and(pred, true).sum()
    total = pred.sum() + true.sum()
    if total == 0:
        return 1.0
    return (2. * intersection) / total

# Overlap scores between XGBoost's predicted test labels and the true labels.
# NOTE(review): IoU/Dice are applied to per-sample class labels here, not to
# pixel masks — confirm this is the intended metric.
y_pred_xgb = xgb_model.predict(X_test)

iou = compute_iou(pred_mask=y_pred_xgb, true_mask=y_test)
dice = compute_dice(pred_mask=y_pred_xgb, true_mask=y_test)

print(f"IoU: {iou:.4f}")
print(f"Dice Coefficient: {dice:.4f}")


In [None]:
from sklearn.metrics import roc_auc_score, log_loss, roc_curve
import matplotlib.pyplot as plt


# ROC AUC and log loss of the three classifiers on the test split.
# NOTE(review): both metrics receive the hard class predictions produced by
# .predict(); they are normally computed from scores or probabilities
# (decision_function / predict_proba). With 0/1 inputs the log loss only
# reflects clipped extremes — confirm whether this is intended.
test_predictions = (y_pred_svm, y_pred_rf, y_pred_xgb)

auc_svm, auc_rf, auc_xgb = (
    roc_auc_score(y_test, predicted) for predicted in test_predictions
)

logloss_svm, logloss_rf, logloss_xgb = (
    log_loss(y_test, predicted) for predicted in test_predictions
)

print(f"SVM - ROC AUC: {auc_svm:.4f}, Log Loss: {logloss_svm:.4f}")
print(f"Random Forest - ROC AUC: {auc_rf:.4f}, Log Loss: {logloss_rf:.4f}")
print(f"XGBoost - ROC AUC: {auc_xgb:.4f}, Log Loss: {logloss_xgb:.4f}")


In [None]:
# ROC curves of the three classifiers on the test split.
# NOTE(review): the curves are built from hard class predictions, so each
# has a single operating point between the corners — confirm whether
# scores/probabilities were intended.
fpr_svm, tpr_svm, _ = roc_curve(y_test, y_pred_svm)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_xgb)

curves = (
    (fpr_svm, tpr_svm, f'SVM (AUC = {auc_svm:.2f})'),
    (fpr_rf, tpr_rf, f'Random Forest (AUC = {auc_rf:.2f})'),
    (fpr_xgb, tpr_xgb, f'XGBoost (AUC = {auc_xgb:.2f})'),
)

plt.figure(figsize=(8, 6))
for false_positive_rate, true_positive_rate, legend_text in curves:
    plt.plot(false_positive_rate, true_positive_rate, label=legend_text)

# Diagonal reference line of a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Chance')

plt.title('ROC Curves for Multimodal Classifiers')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.grid()
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np


# Grouped bar chart comparing ROC AUC (left axis) with log loss (right axis).
# NOTE(review): the values below are typed in by hand, presumably copied from
# the printed metrics of an earlier run — they do not update when the models
# are retrained.
models = ['SVM', 'Random Forest', 'XGBoost']
roc_auc = [0.8006, 0.9919, 0.9959]
log_losses = [7.66, 0.2375, 0.1188]

x = np.arange(len(models))
bar_width = 0.35

fig, ax1 = plt.subplots(figsize=(8, 6))

# Left axis: ROC AUC bars, shifted half a bar width to the left of each tick.
bars1 = ax1.bar(x - bar_width/2, roc_auc, bar_width, label='ROC AUC', color='#1f77b4')
ax1.set_ylabel('ROC AUC Score', color='#1f77b4')
ax1.tick_params(axis='y', labelcolor='#1f77b4')

# Right axis (shared x): log-loss bars, shifted half a bar width to the right.
ax2 = ax1.twinx()
bars2 = ax2.bar(x + bar_width/2, log_losses, bar_width, label='Log Loss', color='#ff7f0e')
ax2.set_ylabel('Log Loss', color='#ff7f0e')
ax2.tick_params(axis='y', labelcolor='#ff7f0e')

ax1.set_xticks(x)
ax1.set_xticklabels(models)
ax1.set_title('ROC AUC vs Log Loss for Mask Classifiers')

# Pass one handle per series. Concatenating the two bar containers yields
# six rectangles, and the legend would pair both labels with the first two
# (blue) ROC AUC bars, showing "Log Loss" in the wrong colour.
ax1.legend([bars1, bars2], ['ROC AUC', 'Log Loss'], loc='upper center')

plt.tight_layout()
plt.show()
