In [112]:
import os
import cv2
import numpy as np
import keras
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score, log_loss, cohen_kappa_score, roc_auc_score
from keras.models import Model
from keras.layers import Dense, Dropout, GlobalAveragePooling2D
from keras.preprocessing.image import ImageDataGenerator
from keras.applications import MobileNetV2
from keras.optimizers import Adam
from keras import Input
from sklearn.model_selection import train_test_split

In [113]:
resume_folder_for_training = '/content/drive/MyDrive/datasets used for part 10/resumes'
non_resume_folder_for_training = '/content/drive/MyDrive/datasets used for part 10/not_resumes'
resume_folder_for_testing ='/content/drive/MyDrive/datasets used for part 10/resumes for testing'
non_resume_folder_for_testing = '/content/drive/MyDrive/datasets used for part 10/not_resumes for testing'

In [114]:
# Load and Explore the Data
def load_and_preprocess_data(data_dir, label, augmentation):
    data = []
    labels = []
    for filename in os.listdir(data_dir):
        img_path = os.path.join(data_dir, filename)

        # Check if file exists and is a valid image
        if os.path.isfile(img_path):
            img = cv2.imread(img_path)

            # Check if image is successfully loaded
            if img is not None:
                img = cv2.resize(img, (128, 128))
                img = img / 255.0
                data.append(img)
                labels.append(label)

    data = np.array(data)
    labels = np.array(labels)

    if augmentation:
        datagen = ImageDataGenerator(
            rotation_range=20,
            width_shift_range=0.2,
            height_shift_range=0.2,
            zoom_range=0.1,
            shear_range=0.2,
            horizontal_flip=True
        )
        datagen.fit(data)
        augmented_data = datagen.flow(data, labels, batch_size=len(data), shuffle=False).next()
        data, labels = augmented_data[0], labels

    return data, labels

In [115]:
# Load and preprocess data from the two folders
resume_data_aug, resume_labels_aug = load_and_preprocess_data(resume_folder_for_training , 1, True)
non_resume_data_aug, non_resume_labels_aug = load_and_preprocess_data(non_resume_folder_for_training , 0, True)

resume_data, resume_labels = load_and_preprocess_data(resume_folder_for_training , 1, False)
non_resume_data, non_resume_labels = load_and_preprocess_data(non_resume_folder_for_training , 0, False)

resume_data_for_testing,  resume_data_for_testing_labels = load_and_preprocess_data(resume_folder_for_testing , 1, False)
non_resume_data_for_testing,  non_resume_data_for_testing_labels = load_and_preprocess_data(non_resume_folder_for_testing , 0, False)

In [116]:
# Combine data and labels
X_train_aug = np.concatenate((resume_data_aug, non_resume_data_aug), axis=0)
y_train_aug = np.concatenate((resume_labels_aug, non_resume_labels_aug), axis=0)

X_train = np.concatenate((resume_data, non_resume_data), axis=0)
y_train = np.concatenate((resume_labels, non_resume_labels), axis=0)

X_test = np.concatenate((resume_data_for_testing, non_resume_data_for_testing), axis=0)
y_test = np.concatenate((resume_labels, non_resume_labels), axis=0)

In [117]:
# Shuffle the data
augmented_data = list(zip(X_train_aug, y_train_aug))
np.random.shuffle(augmented_data)
X_train_aug, y_train_aug = zip(*augmented_data)
X_train_aug = np.array(X_train_aug)
y_train_aug = np.array(y_train_aug)

In [118]:
import pandas as pd

# Assuming 'y_train' is your training labels
class_distribution = pd.Series(y_train.flatten()).value_counts()
print("Class Distribution:")
print(class_distribution)

Class Distribution:
1    90
0    90
dtype: int64


In [119]:
# Split the data into training and testing sets
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_train_aug, y_train_aug, test_size=0.15, random_state=42 , stratify=y_train_aug
)

In [120]:
# Define ImageDataGenerator for training data
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zoom_range=0.1,
    shear_range=0.2,
    horizontal_flip=True
)

# Define ImageDataGenerator for testing data
test_datagen = ImageDataGenerator(rescale=1./255)

batch_size = 32

# Flow training images in batches of 32 using train_datagen generator
train_generator = train_datagen.flow(
    X_train_split,
    y_train_split,
    batch_size=batch_size
)

# Flow testing images in batches of 32 using test_datagen generator
test_generator = test_datagen.flow(
    X_test_split,
    y_test_split,
    batch_size=batch_size
)

In [121]:
# pre-trained MobileNetV2 model with weights from 'imagenet'
base_model = MobileNetV2(weights='imagenet', include_top=False, input_tensor=Input(shape=(128, 128, 3)))

# Freeze the convolutional layers to retain pre-trained features
for layer in base_model.layers:
    layer.trainable = False

# Add custom classification layers
x = GlobalAveragePooling2D()(base_model.output)
x = Dense(256, activation='relu')(x)
dropout = 0.4
x = Dense(1, activation='sigmoid')(x)



In [122]:
# Create the final model
model = Model(inputs=base_model.input, outputs=x)

model.compile(optimizer=Adam(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])



In [123]:
# Train the model
epochs = 10

history = model.fit(
    train_generator,
    epochs=epochs,
    validation_data=test_generator
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(test_generator)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.6813
Test Accuracy: 62.96%


In [124]:
# Evaluate the model on the test set
y_pred = model.predict(test_generator)
y_test_binary = y_test_split.astype(int)

# Thresholds
thresholds = [0.2, 0.3, 0.4, 0.5, 0.6]

for threshold in thresholds:
    y_pred_binary = (y_pred > threshold).astype(int)

    # Calculate metrics
    py_pred_binary = (y_pred > threshold).astype(int)
    precision = precision_score(y_test_binary, y_pred_binary)
    recall = recall_score(y_test_binary, y_pred_binary)
    f1 = f1_score(y_test_binary, y_pred_binary)
    logloss = log_loss(y_test_binary, y_pred_binary)
    kappa = cohen_kappa_score(y_test_binary, y_pred_binary)

    # ROC-AUC requires binary or multilabel indicator format
    roc_auc = roc_auc_score(y_test_binary, y_pred_binary)

    print(f"Threshold: {threshold}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Log Loss: {logloss:.4f}, Cohen's Kappa: {kappa:.4f}, ROC-AUC: {roc_auc:.4f}")

# Choose an appropriate threshold, let's take 0.5
y_pred_binary = (y_pred > 0.5).astype(int)

# Calculate metrics
precision = precision_score(y_test_split, y_pred_binary)
recall = recall_score(y_test_split, y_pred_binary)
f1 = f1_score(y_test_split, y_pred_binary)
logloss = log_loss(y_test_split, y_pred_binary)
kappa = cohen_kappa_score(y_test_split, y_pred_binary)
roc_auc = roc_auc_score(y_test_split, y_pred_binary)

# Confusion matrix
conf_matrix = confusion_matrix(y_test_split, y_pred_binary)

print("\nOverall Metrics:")
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Log Loss:", logloss)
print("Cohen's Kappa:", kappa)
print("ROC-AUC:", roc_auc)
print("\nConfusion Matrix:")
print(conf_matrix)


Threshold: 0.2, Precision: 0.4815, Recall: 1.0000, F1: 0.6500, Log Loss: 18.6893, Cohen's Kappa: 0.0000, ROC-AUC: 0.5000
Threshold: 0.3, Precision: 0.4815, Recall: 1.0000, F1: 0.6500, Log Loss: 18.6893, Cohen's Kappa: 0.0000, ROC-AUC: 0.5000
Threshold: 0.4, Precision: 0.4815, Recall: 1.0000, F1: 0.6500, Log Loss: 18.6893, Cohen's Kappa: 0.0000, ROC-AUC: 0.5000
Threshold: 0.5, Precision: 0.4348, Recall: 0.7692, F1: 0.5556, Log Loss: 21.3592, Cohen's Kappa: -0.1551, ROC-AUC: 0.4203
Threshold: 0.6, Precision: 0.0000, Recall: 0.0000, F1: 0.0000, Log Loss: 17.3544, Cohen's Kappa: 0.0000, ROC-AUC: 0.5000

Overall Metrics:
Precision: 0.43478260869565216
Recall: 0.7692307692307693
F1 Score: 0.5555555555555555
Log Loss: 21.35920200836572
Cohen's Kappa: -0.15508021390374327
ROC-AUC: 0.4203296703296703

Confusion Matrix:
[[ 1 13]
 [ 3 10]]


  _warn_prf(average, modifier, msg_start, len(result))
