In [12]:
import os
import cv2
import numpy as np
import keras
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from keras.utils import to_categorical
from keras.preprocessing.image import ImageDataGenerator

In [13]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
resume_folder = '/content/drive/MyDrive/resumes'
non_resume_folder = '/content/drive/MyDrive/not_resumes'

In [15]:
# Load and Explore the Data
def load_and_preprocess_data(data_dir, label, augmentation):
    data = []
    labels = []

    for filename in os.listdir(data_dir):
        img_path = os.path.join(data_dir, filename)
        img = cv2.imread(img_path)
        img = cv2.resize(img, (128, 128))
        img = img / 255.0
        data.append(img)
        labels.append(label)

    data = np.array(data)
    labels = np.array(labels)

    if augmentation:
        datagen = ImageDataGenerator(
            rotation_range=20,
            width_shift_range=0.2,
            height_shift_range=0.2,
            zoom_range=0.2,
            shear_range=0.2,
            horizontal_flip=True,
            vertical_flip=True
        )
        datagen.fit(data)
        augmented_data = datagen.flow(data, labels, batch_size=len(data), shuffle=False).next()
        data, labels = augmented_data[0], labels

    return data, labels

In [16]:
# Load and preprocess data from the two folders
resume_data_aug, resume_labels_aug = load_and_preprocess_data(resume_folder, 1, True)
non_resume_data_aug, non_resume_labels_aug = load_and_preprocess_data(non_resume_folder, 0, True)

resume_data, resume_labels = load_and_preprocess_data(resume_folder, 1, False)
non_resume_data, non_resume_labels = load_and_preprocess_data(non_resume_folder, 0, False)

In [17]:
# Combine data and labels
X_train_aug = np.concatenate((resume_data_aug, non_resume_data_aug), axis=0)
y_train_aug = np.concatenate((resume_labels_aug, non_resume_labels_aug), axis=0)

X_train = np.concatenate((resume_data, non_resume_data), axis=0)
y_train = np.concatenate((resume_labels, non_resume_labels), axis=0)

In [18]:
# Shuffle the data
augmented_data = list(zip(X_train_aug, y_train_aug))
np.random.shuffle(augmented_data)
X_train_aug, y_train_aug = zip(*augmented_data)
X_train_aug = np.array(X_train_aug)
y_train_aug = np.array(y_train_aug)

In [19]:
# Convert labels to one-hot encoding
y_train_aug = to_categorical(y_train_aug, num_classes=2)
y_train = to_categorical(y_train, num_classes=2)

In [20]:
# Split the data into training and testing sets
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_train_aug, y_train_aug, test_size=0.3, random_state=42
)

In [21]:
# Build a basic CNN model
model = keras.Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 3)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.8))
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit the CNN model
model.fit(X_train_split, y_train_split, epochs=10, validation_data=(X_test_split, y_test_split))

# Use CNN predictions as weak learner for AdaBoost
cnn_predictions = model.predict(X_train_split)
cnn_predictions = np.argmax(cnn_predictions, axis=1)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
# Build AdaBoost model with DecisionTree as base estimator
base_estimator = DecisionTreeClassifier(max_depth=1)
adaboost_model = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=50, random_state=42)

# Train AdaBoost model
adaboost_model.fit(X_train_split.reshape(X_train_split.shape[0], -1), cnn_predictions)

# Make predictions on the test set
cnn_test_predictions = model.predict(X_test_split)
cnn_test_predictions = np.argmax(cnn_test_predictions, axis=1)
adaboost_predictions = adaboost_model.predict(X_test_split.reshape(X_test_split.shape[0], -1))

# Combine CNN and AdaBoost predictions
ensemble_predictions = np.vstack((cnn_test_predictions, adaboost_predictions)).T

# Use majority voting for final predictions
final_predictions = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=ensemble_predictions)

# Evaluate the performance of the ensemble model
ensemble_accuracy = accuracy_score(y_test_split.argmax(axis=1), final_predictions)
print(f'Ensemble Model Accuracy: {ensemble_accuracy * 100:.2f}%')




Ensemble Model Accuracy: 56.67%
