In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import tensorflow as tf

# Load your dataset
df = pd.read_csv("new_datasets/balanced_resume_dataset_realistic_noisy.csv")  # Replace with actual path

# Feature & target separation
X = df[['resume_uploaded', 'batch_resume_uploaded_pct']]
y = df['should_nudge_resume']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(2,)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    
    tf.keras.layers.Dense(1, activation='sigmoid')  # Output layer
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)

# Early stopping
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

print(classification_report(y_test, y_pred))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.6973 - loss: 0.6292 - precision: 0.5528 - recall: 0.7605 - val_accuracy: 0.6444 - val_loss: 0.5508 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9077 - loss: 0.2901 - precision: 0.8533 - recall: 0.8923 - val_accuracy: 0.6444 - val_loss: 0.5631 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 3/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9289 - loss: 0.2613 - precision: 0.9090 - recall: 0.8902 - val_accuracy: 0.6444 - val_loss: 0.5195 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 4/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9228 - loss: 0.2601 - precision: 0.9078 - recall: 0.8693 - val_accuracy: 0.8119 - val_loss: 0.3746 - val_precision: 0.9379 - val_recall: 0.

In [3]:
df['should_nudge_resume'].value_counts()

should_nudge_resume
0    6450
1    3550
Name: count, dtype: int64

In [4]:
from sklearn.metrics import confusion_matrix, accuracy_score

test_df = pd.read_csv('new_datasets/balanced_test_resume_dataset_realistic_noisy.csv')

X_test_new = test_df[['resume_uploaded', 'batch_resume_uploaded_pct']]
y_test_new = test_df['should_nudge_resume']

y_pred_new = (model.predict(X_test_new) > 0.5).astype(int)

# Print metrics
print("\nMetrics on Test Dataset:")
print("\nClassification Report:")
print(classification_report(y_test_new, y_pred_new))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test_new, y_pred_new))

print("\nAccuracy Score:", accuracy_score(y_test_new, y_pred_new))


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Metrics on Test Dataset:

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1281
           1       0.96      0.92      0.94       719

    accuracy                           0.96      2000
   macro avg       0.96      0.95      0.95      2000
weighted avg       0.96      0.96      0.96      2000


Confusion Matrix:
[[1255   26]
 [  57  662]]

Accuracy Score: 0.9585


In [9]:
model.save('models/model_resume.h5')



In [6]:
# Create and train Random Forest model
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)

# Train the model using the same training data
rf_model.fit(X_train, y_train)

# Make predictions on both test sets
rf_pred = rf_model.predict(X_test)
rf_pred_new = rf_model.predict(X_test_new)

print("\nRandom Forest Results:")
print("\nOriginal Test Set Metrics:")
print("\nClassification Report:")
print(classification_report(y_test, rf_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, rf_pred))
print("\nAccuracy Score:", accuracy_score(y_test, rf_pred))

print("\nNew Test Set Metrics:")
print("\nClassification Report:")
print(classification_report(y_test_new, rf_pred_new))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_new, rf_pred_new))
print("\nAccuracy Score:", accuracy_score(y_test_new, rf_pred_new))



Random Forest Results:

Original Test Set Metrics:

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      1290
           1       0.95      0.92      0.93       710

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000


Confusion Matrix:
[[1252   38]
 [  55  655]]

Accuracy Score: 0.9535

New Test Set Metrics:

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1281
           1       0.96      0.92      0.94       719

    accuracy                           0.96      2000
   macro avg       0.96      0.95      0.95      2000
weighted avg       0.96      0.96      0.96      2000


Confusion Matrix:
[[1255   26]
 [  57  662]]

Accuracy Score: 0.9585


In [8]:
from joblib import dump
dump(rf_model, 'models/random_forest_resume.joblib')

['models/random_forest_resume.joblib']