In [22]:
# best so far
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC  # Support Vector Machine
from sklearn.ensemble import RandomForestClassifier  # Random Forest
from sklearn.metrics import accuracy_score  # For model evaluation

# Load the labeled CSV
df = pd.read_csv('/content/selected_features_tfidf_matrix.csv')

# Features (X) and labels (y)
X = df.drop(columns=['label']).values
y = df['label'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 1. Neural Network Model (Keras)
nn_model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_dim=X_train.shape[1]),
    tf.keras.layers.Dropout(0.2),  # Dropout for regularization
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),  # Dropout for regularization
    tf.keras.layers.Dense(3, activation='softmax')  # Assuming 3 classes
])

nn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the neural network
print("Training Neural Network...")
nn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the neural network
nn_loss, nn_accuracy = nn_model.evaluate(X_test, y_test)
print(f"Neural Network Model Accuracy: {nn_accuracy * 100:.2f}%")

# 2. Support Vector Machine (SVM)
svm_model = SVC(kernel='linear', random_state=42)

# Train the SVM
print("Training Support Vector Machine...")
svm_model.fit(X_train, y_train)

# Make predictions and evaluate SVM
svm_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_pred)
print(f"SVM Model Accuracy: {svm_accuracy * 100:.2f}%")

# 3. Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the Random Forest
print("Training Random Forest...")
rf_model.fit(X_train, y_train)

# Make predictions and evaluate Random Forest
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
print(f"Random Forest Model Accuracy: {rf_accuracy * 100:.2f}%")


Training Neural Network...
Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 186ms/step - accuracy: 0.4364 - loss: 1.1168 - val_accuracy: 0.6000 - val_loss: 1.0607
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.5019 - loss: 1.0599 - val_accuracy: 0.6000 - val_loss: 1.0688
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.3814 - loss: 1.0615 - val_accuracy: 0.6000 - val_loss: 1.0792
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.4677 - loss: 1.0467 - val_accuracy: 0.6000 - val_loss: 1.0889
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.4810 - loss: 1.0429 - val_accuracy: 0.6000 - val_loss: 1.0995
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.4744 - loss: 1.0204 - val_accuracy: 0.6000 - val_loss: 1.1114
Epoch 7/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [27]:

# try on extracted
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# Step 1: Load the CSV data (adjust the path to your actual file)
df = pd.read_csv('/content/extracted_resume_info.csv')

# Step 2: Preprocess the text data to handle missing values (NaN)
# Replace NaN with empty string or a placeholder
df['skills'] = df['skills'].fillna('')
df['education'] = df['education'].fillna('')
df['experience'] = df['experience'].fillna('')

# Step 3: Combine the features (skills, education, experience) into a single column
df['combined_features'] = df['skills'] + ' ' + df['education'] + ' ' + df['experience']

# Step 4: Preprocess the labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])  # Encoding the labels

# Step 5: Split data into features and target
X = df['combined_features']  # Features
y = df['label']  # Target

# Step 6: Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust the number of features
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 8: Train a classifier (RandomForest in this case)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_tfidf, y_train)

# Step 9: Make predictions
y_pred = model.predict(X_test_tfidf)

# Step 10: Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Optionally, save the model for future use
joblib.dump(model, 'resume_classifier_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


Classification Report:
              precision    recall  f1-score   support

    Beginner       0.00      0.00      0.00         1
      Expert       0.50      0.67      0.57         6
Intermediate       0.00      0.00      0.00         3

    accuracy                           0.40        10
   macro avg       0.17      0.22      0.19        10
weighted avg       0.30      0.40      0.34        10



['tfidf_vectorizer.pkl']

In [19]:
# using kfold
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

# Load the labeled CSV
df = pd.read_csv('/content/selected_features_tfidf_matrix.csv')

# Features (X) and labels (y)
X = df.drop(columns=['label']).values
y = df['label'].values


# Define the number of folds
k = 5  # You can change this to your desired number of folds

# Initialize KFold
kf = KFold(n_splits=k, shuffle=True, random_state=42)  # Shuffle data for better generalization

# Store accuracy scores for each fold
accuracies = []

# Loop through each fold
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {fold + 1}")

    # Split data for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Standardize the features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Build the neural network (same as before)
    model = Sequential([
        Dense(32, activation='relu', input_dim=X_train.shape[1], kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        Dropout(0.5),
        Dense(16, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        Dropout(0.3),
        Dense(3, activation='softmax')
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Early stopping
    early_stopping = EarlyStopping(monitor='val_loss', patience=5)

    # Train the model
    model.fit(X_train, y_train, epochs=50, batch_size=32,
              validation_data=(X_test, y_test), callbacks=[early_stopping], verbose=0)  # verbose=0 to suppress training output

    # Evaluate the model
    _, accuracy = model.evaluate(X_test, y_test, verbose=0)
    accuracies.append(accuracy)
    print(f"Fold accuracy: {accuracy * 100:.2f}%")

# Calculate average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f"\nAverage accuracy across {k} folds: {average_accuracy * 100:.2f}%")

Fold 1


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold accuracy: 50.00%
Fold 2


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold accuracy: 60.00%
Fold 3


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold accuracy: 50.00%
Fold 4


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold accuracy: 60.00%
Fold 5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold accuracy: 55.56%

Average accuracy across 5 folds: 55.11%
