In [None]:
import re
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler, FunctionTransformer
from torch.utils.data import Dataset, DataLoader
from torchmetrics import Accuracy

In [None]:
# Load the interview dataset from the Kaggle input directory into `df`.
df=pd.read_csv('/kaggle/input/byjus-data-clean/Byjus.csv')

In [None]:
# Display (n_rows, n_columns) of the loaded frame.
df.shape

In [None]:
def convert_age(X):
    """Bucket the ``age`` column into labelled age bands.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing an ``age`` column. Values may be numbers or strings,
        optionally carrying a ``+`` sign (e.g. ``"35+"``).

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` whose ``age`` column is a categorical with the labels
        ``'18-22', '23-25', '26-28', '29-32', '33-35', '35+'``. Values that
        cannot be parsed, or that fall below 18, become NaN.
    """
    X = X.copy()
    # Drop any "+" before numeric parsing; unparseable entries are coerced to NaN.
    # Note that "35+" therefore parses to 35 and lands in the '33-35' band.
    X['age'] = pd.to_numeric(X['age'].astype(str).str.replace('+', '', regex=False), errors='coerce')
    # Bins are right-closed, so without include_lowest the first band would be
    # (18, 22] and an 18-year-old would silently become NaN despite the
    # '18-22' label.
    X['age'] = pd.cut(X['age'],
                      bins=[18, 22, 25, 28, 32, 35, float('inf')],
                      labels=['18-22', '23-25', '26-28', '29-32', '33-35', '35+'],
                      right=True,
                      include_lowest=True)
    return X

In [None]:
# Education mapping: category -> lowercase keywords/phrases that identify it.
education_mapping = {'engineering & technology': ['b.e / b-tech', 'bca/mca','m.e / m-tech', 'b.voc in networking and mobile applications', 
                                                  'b.arch', 'b.eng', 'b-tech', 'cse', 'bsc bedd - computer science', 'bachelors in engineering',
                                                  'food technology', 'biotechnology', 'masters in data science','mtech', 'btech', 'mca'],
                     
                     'bsc/msc': ['bsc', 'bsc or msc', 'bachelors in philosophy'],
                     'commerce & business': ['mba', 'b.com (bachelor of commerce)', 'b.com', 'b.comm', 'bba', 'bbm', 'bms', 'bachelor of business economics',
                                             'bcom computers','bms marketing', 'pgdm banking','bcom', 'bba or bbm', 'master in management'],
                     
                     'arts & humanities': ['ba', 'ba/ma', 'ma', 'bachelor mass communication', 'bachelors in design', 'bachelor of social work', 'bachelor in eco'],
                     'healthcare & medicine': ['b.pharma', 'med', 'bpharma', 'bds', 'b.physiotherapist', 'mbbs', 'pharma d', 'b -pharm', 'bachelor in medical lab', 'bachelor in dental'],
                     'social sciences': ['masters in social work', 'msw', 'masters in clinical psychology', 'masters in social science', 'masters in sociology'],
                     'diplomas & certifications': ['pgdm', 'pgdca', 'diploma', 'post graduation diploma', 'pgdf'],
                     'phd & doctorate': ['phd', 'doctorate',  'm.phil  2020'],
                     'vocational/technical': ['b.voc', 'vocational', 'bachelor of vocational','b voc - banking & finance'],
                     'hospitality & tourism': ['hotel management', 'masters in hospitality', 'tourism']}


def _normalize_degree(text):
    """Lowercase ``text`` and collapse every run of whitespace to one space."""
    return ' '.join(str(text).lower().split())


# Flattened (keyword, category) pairs, longest keyword first, so that a specific
# phrase such as 'masters in social work' is tried before a short token such as
# 'ma'. The sort is stable, so equal-length keywords keep their mapping order.
# Built once from `education_mapping`; re-run this cell after editing the mapping.
_EDUCATION_KEYWORDS = sorted(
    ((_normalize_degree(keyword), category)
     for category, keywords in education_mapping.items()
     for keyword in keywords),
    key=lambda pair: len(pair[0]),
    reverse=True,
)

# Same pairs as whole-phrase patterns: the keyword must not be glued to a letter
# or digit on either side ('ma' must not match inside 'diploma' or 'pharma').
_EDUCATION_PATTERNS = [
    (re.compile(r'(?<![a-z0-9])' + re.escape(keyword) + r'(?![a-z0-9])'), category)
    for keyword, category in _EDUCATION_KEYWORDS
]


def edu_function(degree):
    """Map a free-text degree description to an education category.

    The previous implementation returned the first category containing *any*
    keyword that was a substring of the degree. Because short keywords such as
    'ma' and 'ba' sit in an early category, inputs like 'diploma', 'b.pharma'
    or 'hotel management' were all labelled 'arts & humanities'. Matching now
    happens in two passes, both longest keyword first:

    1. whole-phrase match (keyword delimited by non-alphanumerics);
    2. plain substring match, kept as a fallback so inputs such as 'bscit'
       are still categorized.

    Parameters
    ----------
    degree : object
        Degree text; it is converted with ``str()`` so NaN/None are accepted.

    Returns
    -------
    str
        A key of ``education_mapping``, or ``'Other'`` if nothing matches.
    """
    degree = _normalize_degree(degree)
    for pattern, category in _EDUCATION_PATTERNS:
        if pattern.search(degree):
            return category
    for keyword, category in _EDUCATION_KEYWORDS:
        if keyword in degree:
            return category
    return 'Other'

def categorize_degree(X):
    """Replace the raw ``Education`` text with its education category.

    Works on a copy of ``X``: the column is lowercased, missing entries are
    filled with 'Other', and every value is passed through ``edu_function``.
    Returns the modified copy.
    """
    result = X.copy()
    lowered = result['Education'].str.lower().fillna('Other')
    result['Education'] = lowered.apply(edu_function)
    return result

In [None]:
# Print the distinct-value count of every column that has at most 10 distinct values.
for col in df.columns:
    distinct_count = df[col].nunique()
    if distinct_count <= 10:
        print(f"{col}----{distinct_count}")

In [None]:
def label_encoder(X):
    """Integer-encode a fixed set of categorical columns.

    Missing values in each listed column are first replaced by the string
    'Unknown'; the column is then encoded with a freshly fitted
    ``LabelEncoder``.

    NOTE(review): a new encoder is fitted on every call, so the integer
    assigned to a category depends on which categories are present in the
    frame passed in. When this function is used inside a pipeline, the codes
    produced for the test split are therefore not guaranteed to match those
    produced for the training split.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing all of the columns listed in ``categorical_cols``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` with the listed columns replaced by integer codes.
    """
    X = X.copy()
    categorical_cols = ['gender',
                        'marital_status',
                        'interview_mode',
                        'mother_tongue_influence_in_english',
                        'Anyone_spoken_before_applying',
                        'currently_employed',
                        'candidate_status']

    for col in categorical_cols:
        # The fitted encoders were previously collected in a dict that was
        # never returned or read; that dead state has been removed.
        X[col] = LabelEncoder().fit_transform(X[col].fillna('Unknown'))
    return X

In [None]:
# Collapse verdict variants: 'Premium Select' and 'Borderline Select' become 'Select',
# 'Borderline Reject' becomes 'Reject'. Values not listed in the mapping are left unchanged.
df['interview_verdict'] = df['interview_verdict'].replace({'Premium Select': 'Select', 'Borderline Select': 'Select', 'Borderline Reject': 'Reject'})

In [None]:
# Label encode the target: the verdict strings are replaced in-place by integer codes.
# The fitted encoder is kept in `le_verdict` (its `classes_` hold the original labels).
le_verdict = LabelEncoder()
df['interview_verdict'] = le_verdict.fit_transform(df['interview_verdict'])


In [None]:
# Columns passed to StandardScaler in the preprocessing pipeline.
numerical_cols = ['confidence_based_on_introduction_(english)',
                  'confidence_based_on_the_topic_given__',
                  'confidence_based_on_the_ppt_question',
                  'confidence_based_on_the_sales_scenario',
                  'structured_thinking_(in_regional_only)',
                  'structured_thinking_based_on_the_ppt_question',
                  'structured_thinking(_call_pitch)',
                  'regional_fluency_based_on_the_topic_given__',
                  'regional_fluency_based_on_the_ppt_question',
                  'regional_fluency_based_on_the__sales_scenario',
                  'confidence_score',
                  'structured_thinking_score',
                  'regional_fluency_score',
                  'total_score'
                 ]

# Columns passed to OneHotEncoder in the preprocessing pipeline.
# NOTE(review): 'last_fixed_ctc_(lakhs)' and 'experience_in_months' sound numeric but are
# one-hot encoded here — presumably they are pre-binned in the CSV; confirm against the data.
categorical_cols = ['Education',
                    'gender',
                    'marital_status',
                    'interview_mode',
                    'mother_tongue_influence_in_english',
                    'Anyone_spoken_before_applying',
                    'currently_employed',
                    'candidate_status', 
                    'candidate_is_willing_to_relocate',
                    'last_fixed_ctc_(lakhs)', 
                    'experience_in_months',
                    'what_was_the_type_of_role?',
                    'how_many_slides_candidate_have_submitted_in_ppt?', 
                    'role_acceptance', 
                    'age']

# Preprocessing pipeline, applied in order:
#   1. convert_age          - bucket `age` into labelled bands
#   2. categorize_education - map free-text `Education` to a category
#   3. label_encoder        - integer-encode a fixed set of categorical columns
#   4. encode_scale         - one-hot encode `categorical_cols`, standardize
#                             `numerical_cols`, pass every other column through
# handle_unknown='ignore' keeps `transform` from raising when the held-out split
# contains a category that was never seen during `fit`; such a value is encoded
# as all zeros for that feature.
preprocessing_pipeline = Pipeline([('convert_age', FunctionTransformer(convert_age, validate=False)),
                                   ('categorize_education', FunctionTransformer(categorize_degree, validate=False)),
                                   ('label_encoder', FunctionTransformer(label_encoder, validate=False)),
                                   ('encode_scale', ColumnTransformer(transformers=[
                                       ('onehot', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'), categorical_cols),        #  One-hot encode categorical features
                                       ('scaler', StandardScaler(), numerical_cols)                                           #  StandardScaler for all numerical columns
                                   ], remainder='passthrough'))
                                  ])

In [None]:
# Display the pipeline diagram: switch scikit-learn's estimator display to 'diagram'
# and show the pipeline as the cell output.
set_config(display='diagram')
preprocessing_pipeline

In [None]:
# Splitting the data into features (every column except the target) and the
# integer-encoded target `interview_verdict`.
X = df.drop(columns=['interview_verdict']).copy()
y = df['interview_verdict']

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle.
# NOTE(review): the split is not stratified on `y` — consider `stratify=y` if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit the preprocessing pipeline on the training split (the cell output shows the fitted estimator).
# NOTE(review): the pipeline is fitted again by the `fit_transform` call further down.
preprocessing_pipeline.fit(X_train, y_train)

In [None]:
# Shape of the raw (unprocessed) training features.
X_train.shape

In [None]:
# Fit and transform with pipeline: fit on the training split only, then apply the
# fitted pipeline to the test split.
X_train_processed = preprocessing_pipeline.fit_transform(X_train, y_train)
X_test_processed = preprocessing_pipeline.transform(X_test)

In [None]:
# Shape of the training matrix after one-hot encoding and scaling.
X_train_processed.shape

# Working on NN

In [None]:
# Convert to NumPy array and float32 (BCELoss, used later in this notebook, needs float targets)
y_train_array = np.array(y_train).astype(np.float32)
y_test_array = np.array(y_test).astype(np.float32)

# Convert the preprocessed data into PyTorch tensors
X_train_tensor = torch.tensor(X_train_processed, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_processed, dtype=torch.float32)

# Convert the target variables into PyTorch tensors (1-D, one value per sample)
y_train_tensor = torch.tensor(y_train_array, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_array, dtype=torch.float32)

In [None]:
# Confirm the shapes: feature and target tensors of each split should agree on their first dimension.
print(f"X_train_tensor shape: {X_train_tensor.shape}, y_train_tensor shape: {y_train_tensor.shape}")
print(f"X_test_tensor shape: {X_test_tensor.shape}, y_test_tensor shape: {y_test_tensor.shape}")


In [None]:
class CustomDataset(Dataset):
    """Map-style dataset pairing each feature row with its label.

    Parameters
    ----------
    features : indexable, sized
        Per-sample features (e.g. a 2-D tensor); indexed along the first axis.
    labels : indexable, sized
        Per-sample labels; must have the same length as ``features``.

    Raises
    ------
    ValueError
        If ``features`` and ``labels`` differ in length. Without this check a
        mismatch only surfaces later as an IndexError, or as silently ignored
        trailing feature rows.
    """

    def __init__(self, features, labels):
        if len(features) != len(labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx``."""
        return self.features[idx], self.labels[idx]

In [None]:
from torch.utils.data import DataLoader

# Create dataset instances wrapping the feature/target tensors of each split
train_dataset = CustomDataset(X_train_tensor, y_train_tensor)
test_dataset = CustomDataset(X_test_tensor, y_test_tensor)

# Define DataLoader: training batches are reshuffled every epoch, test batches keep their order
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
import torch.nn as nn
class CandidateRankingModel(nn.Module):
    """Feed-forward binary classifier producing one probability per sample.

    Layout: Linear(input_dim, 128) -> BatchNorm -> ReLU -> Dropout(0.3)
            -> Linear(128, 64) -> BatchNorm -> ReLU -> Linear(64, 1) -> sigmoid.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # First hidden block
        self.fc1 = nn.Linear(input_dim, 128)
        self.bn1 = nn.BatchNorm1d(128)
        # Second hidden block
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        # Output head: a single unit
        self.fc3 = nn.Linear(64, 1)
        # Shared, parameter-free layers
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return sigmoid outputs of shape ``(batch, 1)`` for input ``x``."""
        hidden = self.fc1(x)
        hidden = self.bn1(hidden)
        hidden = self.relu(hidden)
        # Dropout is applied after the first block only.
        hidden = self.dropout(hidden)

        hidden = self.fc2(hidden)
        hidden = self.bn2(hidden)
        hidden = self.relu(hidden)

        return torch.sigmoid(self.fc3(hidden))

In [None]:
# Seed PyTorch *before* the model is built so the initial weights are the same
# on every Restart & Run All; previously the only seed call came after the
# weights had already been drawn.
torch.manual_seed(42)

# Instantiate the model; the input width is the number of preprocessed features
input_dim = X_train_tensor.shape[1]
model = CandidateRankingModel(input_dim)

# Use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.BCELoss()  # Binary Cross-Entropy Loss (the model already applies sigmoid)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [None]:
# Number of input features the model was built for.
input_dim

In [None]:
# Check if model and data pipeline are working correctly by pushing a single
# training batch through the model and printing the resulting shapes.
# NOTE(review): the model is in train mode here, so this forward pass also
# updates the BatchNorm running statistics before real training starts.
model.train()
for batch_X, batch_y in train_loader:
    batch_X, batch_y = batch_X.to(device), batch_y.to(device)
    
    # Forward pass
    outputs = model(batch_X)
    print("Batch shape:", batch_X.shape)
    print("Output shape:", outputs.shape)
    print("Sample output:", outputs[:5])
    
    # Only the first batch is needed for the check
    break

In [None]:
def train(model, train_loader, criterion, optimizer, device):
    """Run one training epoch and report loss and classification metrics.

    Parameters
    ----------
    model : torch.nn.Module
        Model whose output is a probability of shape ``(batch, 1)``.
    train_loader : iterable
        Yields ``(features, labels)`` batches; labels are reshaped to
        ``(batch, 1)`` to match the model output.
    criterion : callable
        Loss taking ``(outputs, targets)``.
    optimizer : torch.optim.Optimizer
        Optimizer stepped once per batch.
    device : torch.device
        Device the batches are moved to.

    Returns
    -------
    tuple
        ``(avg_loss, accuracy, precision, recall, f1, auc_roc)`` where
        ``avg_loss`` is the mean of the per-batch losses. Accuracy, precision,
        recall and F1 use predictions thresholded at 0.5; ROC-AUC uses the raw
        probabilities.
    """
    model.train()
    running_loss = 0.0
    all_labels = []
    all_probs = []

    for batch_X, batch_y in train_loader:
        batch_X = batch_X.to(device)
        # Reshape target to match the (batch, 1) output shape
        batch_y = batch_y.to(device).view(-1, 1)

        optimizer.zero_grad()
        outputs = model(batch_X)

        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Keep flat 1-D arrays so the metrics receive vectors, not (n, 1) columns.
        all_probs.append(outputs.detach().cpu().numpy().ravel())
        all_labels.append(batch_y.detach().cpu().numpy().ravel())

    avg_loss = running_loss / len(train_loader)

    labels = np.concatenate(all_labels)
    probs = np.concatenate(all_probs)
    preds = (probs >= 0.5).astype(labels.dtype)  # Threshold at 0.5 for binary classification

    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    # ROC-AUC is a ranking metric: it must be given the scores themselves.
    # Feeding it the thresholded 0/1 predictions reduces the curve to one point.
    auc_roc = roc_auc_score(labels, probs)

    return avg_loss, accuracy, precision, recall, f1, auc_roc


In [None]:
def evaluate(model, test_loader, criterion, device):
    """Evaluate the model on a data loader without updating its weights.

    Parameters
    ----------
    model : torch.nn.Module
        Model whose output is a probability of shape ``(batch, 1)``.
    test_loader : iterable
        Yields ``(features, labels)`` batches; labels are reshaped to
        ``(batch, 1)`` to match the model output.
    criterion : callable
        Loss taking ``(outputs, targets)``.
    device : torch.device
        Device the batches are moved to.

    Returns
    -------
    tuple
        ``(avg_loss, accuracy, precision, recall, f1, auc_roc)`` where
        ``avg_loss`` is the mean of the per-batch losses. Accuracy, precision,
        recall and F1 use predictions thresholded at 0.5; ROC-AUC uses the raw
        probabilities.
    """
    model.eval()
    running_loss = 0.0
    all_labels = []
    all_probs = []

    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            batch_X = batch_X.to(device)
            # Reshape target to match the (batch, 1) output shape
            batch_y = batch_y.to(device).view(-1, 1)

            outputs = model(batch_X)

            loss = criterion(outputs, batch_y)
            running_loss += loss.item()

            # Keep flat 1-D arrays so the metrics receive vectors, not (n, 1) columns.
            all_probs.append(outputs.cpu().numpy().ravel())
            all_labels.append(batch_y.cpu().numpy().ravel())

    avg_loss = running_loss / len(test_loader)

    labels = np.concatenate(all_labels)
    probs = np.concatenate(all_probs)
    preds = (probs >= 0.5).astype(labels.dtype)  # Threshold at 0.5 for binary classification

    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    # ROC-AUC is a ranking metric: it must be given the scores themselves.
    # Feeding it the thresholded 0/1 predictions reduces the curve to one point.
    auc_roc = roc_auc_score(labels, probs)

    return avg_loss, accuracy, precision, recall, f1, auc_roc

In [None]:
# Per-epoch metric histories (one entry appended per epoch), used for plotting.
train_losses = []
test_losses = []
train_accuracies = []
test_accuracies = []
train_precisions = []
test_precisions = []
train_recalls = []
test_recalls = []
train_f1_scores = []
test_f1_scores = []
train_auc_roc = [] 
test_auc_roc = []

torch.manual_seed(42)
epochs = 30
start_time = time.time()

# Announce once, before the first epoch (this used to be printed after every
# completed epoch, which made the log misleading).
print("Training Started...!!")

for epoch in range(1, epochs + 1):
    train_loss, train_accuracy, train_precision, train_recall, train_f1_score, train_auc_roc_value = train(model, train_loader, criterion, optimizer, device)
    test_loss, test_accuracy, test_precision, test_recall, test_f1_score, test_auc_roc_value = evaluate(model, test_loader, criterion, device)

    # Store metrics for plotting
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)
    train_precisions.append(train_precision)
    test_precisions.append(test_precision)
    train_recalls.append(train_recall)
    test_recalls.append(test_recall)
    train_f1_scores.append(train_f1_score)
    test_f1_scores.append(test_f1_score)
    train_auc_roc.append(train_auc_roc_value)
    test_auc_roc.append(test_auc_roc_value)

    # Print epoch results
    print(f"\nEpoch {epoch}/{epochs}")
    print(f"Train Loss: {train_loss:.4f} | Test Loss: {test_loss:.4f} | Train Acc: {train_accuracy:.4f} | Test Acc: {test_accuracy:.4f}")
    print(f"Train Precision: {train_precision:.4f} | Test Precision: {test_precision:.4f} | Train Recall: {train_recall:.4f} | Test Recall: {test_recall:.4f}")
    print(f"Train F1 Score: {train_f1_score:.4f} | Test F1 Score: {test_f1_score:.4f} | Train AUC-ROC: {train_auc_roc_value:.4f} | Test AUC-ROC: {test_auc_roc_value:.4f}")

end_time = time.time()
print(f"\nTraining completed in {end_time - start_time:.2f} seconds.")

In [None]:
# Plotting the metrics
import matplotlib.pyplot as plt

epochs_range = range(1, epochs + 1)
plt.figure(figsize=(20, 10))

# (metric name, train history, test history) in subplot order on the 2x3 grid.
metric_panels = [
    ('Loss', train_losses, test_losses),
    ('Accuracy', train_accuracies, test_accuracies),
    ('Precision', train_precisions, test_precisions),
    ('Recall', train_recalls, test_recalls),
    ('F1 Score', train_f1_scores, test_f1_scores),
    ('AUC-ROC', train_auc_roc, test_auc_roc),
]

for position, (metric_name, train_values, test_values) in enumerate(metric_panels, start=1):
    plt.subplot(2, 3, position)
    plt.plot(epochs_range, train_values, label=f'Train {metric_name}')
    plt.plot(epochs_range, test_values, label=f'Test {metric_name}')
    # Title and axis labels let each panel stand on its own.
    plt.title(metric_name)
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    # Without legend() the line labels above were never drawn, leaving the
    # train and test curves indistinguishable.
    plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Persist the trained weights. Only the state_dict (parameters and buffers) is
# written, so loading requires re-creating CandidateRankingModel(input_dim)
# first; the message now says so instead of claiming a "full model" was saved.
model_path = "CandidateRanking.pth"
torch.save(model.state_dict(), model_path)
print(f"Model state_dict saved at: {model_path}")

In [None]:
# num_features = X_train_tensor.shape[1]
# hidden_units = 128

# # Assuming `model` is your trained model
# model_path = "CandidateRankingModel_with_param.pth"
# torch.save({
#     'model_state_dict': model.state_dict(),
#     'num_features': num_features,
#     'hidden_units': hidden_units,
# }, model_path)

# print(f"Model and parameters saved at: {model_path}")

In [None]:
#  Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity

#  Load Dataset
# NOTE: from here on `df`, `X`, `y`, `X_train`, `X_test` and the tensors are
# rebound to the second (hiring) dataset, replacing the values used above.
file_path = '/kaggle/input/original-hiring-dataset/Hiring_dataset.csv'
df = pd.read_csv(file_path)

#  Preprocessing: separate the features from the `HiringDecision` column
X = df.drop(['HiringDecision'], axis=1)
# NOTE(review): `y` is not used by any of the code visible below.
y = df['HiringDecision']

#  Identify Numerical & Categorical Features by dtype
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

#  Scale Numerical Features
# NOTE(review): the scaler and the encoder below are fitted on the full data
# set, i.e. before the train/test split.
scaler = StandardScaler()
X_numerical = scaler.fit_transform(X[numerical_features])

#  Encode Categorical Features (sparse result converted to a dense array)
encoder = OneHotEncoder()
X_categorical = encoder.fit_transform(X[categorical_features]).toarray()

#  Combine Preprocessed Features: numerical columns first, then one-hot columns
X_preprocessed = np.hstack((X_numerical, X_categorical))

#  Split Data (features only) into 80% train / 20% test
X_train, X_test = train_test_split(X_preprocessed, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

#  Deep Learning Model (Autoencoder-like)
class ContentBasedRecommender(nn.Module):
    """Three-layer MLP that maps a feature vector to an embedding.

    Layout: Linear(input_dim, 128) -> ReLU -> Linear(128, 64) -> ReLU
            -> Linear(64, embedding_dim), with no activation on the output.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.
    embedding_dim : int, default 64
        Size of the produced embedding.
    """

    def __init__(self, input_dim, embedding_dim=64):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, embedding_dim)

    def forward(self, x):
        """Return embeddings of shape ``(batch, embedding_dim)`` for ``x``."""
        hidden = x
        # The two hidden layers share the same Linear -> ReLU pattern.
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        # Final projection is left linear.
        return self.fc3(hidden)

#  Initialize Model
# NOTE: `model`, `optimizer` and `criterion` are rebound here, replacing the
# classifier objects created earlier in the notebook.
input_dim = X_train_tensor.shape[1]
embedding_dim = 64

model = ContentBasedRecommender(input_dim, embedding_dim)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# NOTE(review): this MSE criterion is not used by the training loop visible
# below, which calls contrastive_loss instead.
criterion = nn.MSELoss()

#  Contrastive Loss Function (Fixed)
def contrastive_loss(embeddings):
    """
    Contrastive loss using cosine similarity in the embedding space.

    Every row is treated as an anchor and is paired with a *different* row of
    the same batch as its negative; the loss pushes their cosine similarity
    down.

    The negatives used to be drawn with ``torch.randperm``, which can map a row
    onto itself. Such a pair has similarity 1 and contributes only a constant
    to the loss. Negatives are now taken with a random non-zero cyclic shift,
    so a row is never its own negative.

    Parameters
    ----------
    embeddings : torch.Tensor
        Tensor of shape ``(batch, dim)``.

    Returns
    -------
    torch.Tensor
        Scalar loss. For a batch with fewer than two rows there is no negative
        to contrast against, so a zero that is still attached to the autograd
        graph is returned.
    """
    batch_size = embeddings.shape[0]

    if batch_size < 2:
        # Keeps loss.backward() valid while contributing no gradient.
        return embeddings.sum() * 0.0

    # Random shift in [1, batch_size - 1] guarantees negative[i] != positive[i].
    shift = int(torch.randint(1, batch_size, (1,)).item())

    positive = embeddings
    negative = torch.roll(embeddings, shifts=shift, dims=0)

    #  Cosine similarities
    sim_pos = F.cosine_similarity(positive, positive)  # Similarities with itself (should be 1)
    sim_neg = F.cosine_similarity(positive, negative)  # Similarities with the shifted negatives

    #  Contrastive loss calculation
    loss = torch.mean(1 - sim_pos + sim_neg)
    return loss

# Training Loop: mini-batch training of the embedding model with contrastive_loss.
# Batches are consecutive slices of X_train_tensor, taken in the same order every epoch.
epochs = 50
batch_size = 64

for epoch in range(epochs):
    model.train()

    for i in range(0, len(X_train_tensor), batch_size):
        # The final slice may hold fewer than batch_size rows
        batch = X_train_tensor[i:i + batch_size]

        optimizer.zero_grad()

        # Forward pass
        embeddings = model(batch)

        # Contrastive loss (fixed)
        loss = contrastive_loss(embeddings)

        #  Backward pass
        loss.backward()
        optimizer.step()

    # Every 10th epoch, report the loss of the last batch processed in that epoch
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# Get Embeddings for both splits as NumPy arrays.
model.eval()
# Inference only: no_grad avoids building (and keeping) an autograd graph for
# the whole data set, which the previous forward passes did needlessly.
with torch.no_grad():
    train_embeddings = model(X_train_tensor).numpy()
    test_embeddings = model(X_test_tensor).numpy()

print("\n Embeddings Generated Successfully!")
print("Train Embeddings Shape:", train_embeddings.shape)
print("Test Embeddings Shape:", test_embeddings.shape)

#  Recommendation System
def recommend_candidates(target_profile, top_n=5):
    """
    Return the ``top_n`` test-set candidates most similar to a target profile.

    The profile is preprocessed with the already fitted ``scaler`` and
    ``encoder``, embedded with ``model``, and compared with
    ``test_embeddings`` by cosine similarity.

    Parameters
    ----------
    target_profile : pandas.DataFrame
        Frame holding the profile, with the columns listed in
        ``numerical_features`` and ``categorical_features``. Only its first
        row is scored.
    top_n : int, default 5
        Number of candidates to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``Candidate_Index`` (row position within ``test_embeddings``)
        and ``Similarity_Score``, ordered from most to least similar.
    """
    # Same preprocessing as the training data: scaled numerics, then one-hot categoricals
    numeric_part = scaler.transform(target_profile[numerical_features])
    categorical_part = encoder.transform(target_profile[categorical_features]).toarray()
    profile_tensor = torch.tensor(
        np.hstack((numeric_part, categorical_part)), dtype=torch.float32
    )

    # Embed the profile without tracking gradients
    with torch.no_grad():
        profile_embedding = model(profile_tensor).detach().numpy()

    # Similarity of the first profile row to every test embedding
    scores = cosine_similarity(profile_embedding, test_embeddings)[0]

    # Positions sorted by descending similarity, truncated to the requested count
    best_first = np.argsort(scores)[::-1][:top_n]

    return pd.DataFrame({
        'Candidate_Index': best_first,
        'Similarity_Score': scores[best_first],
    })

#  Sample Target Profile: a single-row frame describing the profile to match against.
# NOTE(review): its columns must match those the scaler and encoder were fitted on — confirm against the CSV.
sample_target = pd.DataFrame({
    'Age': [35],
    'Gender': ['Male'],
    'EducationLevel': ["Master's"],
    'ExperienceYears': [8],
    'PreviousCompanies': [3],
    'DistanceFromCompany': [20],
    'InterviewScore': [75],
    'SkillScore': [80],
    'PersonalityScore': [85],
    'RecruitmentStrategy': ['Aggressive']
})

# Recommend Candidates: the five test-set rows most similar to the sample profile
recommendations = recommend_candidates(sample_target, top_n=5)
print("\n🔥 Top Recommended Candidates:")
print(recommendations)
