In [140]:
!pip install pdfplumber




In [141]:
!pip install fpdf




----
----

# SYNTHETIC DATASET GENERATION

In [142]:
import os
from fpdf import FPDF
from email.message import EmailMessage

# Create directory structure
directories = ["data/resumes", "data/emails", "data/videos", "data/audios"]
for d in directories:
    os.makedirs(d, exist_ok=True)

# Define synthetic resume texts for five candidates.
resume_texts = [
    "John Doe has 5 years of experience in machine learning. Proficient in Python, NLP, and deep learning. Graduated from MIT.",
    "Jane Smith has 3 years of experience in data science. Proficient in Python, R, and SQL. Graduated from Stanford University.",
    "David Lee has 7 years of experience in software engineering, specializing in machine learning and AI. Proficient in Python and TensorFlow. Graduated from Harvard.",
    "Emily Carter has 4 years of experience in natural language processing. Proficient in Python, NLP, and deep learning. Graduated from Berkeley.",
    "Michael Brown has 6 years of experience in data analytics. Proficient in Python, data visualization, and machine learning. Graduated from Oxford."
]

# Generate synthetic PDF resumes.
for i, text in enumerate(resume_texts):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, text)
    resume_path = os.path.join("data", "resumes", f"resume_{i}.pdf")
    pdf.output(resume_path)
    print(f"Created {resume_path}")

# Generate synthetic email files.
for i in range(len(resume_texts)):
    msg = EmailMessage()
    msg["Subject"] = "Application for Software Engineer Role"
    msg["From"] = "candidate@example.com"
    msg["To"] = "hr@company.com"
    # Slightly vary the email body for each candidate.
    body_text = f"Dear HR,\n\nI am excited to apply for the software engineer role. My background includes extensive relevant experience. Regards, Candidate {i}"
    msg.set_content(body_text)
    email_path = os.path.join("data", "emails", f"email_{i}.eml")
    with open(email_path, "wb") as f:
        f.write(msg.as_bytes())
    print(f"Created {email_path}")


Created data/resumes/resume_0.pdf
Created data/resumes/resume_1.pdf
Created data/resumes/resume_2.pdf
Created data/resumes/resume_3.pdf
Created data/resumes/resume_4.pdf
Created data/emails/email_0.eml
Created data/emails/email_1.eml
Created data/emails/email_2.eml
Created data/emails/email_3.eml
Created data/emails/email_4.eml


-----
-----

In [143]:
import re

In [144]:
import os

In [145]:
import spacy

In [146]:
import pdfplumber

In [147]:
import torch

In [148]:
import cv2

In [149]:
import librosa

In [150]:
import numpy as np

In [151]:
import pandas as pd

In [152]:
from sentence_transformers import SentenceTransformer, util

In [153]:
from email import policy

In [154]:
from email.parser import BytesParser

In [155]:
from transformers import pipeline

In [156]:
from sklearn.ensemble import RandomForestClassifier

In [157]:
from sklearn.preprocessing import StandardScaler

# Set up models and prefer GPU in Jupyter

In [158]:
spacy.prefer_gpu()

True

In [159]:
nlp = spacy.load("en_core_web_sm")



In [160]:
# Specify model and revision for production-grade sentiment analysis

In [161]:
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f"
)

Device set to use cuda:0


In [162]:
# Load a SentenceTransformer model for semantic (BERT) matching

In [163]:
bert_model = SentenceTransformer("all-MiniLM-L6-v2")

# Define a domain-specific skill list#

In [164]:
skill_list = {"python", "machine learning", "nlp", "data science", "deep learning", "tensorflow", "pytorch"}


# Helper Functions

In [165]:
# Extract text from PDF resumes. Returns None if file is missing or unreadable.


In [166]:
def extract_text_from_pdf(pdf_path):
    if os.path.exists(pdf_path):
        try:
            with pdfplumber.open(pdf_path) as pdf:
                texts = [page.extract_text() for page in pdf.pages if page.extract_text()]
                return " ".join(texts)
        except Exception as e:
            print(f"Error reading {pdf_path}: {e}")
            return None
    else:
        print(f"File {pdf_path} not found.")
        return None

In [167]:
# Returns resume text for a given candidate index from the real dataset.

In [168]:
def get_resume_text(index, folder="data/resumes"):
    file_path = os.path.join(folder, f"resume_{index}.pdf")
    return extract_text_from_pdf(file_path)

In [169]:
# Use spaCy to extract candidate details from resume text.

In [170]:
def extract_details(text):
    doc = nlp(text)
    details = {
        "name": next((ent.text for ent in doc.ents if ent.label_ == "PERSON"), "Unknown"),
        "email": re.findall(r"[a-zA-Z0-9+_.-]+@[a-zA-Z0-9.-]+", text),
        "phone": re.findall(r"\+?\d[\d -]{8,}\d", text),
        "skills": [token.text.lower() for token in doc if token.text.lower() in skill_list],
        "education": [ent.text for ent in doc.ents if ent.label_ == "ORG" and "university" in ent.text.lower()],
        "experience": re.findall(r"\b\d+\s+years?\b", text),
        "projects": re.findall(r"project[s]?: (.+)", text, re.IGNORECASE),
        "certifications": re.findall(r"certified in (.+)", text, re.IGNORECASE)
    }
    return details

In [171]:
# Parse email details from a file in the dataset.

In [172]:
def get_email_data(index, folder="data/emails"):
    file_path = os.path.join(folder, f"email_{index}.eml")
    if os.path.exists(file_path):
        try:
            with open(file_path, "rb") as f:
                raw_email = f.read()
            email_data = BytesParser(policy=policy.default).parsebytes(raw_email)
            body = email_data.get_payload()
            return {
                "subject": email_data["subject"],
                "sender": email_data["from"],
                "recipient": email_data["to"],
                "body": body,
                "sentiment": sentiment_analyzer(body[:512])[0]['label'] if body else "Neutral"
            }
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            return None
    else:
        print(f"Email file {file_path} not found.")
        return None

In [173]:
# Use BERT embeddings to compute cosine similarity between job description and each resume.


In [174]:
def match_resume_to_job(resume_texts, job_desc):
    job_embedding = bert_model.encode(job_desc, convert_to_tensor=True)
    resume_embeddings = bert_model.encode(resume_texts, convert_to_tensor=True)
    similarity_scores = util.pytorch_cos_sim(job_embedding, resume_embeddings).squeeze().tolist()
    return similarity_scores

In [175]:
# AI-driven adaptive testing (placeholder returning a random score)

In [176]:
def adaptive_test_performance():
    return np.random.randint(70, 100)

In [177]:
# Video analysis placeholder.

In [178]:
def analyze_video(index):
    video_path = os.path.join("data", "videos", f"video_{index}.mp4")
    if os.path.exists(video_path):
        # Insert actual video analysis code here.
        return {"emotion": "neutral", "face_detected": True}
    else:
        # Log and return a default result.
        print(f"Video file {video_path} not found.")
        return {"emotion": "neutral", "face_detected": True}

In [179]:
# Speech analysis placeholder.

In [180]:
def analyze_speech(index):
    audio_path = os.path.join("data", "audios", f"audio_{index}.wav")
    if os.path.exists(audio_path):
        try:
            y, sr = librosa.load(audio_path)
            return {"pitch_mean": np.mean(y), "speech_rate": len(y) / sr}
        except Exception as e:
            print(f"Error processing {audio_path}: {e}")
            return {"pitch_mean": None, "speech_rate": None}
    else:
        print(f"Audio file {audio_path} not found.")
        return {"pitch_mean": np.random.uniform(0.5, 1.5), "speech_rate": np.random.uniform(3, 6)}


In [181]:
# Synthetic personality assessment placeholder.

In [182]:
def assess_personality():
    return {"extroversion": np.random.uniform(0, 1), "openness": np.random.uniform(0, 1)}


# Hiring Prediction Model (Synthetic Training)

In [183]:
hiring_model = RandomForestClassifier()

In [184]:
scaler = StandardScaler()

In [185]:
X_train = np.random.rand(20, 4)

In [186]:
y_train = np.random.randint(0, 2, 20)

In [187]:
hiring_model.fit(X_train, y_train)

In [188]:
def compute_candidate_score(details, similarity_score, email):
    score = (len(details["skills"]) * 2) + (len(details["education"]) * 3) + (len(details["experience"]) * 3)
    if email and any(word in email["subject"].lower() for word in ["education", "startup", "teaching"]):
        score += 5
    score += similarity_score * 10  # Scale similarity to a 10-point range.
    if email and email["sentiment"] == "POSITIVE":
        score += 5
    return score

In [189]:
def predict_hiring_outcome(candidate_features):
    candidate_features_scaled = scaler.fit_transform(np.array(candidate_features).reshape(1, -1))
    return hiring_model.predict(candidate_features_scaled)[0]


# Main Pipeline

In [190]:
# Determine the number of candidates based on the number of resume files.

In [191]:
from glob import glob

In [192]:
resume_files = glob(os.path.join("data", "resumes", "*.pdf"))

In [193]:
num_candidates = len(resume_files)

In [194]:
# Retrieve resumes from the dataset.

In [195]:
resumes = []
for i in range(num_candidates):
    text = get_resume_text(i)
    if text:
        resumes.append(text)
    else:
        print(f"Skipping candidate {i} due to missing resume.")


# job_description

In [196]:
job_description = "Looking for a software engineer with Python and NLP experience."


In [197]:
# Extract candidate details and compute semantic similarity scores.

In [198]:
candidate_details = [extract_details(text) for text in resumes]

In [199]:
similarity_scores = match_resume_to_job(resumes, job_description)

In [200]:
final_candidates = []
for i, details in enumerate(candidate_details):
    email_data = get_email_data(i)
    if email_data is None:
        print(f"Skipping candidate {i} due to missing email data.")
        continue
    score = compute_candidate_score(details, similarity_scores[i], email_data)
    adaptive_test = adaptive_test_performance()
    video_analysis = analyze_video(i)
    speech_analysis = analyze_speech(i)
    personality = assess_personality()
    prediction = predict_hiring_outcome([score, adaptive_test, personality["extroversion"], personality["openness"]])

    final_candidates.append({
        "name": details["name"],
        "score": score,
        "adaptive_test": adaptive_test,
        "video_analysis": video_analysis["emotion"],
        "speech_analysis": speech_analysis["pitch_mean"],
        "personality": personality,
        "prediction": "Hired" if prediction == 1 else "Not Hired"
    })


Video file data/videos/video_0.mp4 not found.
Audio file data/audios/audio_0.wav not found.
Video file data/videos/video_1.mp4 not found.
Audio file data/audios/audio_1.wav not found.
Video file data/videos/video_2.mp4 not found.
Audio file data/audios/audio_2.wav not found.
Video file data/videos/video_3.mp4 not found.
Audio file data/audios/audio_3.wav not found.
Video file data/videos/video_4.mp4 not found.
Audio file data/audios/audio_4.wav not found.


In [201]:
# Rank all candidates from best to worst based on their score.

In [202]:
ranking_df = pd.DataFrame(final_candidates)

In [203]:
ranking_df.sort_values(by="score", ascending=False, inplace=True)

In [204]:
# Export the full ranking to a CSV file.

In [205]:
ranking_df.to_csv("all_candidates_ranked.csv", index=False)

In [206]:
print("All Candidates Ranked (Best to Worst):")
print(ranking_df)

All Candidates Ranked (Best to Worst):
            name      score  adaptive_test video_analysis  speech_analysis  \
1     Jane Smith  17.940599             96        neutral         1.096318   
0       John Doe  17.511751             92        neutral         1.391802   
2      David Lee  17.503206             90        neutral         1.310140   
3         Carter  17.426757             71        neutral         1.424451   
4  Michael Brown  14.316690             96        neutral         0.746340   

                                         personality prediction  
1  {'extroversion': 0.06978081483291709, 'opennes...      Hired  
0  {'extroversion': 0.9406700032229598, 'openness...      Hired  
2  {'extroversion': 0.8666597333223305, 'openness...      Hired  
3  {'extroversion': 0.05908215874732614, 'opennes...      Hired  
4  {'extroversion': 0.7268602688700576, 'openness...      Hired  


In [207]:
# The best candidate is at the top of the ranking.

In [208]:
best_candidate = ranking_df.iloc[0]

In [209]:
print("\nThe best candidate selected based on your job standards is:")
print(best_candidate)


The best candidate selected based on your job standards is:
name                                                      Jane Smith
score                                                      17.940599
adaptive_test                                                     96
video_analysis                                               neutral
speech_analysis                                             1.096318
personality        {'extroversion': 0.06978081483291709, 'opennes...
prediction                                                     Hired
Name: 1, dtype: object
