# Random Data Generation

In [1]:
import pandas as pd
import random

# Define variations for synthetic data
roles = ["ICU Nurse", "ER Nurse", "Pediatric Nurse", "General Nurse", "Senior Nurse", "Junior Nurse"]
skills_pool = ["ICU", "Emergency Care", "Pediatrics", "Patient Care", "BLS/ACLS", "IV Therapy"]

data = []

for i in range(1000):
    # Randomly assign years of experience
    exp = random.randint(0, 20)
    
    # Randomly pick a job description
    jd = random.choice(roles)
    
    # Generate Resume Text based on experience and roles
    if exp < 2:
        resume = f"Fresh graduate nurse, specialized in {random.choice(skills_pool)}"
    else:
        resume = f"{random.choice(['Experienced', 'Professional'])} {random.choice(roles)} with {exp} years in {random.choice(skills_pool)}"
    
    # Logic for 'Selected' (1 = Hire, 0 = Reject)
    # Higher chance if experience > 3 and JD matches Resume keywords
    selected = 0
    if "Senior" in jd and exp >= 8:
        selected = 1
    elif "ICU" in jd and "ICU" in resume and exp >= 3:
        selected = 1
    elif exp > 5 and random.random() > 0.4: # Random selection for general high experience
        selected = 1
    else:
        selected = random.choice([0, 0, 0, 1]) # Lower chance for others

    data.append([resume, jd, exp, selected])

# Create DataFrame
df_1000 = pd.DataFrame(data, columns=["resume_text", "job_description", "experience_years", "selected"])

# Save to CSV
df_1000.to_csv("AI_Resume_Screening_1000.csv", index=False)
print("Successfully generated 1000 records in AI_Resume_Screening_1000.csv")

Successfully generated 1000 records in AI_Resume_Screening_1000.csv


In [2]:
# MODEL IMPLEMENTATION

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from scipy.sparse import hstack

# Load data
df = pd.read_csv("AI_Resume_Screening_1000.csv")

# Combine resume + job description
df["text"] = df["resume_text"] + " " + df["job_description"]

X_text = df["text"]
X_exp = df[["experience_years"]]
y = df["selected"]

# TF-IDF
tfidf = TfidfVectorizer(max_features=3000)
X_tfidf = tfidf.fit_transform(X_text)

# Combine numeric + text features
X = hstack([X_tfidf, X_exp.values])

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

prediction = model.predict(X_test)[0]
probability = model.predict_proba(X_test)[0][1]

print("shortlisted", bool(prediction))
print("match_score", round(probability, 2))

              precision    recall  f1-score   support

           0       0.71      0.57      0.63        86
           1       0.72      0.82      0.77       114

    accuracy                           0.71       200
   macro avg       0.71      0.70      0.70       200
weighted avg       0.71      0.71      0.71       200

shortlisted False
match_score 0.42


In [14]:
# API FOR HR SYSTEM

In [13]:
import asyncio
import uvicorn

if __name__ == "__main__":
    config = uvicorn.Config(app)
    server = uvicorn.Server(config)
    await server.serve()


from fastapi import FastAPI
import pickle

app = FastAPI()

model = pickle.load(open("model.pkl", "rb"))
tfidf = pickle.load(open("tfidf.pkl", "rb"))

@app.post("/predict")
def predict(resume_text: str, job_description: str, experience: int):
    combined_text = resume_text + " " + job_description
    text_vec = tfidf.transform([combined_text])
    
    features = hstack([text_vec, [[experience]]])
    prediction = model.predict(features)[0]
    probability = model.predict_proba(features)[0][1]

    return {
        "shortlisted": bool(prediction),
        "match_score": round(probability, 2)
    }

ModuleNotFoundError: No module named 'uvicorn'