In [None]:
from google import genai
from google.colab import userdata

# Smoke test: read the Gemini API key stored under 'gemini_key' through Colab's
# `userdata` helper and send one throwaway prompt to confirm the client works.
api_key = userdata.get('gemini_key')

client = genai.Client(api_key=api_key)

# Minimal request; the prompt text itself is arbitrary.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="sup"
)
print(response.text)


In [None]:
!pip install "dvc[gdrive]"

In [None]:
import pandas as pd

# Mount Google Drive in this cell: the path below lives under /content/drive, and
# without the mount the read fails on a fresh runtime (Restart & Run All).
from google.colab import drive

drive.mount('/content/drive')

# Load the processed SerpApi export for 2026-01.
# NOTE(review): the path has no file extension — confirm it points at a CSV file.
df = pd.read_csv("/content/drive/Othercomputers/My laptop/Desktop/ResumeRecommenderMLops/data/processed/serpapi/2026-01")
df.head()

In [None]:
# (rows, columns) of the loaded frame — quick sanity check before labelling.
df.shape

In [None]:
import pandas as pd
import json
import time
from google import genai
from google.genai import types
from google.colab import userdata
from tqdm import tqdm

# Gemini client used by classify_batch_v2 (it reads the module-level `client`).
api_key = userdata.get('gemini_key')
client = genai.Client(api_key=api_key)

# Closed set of role labels offered to the model. The classification prompt
# additionally allows the literal 'Other', which is intentionally not listed here.
CATEGORIES = [
    "Software Engineer", "Software Developer", "Backend Engineer", "Full Stack Developer",
    "Data Scientist", "Machine Learning Engineer", "Generative AI Engineer", "LLM Engineer",
    "Data Analyst", "Data Engineer", "AI Engineer", "Frontend Developer", "React Developer",
    "Python Developer", "Java Developer", "DevOps Engineer", "MLOps Engineer", "Cloud Engineer",
    "Cloud Security Engineer", "Kubernetes Administrator", "Site Reliability Engineer",
    "Applied Machine Learning Engineer", "NLP Engineer", "Computer Vision Engineer",
    "Platform Engineer", "Analytics Engineer", "Data Architect", "Research Scientist",
    "Deep Learning Engineer", "ML Research Engineer", "FinOps Engineer", "Database Engineer",
    "SQL Developer", "Business Intelligence Engineer", "Mobile Application Developer",
    "Android Developer", "iOS Developer", "Cybersecurity Engineer", "Security Engineer",
    "Embedded Systems Engineer", "Firmware Engineer", "QA Engineer", "Automation Test Engineer",
    "SDET", "Game Developer", "AR/VR Engineer", "Graphics Programmer", "Technical Product Manager",
    "Solutions Engineer", "Product Engineer", "Blockchain Developer", "RPA Developer",
    "Salesforce Developer"
]

In [None]:
def classify_batch_v2(batch_df, model_name="gemini-2.5-flash"):
    """Classify one batch of job postings into CATEGORIES with a single Gemini call.

    Args:
        batch_df: DataFrame slice with 'title' and 'description' columns. Its index
            labels are sent to the model as job IDs so answers can be mapped back.
        model_name: Model identifier passed to ``client.models.generate_content``.

    Returns:
        dict mapping job ID (a string, because JSON keys are strings) to the
        category chosen by the model. An empty dict is returned when the batch is
        empty, the request or JSON parsing fails, or the reply is not a JSON object.
    """
    # Nothing to classify: skip the API call entirely.
    if len(batch_df) == 0:
        return {}

    # 1. Prepare minimal input (ID + Title + Description), one line per job.
    job_lines = []
    for idx, row in batch_df.iterrows():
        # Truncate description to 500 chars to save tokens/speed
        desc = str(row['description'])[:500].replace("\n", " ")
        job_lines.append(f"ID: {idx} | Title: {row['title']} | Desc: {desc}\n")
    jobs_text = "".join(job_lines)

    # 2. Build the prompt. The job count reflects the real batch size, so a short
    #    final batch is not described to the model as 100 jobs.
    prompt = f"""
    You are an expert job classifier.
    Classify the following {len(batch_df)} jobs into exactly ONE of these categories:
    {json.dumps(CATEGORIES)}

    Instructions:
    - Return a JSON dictionary where the key is the Job ID provided and the value is the Category.
    - Use 'Other' if the job doesn't match any category.
    - Output format: {{ "849": "Data Analyst", "277": "Data Engineer", ... }}

    Jobs to Classify:
    {jobs_text}
    """

    # 3. Call the model in JSON mode and validate the shape of the reply.
    try:
        response = client.models.generate_content(
            model=model_name,
            contents=prompt,
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
            )
        )
        parsed = json.loads(response.text)
        if not isinstance(parsed, dict):
            # Callers iterate `.items()`; anything but an object is unusable.
            raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
        return parsed
    except Exception as e:
        # Best effort: report and let the caller continue with the next batch.
        print(f"Error in batch: {e}")
        return {}

In [None]:
# Work on a copy so the raw frame `df` stays untouched and this cell can be re-run.
df_labeled = df.copy()

# Make sure the target column exists and can hold arbitrary Python strings.
if 'llm_based_category' not in df_labeled.columns:
    df_labeled['llm_based_category'] = None
df_labeled['llm_based_category'] = df_labeled['llm_based_category'].astype(object)

BATCH_SIZE = 100

print(f"Starting classification of {len(df_labeled)} jobs in batches of {BATCH_SIZE}...")

for i in tqdm(range(0, len(df_labeled), BATCH_SIZE)):
    # 1. Slice the batch (positional slice; the index labels act as job IDs)
    batch = df_labeled.iloc[i : i + BATCH_SIZE]

    # 2. Call Gemini
    results_dict = classify_batch_v2(batch, model_name="gemini-2.5-flash")

    # 3. Fill the DataFrame
    # The dictionary keys are the INDICES of the dataframe (as strings)
    for idx_str, category in results_dict.items():
        try:
            # Convert string key back to integer index (assumes an integer index)
            idx = int(idx_str)
            # `.at` on an unknown label would silently append a brand-new row, so
            # reject any ID the model returned that is not part of this batch.
            if idx not in batch.index:
                raise KeyError(f"ID {idx} is not in the current batch")
            df_labeled.at[idx, 'llm_based_category'] = category
        except Exception as e:
            print(f"Failed to map index {idx_str}: {e}")

    # Fixed pause between requests to stay under API rate limits
    time.sleep(2)

# 4. Save
df_labeled.to_csv("labeled_jobs_100_batch.csv", index=False)
print("Done! Saved to labeled_jobs_100_batch.csv")

# 5. Check missing
missing = df_labeled['llm_based_category'].isnull().sum()
print(f"Jobs left unclassified: {missing}")

In [None]:
# Preview the first rows, including the new 'llm_based_category' column.
df_labeled.head()

In [None]:
import os

# Destination for the labelled frame inside the mounted Drive folder.
save_filename = "labeled_jobs.csv"
save_directory = "/content/drive/Othercomputers/My laptop/Desktop/ResumeRecommenderMLops/data/processed/serpapi"

# Create the destination folder first (no-op when it already exists).
os.makedirs(save_directory, exist_ok=True)

# Write the frame without its index column.
save_path = os.path.join(save_directory, save_filename)
df_labeled.to_csv(save_path, index=False)

In [None]:
import json
import time
from google import genai
from google.colab import userdata
from IPython.display import display, JSON

# --- 1. CONFIGURATION & DATA ---

# The full list of categories. main() below walks this list in batches and asks
# the model for one detailed description per entry.
# NOTE: this re-declares the same list that the classification cells define above.
CATEGORIES = [
    "Software Engineer", "Software Developer", "Backend Engineer", "Full Stack Developer",
    "Data Scientist", "Machine Learning Engineer", "Generative AI Engineer", "LLM Engineer",
    "Data Analyst", "Data Engineer", "AI Engineer", "Frontend Developer", "React Developer",
    "Python Developer", "Java Developer", "DevOps Engineer", "MLOps Engineer", "Cloud Engineer",
    "Cloud Security Engineer", "Kubernetes Administrator", "Site Reliability Engineer",
    "Applied Machine Learning Engineer", "NLP Engineer", "Computer Vision Engineer",
    "Platform Engineer", "Analytics Engineer", "Data Architect", "Research Scientist",
    "Deep Learning Engineer", "ML Research Engineer", "FinOps Engineer", "Database Engineer",
    "SQL Developer", "Business Intelligence Engineer", "Mobile Application Developer",
    "Android Developer", "iOS Developer", "Cybersecurity Engineer", "Security Engineer",
    "Embedded Systems Engineer", "Firmware Engineer", "QA Engineer", "Automation Test Engineer",
    "SDET", "Game Developer", "AR/VR Engineer", "Graphics Programmer", "Technical Product Manager",
    "Solutions Engineer", "Product Engineer", "Blockchain Developer", "RPA Developer",
    "Salesforce Developer"
]


In [None]:
# The reference structure for one generated role, pasted verbatim into the prompt.
# It is a schema sketch rather than parseable JSON: values such as "String" and the
# bare word `boolean` are type placeholders for the model, so never json.loads() it.
JSON_TEMPLATE = """
{
  "job_title": "String",
  "role_category": "String",
  "role_level": ["Junior", "Mid", "Senior"],
  "role_summary": "Detailed description string",
  "primary_responsibilities": ["List of strings"],
  "technical_skills": {
    "frontend": [], "backend": [], "databases": [], "version_control": [], "deployment": []
  },
  "programming_languages": ["List of strings"],
  "frameworks_and_libraries": ["List of strings"],
  "tools_and_platforms": ["List of strings"],
  "non_technical_skills": ["List of strings"],
  "educational_background": { "preferred_degree": [], "degree_required": boolean },
  "experience_requirements": { "junior": "String", "mid": "String", "senior": "String" },
  "common_projects": ["List of strings"],
  "interview_topics": ["List of strings"],
  "common_tools_in_interviews": ["List of strings"],
  "career_progression": ["List of strings"],
  "related_roles": ["List of strings"],
  "industry_domains": ["List of strings"],
  "salary_range_in_india": { "junior": "String", "mid": "String", "senior": "String" },
  "resume_keywords": ["List of strings"],
  "skill_taxonomy": { "must_have": [], "good_to_have": [], "domain_specific": [] },
  "complexity_signals": { "high_impact_keywords": [], "valuable_certifications": [] },
  "alternative_titles": ["List of strings"]
}
"""


In [None]:
def setup_client():
    """Build a Gemini client from the 'gemini_key' Colab secret.

    Returns:
        The client, or None (after printing the failure details) if reading the
        secret or constructing the client raised.
    """
    gemini_client = None
    try:
        secret = userdata.get('gemini_key')
        gemini_client = genai.Client(api_key=secret)
    except Exception as e:
        print(f"Error: Could not retrieve API key. Make sure 'gemini_key' is set in Colab secrets. \nDetails: {e}")
    return gemini_client

def clean_json_response(response_text):
    """
    Strip surrounding whitespace and Markdown code fences from a model reply.

    A leading ```json and/or ``` and a trailing ``` are removed so that the
    remainder can be handed straight to ``json.loads``.
    """
    body = response_text.strip()
    # Opening fences are checked in this order: the language-tagged one first.
    for opening_fence in ("```json", "```"):
        if body.startswith(opening_fence):
            body = body[len(opening_fence):]
    closing_fence = "```"
    if body.endswith(closing_fence):
        body = body[:-len(closing_fence)]
    return body.strip()

def process_batch(client, batch_titles, template):
    """
    Sends a batch of titles to Gemini and requests a detailed JSON list.

    Args:
        client: Gemini client exposing ``models.generate_content``.
        batch_titles: list of job-title strings to describe.
        template: text of the expected per-role structure, embedded in the prompt.

    Returns:
        A list of role objects. A reply consisting of a single JSON object is
        wrapped in a one-element list; an empty input, a failed request, invalid
        JSON, or any other reply shape yields ``[]``.
    """
    # Nothing to ask for: skip the API call.
    if not batch_titles:
        return []

    prompt = f"""
    You are an expert technical recruiter and engineering manager.
    I will provide a list of job titles. For EACH title, generate a DETAILED JSON object following the exact structure provided below.

    CRITICAL INSTRUCTIONS:
    1. Return a valid JSON List `[...]` containing one object for every title in the input list.
    2. Do not omit any fields. Fill arrays with at least 5-7 relevant items.
    3. Ensure technical accuracy (e.g., dont list React as a skill for an Embedded Engineer).
    4. Return ONLY the JSON. No conversational text.

    ### INPUT JOB TITLES:
    {json.dumps(batch_titles)}

    ### REQUIRED JSON STRUCTURE:
    {template}
    """

    try:
        response = client.models.generate_content(
            model="gemini-2.5-flash",
            contents=prompt,
            config={"response_mime_type": "application/json"} # Enforce JSON mode
        )
        parsed = json.loads(clean_json_response(response.text))
        # Callers extend() a list with this value; a bare object would otherwise
        # contribute its keys instead of itself.
        if isinstance(parsed, dict):
            parsed = [parsed]
        if not isinstance(parsed, list):
            raise ValueError(f"expected a JSON list, got {type(parsed).__name__}")
        return parsed
    except Exception as e:
        # Best effort: report the failure and let the caller move on.
        print(f"‚ö†Ô∏è Error processing batch {batch_titles}: {e}")
        return []



In [None]:
# --- 3. MAIN EXECUTION LOOP ---

def main():
    """Generate a detailed description for every entry in CATEGORIES and save them.

    Categories are sent to ``process_batch`` ten at a time with a two-second pause
    after each batch. Everything returned is collected and written to
    ``detailed_job_descriptions.json`` in the working directory, and the first
    item is printed as a preview. Returns early if no client could be created.
    """
    client = setup_client()
    if not client:
        return

    BATCH_SIZE = 10
    total_categories = len(CATEGORIES)
    all_job_descriptions = []
    print(f"üöÄ Starting generation for {total_categories} categories...")

    batch_starts = range(0, total_categories, BATCH_SIZE)
    for batch_no, start in enumerate(batch_starts, start=1):
        batch = CATEGORIES[start : start + BATCH_SIZE]
        first_title, last_title = batch[0], batch[-1]
        print(f"   Processing batch {batch_no}: {first_title} ... {last_title}")

        # One API call per batch; an empty result signals failure.
        batch_results = process_batch(client, batch, JSON_TEMPLATE)
        if not batch_results:
            print("   ‚ùå Failed to generate batch.")
        else:
            all_job_descriptions.extend(batch_results)
            print(f"   ‚úÖ Successfully generated {len(batch_results)} roles.")

        # Pause between requests to respect rate limits.
        time.sleep(2)

    # Persist whatever was collected, even if some batches failed.
    output_filename = "detailed_job_descriptions.json"
    with open(output_filename, "w") as f:
        json.dump(all_job_descriptions, f, indent=2)

    print(f"\nüéâ Completed! Generated {len(all_job_descriptions)} descriptions.")
    print(f"üìÅ Data saved to '{output_filename}'")

    # Show the first generated role, if there is one.
    if all_job_descriptions:
        print("\n--- Preview of first item ---")
        preview = json.dumps(all_job_descriptions[0], indent=2)
        print(preview)

# Entry point: run the generation when this code executes as the top-level module.
if __name__ == "__main__":
    main()

In [None]:
# File produced by the generation step, relative to the runtime's working directory.
source_file = "detailed_job_descriptions.json"

# Knowledge-base folder on Drive; the copy keeps the same file name as the source.
target_dir = "/content/drive/Othercomputers/My laptop/Desktop/ResumeRecommenderMLops/data/constants/KB"
target_file = os.path.join(target_dir, source_file)

In [None]:
import shutil
import os

# Create the Drive folder on first use.
target_dir_missing = not os.path.exists(target_dir)
if target_dir_missing:
    print(f"üìÇ Creating directory: {target_dir}")
    os.makedirs(target_dir, exist_ok=True)

# The source file only exists once the generation cell has finished.
if not os.path.exists(source_file):
    print(f"‚ùå Error: Could not find '{source_file}' in the current directory.")
    print("Please make sure the generation script has finished running completely before running this cell.")
else:
    print("üöö Moving file from Colab runtime to Drive...")

    # copy2 copies the file (the source stays in place) along with its metadata.
    shutil.copy2(source_file, target_file)
    print(f"‚úÖ Success! File saved to: {target_file}")

    # Report the size of the copy so an empty file is easy to spot.
    size = os.path.getsize(target_file)
    print(f"üìä File size: {size / 1024:.2f} KB")

## Processing missing parts of the generated JSON

In case some categories are missing from the filled JSON, we go through the file and regenerate only the missing entries.

In [None]:
import json
import time
import os
import random
from google import genai
from google.colab import userdata
from google.colab import drive

# --- 1. SETUP & PATHS ---
# Mount Google Drive so the knowledge-base file below is reachable.
drive.mount('/content/drive')

# Knowledge-base folder on Drive and the JSON file that main() reads and rewrites.
SAVE_DIR = "/content/drive/Othercomputers/My laptop/Desktop/ResumeRecommenderMLops/data/constants/KB"
FILE_PATH = os.path.join(SAVE_DIR, "detailed_job_descriptions.json")

# The Master List: every role that must have an entry in the saved file. main()
# compares it case-insensitively with the "job_title" values already present.
ALL_CATEGORIES = [
    "Software Engineer", "Software Developer", "Backend Engineer", "Full Stack Developer",
    "Data Scientist", "Machine Learning Engineer", "Generative AI Engineer", "LLM Engineer",
    "Data Analyst", "Data Engineer", "AI Engineer", "Frontend Developer", "React Developer",
    "Python Developer", "Java Developer", "DevOps Engineer", "MLOps Engineer", "Cloud Engineer",
    "Cloud Security Engineer", "Kubernetes Administrator", "Site Reliability Engineer",
    "Applied Machine Learning Engineer", "NLP Engineer", "Computer Vision Engineer",
    "Platform Engineer", "Analytics Engineer", "Data Architect", "Research Scientist",
    "Deep Learning Engineer", "ML Research Engineer", "FinOps Engineer", "Database Engineer",
    "SQL Developer", "Business Intelligence Engineer", "Mobile Application Developer",
    "Android Developer", "iOS Developer", "Cybersecurity Engineer", "Security Engineer",
    "Embedded Systems Engineer", "Firmware Engineer", "QA Engineer", "Automation Test Engineer",
    "SDET", "Game Developer", "AR/VR Engineer", "Graphics Programmer", "Technical Product Manager",
    "Solutions Engineer", "Product Engineer", "Blockchain Developer", "RPA Developer",
    "Salesforce Developer"
]

# Per-role structure embedded verbatim in the prompt built by generate_with_backoff.
# It is a schema sketch, not parseable JSON ("String" values and the bare word
# `boolean` are type placeholders), so it must not be passed to json.loads().
JSON_TEMPLATE = """
{
  "job_title": "String",
  "role_category": "String",
  "role_level": ["Junior", "Mid", "Senior"],
  "role_summary": "Detailed description string",
  "primary_responsibilities": ["List of strings"],
  "technical_skills": {
    "frontend": [], "backend": [], "databases": [], "version_control": [], "deployment": []
  },
  "programming_languages": ["List of strings"],
  "frameworks_and_libraries": ["List of strings"],
  "tools_and_platforms": ["List of strings"],
  "non_technical_skills": ["List of strings"],
  "educational_background": { "preferred_degree": [], "degree_required": boolean },
  "experience_requirements": { "junior": "String", "mid": "String", "senior": "String" },
  "common_projects": ["List of strings"],
  "interview_topics": ["List of strings"],
  "common_tools_in_interviews": ["List of strings"],
  "career_progression": ["List of strings"],
  "related_roles": ["List of strings"],
  "industry_domains": ["List of strings"],
  "salary_range_in_india": { "junior": "String", "mid": "String", "senior": "String" },
  "resume_keywords": ["List of strings"],
  "skill_taxonomy": { "must_have": [], "good_to_have": [], "domain_specific": [] },
  "complexity_signals": { "high_impact_keywords": [], "valuable_certifications": [] },
  "alternative_titles": ["List of strings"]
}
"""

# --- 2. HELPER FUNCTIONS ---

def clean_json_response(response_text):
    """Return `response_text` without surrounding whitespace or Markdown code fences."""
    text = response_text.strip()

    # Advance past a leading ```json and/or ``` fence.
    start = 0
    if text.startswith("```json"):
        start = 7
    if text.startswith("```", start):
        start += 3

    # Drop a trailing ``` fence, looking only at what is left after `start`.
    end = len(text)
    if text.endswith("```", start):
        end -= 3

    return text[start:end].strip()

def generate_with_backoff(client, batch_titles, retries=3):
    """Request role descriptions for `batch_titles`, retrying with exponential backoff.

    Any exception (request failure, invalid JSON, unexpected reply shape) counts as
    a failed attempt. Between attempts the function waits ``2 ** attempt`` seconds
    plus up to one second of random jitter; it does not wait after the last one.

    Args:
        client: Gemini client exposing ``models.generate_content``.
        batch_titles: list of job-title strings to generate objects for.
        retries: maximum number of attempts.

    Returns:
        A list of role objects (a single JSON object reply is wrapped in a list),
        or ``[]`` when every attempt failed.
    """
    prompt = f"""
    You are an expert technical recruiter.
    Generate a JSON List of objects for these job titles: {json.dumps(batch_titles)}
    Follow this structure exactly:
    {JSON_TEMPLATE}
    Return ONLY valid JSON.
    """

    for attempt in range(retries):
        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash",
                contents=prompt,
                config={"response_mime_type": "application/json"}
            )
            parsed = json.loads(clean_json_response(response.text))
            # Callers extend() a list with this value, so normalise a bare object
            # and treat any other shape as a failed attempt worth retrying.
            if isinstance(parsed, dict):
                parsed = [parsed]
            if not isinstance(parsed, list):
                raise ValueError(f"expected a JSON list, got {type(parsed).__name__}")
            return parsed
        except Exception as e:
            if attempt == retries - 1:
                # Last attempt: report it and give up without a pointless wait.
                print(f"   ‚ö†Ô∏è Attempt {attempt+1} failed ({e}).")
                break
            wait_time = (2 ** attempt) + random.uniform(0, 1) # 1s, 2s, 4s... plus jitter
            print(f"   ‚ö†Ô∏è Attempt {attempt+1} failed ({e}). Retrying in {wait_time:.1f}s...")
            time.sleep(wait_time)

    print(f"   ‚ùå Failed to generate batch {batch_titles} after {retries} attempts.")
    return []

# --- 3. MAIN LOGIC ---

def main():
    """Regenerate only the categories missing from the saved job-description file.

    Steps: load FILE_PATH if it exists, compare its "job_title" values
    (case-insensitively) with ALL_CATEGORIES, request the missing titles in batches
    of 5 through ``generate_with_backoff``, then write the merged, title-sorted
    list back to FILE_PATH. Nothing is written when no new data was generated.

    Entries without a usable string "job_title" (null title, non-dict item) stay in
    the file; they are ignored for matching and sort as if their title were "".
    A file that is not valid JSON, or whose top level is not a list, is treated as
    unreadable and the run starts from an empty list.
    """

    def _title_of(item):
        # Model-generated JSON may contain null titles or stray non-dict items.
        title = item.get("job_title") if isinstance(item, dict) else None
        return title if isinstance(title, str) else ""

    # A. Load Existing Data
    if os.path.exists(FILE_PATH):
        try:
            with open(FILE_PATH, "r", encoding="utf-8") as f:
                existing_data = json.load(f)
            if not isinstance(existing_data, list):
                # Fail here rather than at merge time, after API calls were spent.
                raise ValueError("top-level JSON value is not a list")
            print(f"üìÇ Loaded {len(existing_data)} existing roles from Drive.")
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass, so both cases land here.
            print("‚ö†Ô∏è Error reading existing file. Starting fresh.")
            existing_data = []
    else:
        print("üìÇ No existing file found. Starting fresh.")
        existing_data = []

    # B. Identify Missing Categories
    # We normalize to lowercase for comparison to avoid case-sensitivity issues
    existing_titles = {_title_of(item).lower() for item in existing_data}

    missing_categories = [
        cat for cat in ALL_CATEGORIES
        if cat.lower() not in existing_titles
    ]

    if not missing_categories:
        print("‚úÖ All categories are already present! No action needed.")
        return

    print(f"üîç Found {len(missing_categories)} missing categories: {missing_categories}")

    # C. Initialize Client
    try:
        api_key = userdata.get('gemini_key')
        client = genai.Client(api_key=api_key)
    except Exception as e:
        print("‚ùå API Key error.")
        print(f"   Details: {e}")
        return

    # D. Generation Loop (Only for Missing)
    BATCH_SIZE = 5 # Reduced batch size slightly for better stability on retry
    new_data = []

    for i in range(0, len(missing_categories), BATCH_SIZE):
        batch = missing_categories[i : i + BATCH_SIZE]
        print(f"   Processing retry batch {i//BATCH_SIZE + 1}: {batch}")

        batch_result = generate_with_backoff(client, batch)

        if batch_result:
            new_data.extend(batch_result)
            print(f"   ‚úÖ Recovered {len(batch_result)} roles.")

        # Polite delay
        time.sleep(2)

    # E. Merge and Save
    if new_data:
        combined_data = existing_data + new_data

        # Sort for neatness (optional); the key tolerates entries without a title
        combined_data.sort(key=_title_of)

        with open(FILE_PATH, "w", encoding="utf-8") as f:
            json.dump(combined_data, f, indent=2)

        print(f"\nüéâ Success! Added {len(new_data)} missing roles.")
        print(f"üíæ Total roles in file: {len(combined_data)}")
        print(f"üìÅ Updated file saved to: {FILE_PATH}")
    else:
        print("\n‚ö†Ô∏è No new data was generated during retry.")
# Entry point: run the refill when this code executes as the top-level module.
if __name__ == "__main__":
    main()

In [None]:
from google.colab import files

# Hand the knowledge-base JSON at FILE_PATH to Colab's `files.download` helper.
files.download(FILE_PATH)
