In [0]:
import os
import pandas as pd

# Every dataset directory is resolved against the notebook's working directory.
curr_dir = os.getcwd()

# Root folder holding all input/output CSVs.
data_path = os.path.join(curr_dir, "data/")

# Folder with the interview-question CSVs, built in one join from the root.
questions_data_path = os.path.join(curr_dir, "data/", "questions_and_answers/")

In [0]:
import requests
from bs4 import BeautifulSoup

def clean_html_content(html_content):
    """
    Turn an HTML fragment into plain text with single-space separation.

    Anything that is not a string (for example the NaN pandas uses for a
    missing cell) yields None instead of raising.
    """
    if isinstance(html_content, str):
        # Strip the markup, putting a space between adjacent text nodes
        extracted = BeautifulSoup(html_content, "html.parser").get_text(separator=" ")

        # split() without arguments discards every whitespace run, so
        # re-joining on a single space collapses newlines, tabs and repeats
        tokens = extracted.split()
        return " ".join(tokens)

    # Non-string input (e.g., NaN): nothing to clean
    return None

def get_solution_url(problem_number):
    """
    Build the GitHub URL of the Java solution file for a problem number.

    Solutions are grouped into one folder per block of a thousand problem
    numbers; everything from 3000 upward goes to the last folder.
    """
    base_url = "https://github.com/fishercoder1534/Leetcode/blob/master/src/main/java/com/fishercoder/solutions/"

    # (exclusive upper bound, folder) pairs, checked in ascending order
    folder_bounds = (
        (1000, "firstthousand"),
        (2000, "secondthousand"),
        (3000, "thirdthousand"),
    )

    for upper_bound, folder_name in folder_bounds:
        if problem_number < upper_bound:
            folder = folder_name
            break
    else:
        # No bound matched: the number is 3000 or above
        folder = "fourththousand"

    return f"{base_url}{folder}/_{problem_number}.java"


def fetch_solution(url, timeout=10):
    """
    Fetch the raw content of the solution from the GitHub file URL.

    Args:
        url: GitHub "blob" page URL of the solution file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely, so a single stalled
            connection would hang a loop that calls this once per problem.

    Returns:
        The file's text when the server answers with status 200; None when
        the status is anything else or when any error occurs (errors are
        printed, never raised).
    """
    try:
        # Convert the GitHub page URL to the raw content URL. Only the first
        # occurrence of each marker is rewritten so that a path which happens
        # to contain "/blob/" or "github.com" further along is left intact.
        raw_url = url.replace("github.com", "raw.githubusercontent.com", 1).replace("/blob/", "/", 1)

        # Fetch the solution content, bounded by the timeout
        response = requests.get(raw_url, timeout=timeout)
        if response.status_code == 200:
            return response.text

        # Failed to fetch solution (e.g., no solution file for this problem) - ignore
        return None
    except Exception as e:
        # Best effort: one bad URL or network failure must not abort the caller's loop
        print(f"Error fetching solution from {url}: {e}")
        return None


def unify_leetcode_datasets():
    """
    Build one LeetCode table from the problem-content and problem-metadata
    CSVs, attach a solution fetched from GitHub, and save the result.

    Reads "leetcode_problems_data.csv" and "leetcode_problems_metadata.csv"
    from questions_data_path, outer-merges them, downloads one solution per
    distinct question id (via get_solution_url and fetch_solution) and writes
    "all_code_problems_with_solutions.csv" to the same directory.

    Returns:
        The merged DataFrame with an extra "solution" column, which is None
        where the row has no question id or no solution could be fetched.
    """
    # Load the LeetCode problems dataset
    leetcode_problem_content = (
        pd.read_csv(os.path.join(questions_data_path, "leetcode_problems_data.csv"))
        .drop(columns=["title", "likes", "dislikes"])
        .rename(columns={"slug": "formatted_title",
                         "content": "question"})
        # .str.lower() passes a missing slug through as NaN instead of raising
        .assign(formatted_title=lambda df: df["formatted_title"].str.lower())
    )
    leetcode_problem_content["question"] = leetcode_problem_content["question"].apply(clean_html_content)
    # Original columns: question_id,title,content,difficulty,likes,dislikes,slug.
    # New columns: question_id,question,difficulty,formatted_title.

    leetcode_problem_meta = (
        pd.read_csv(os.path.join(questions_data_path, "leetcode_problems_metadata.csv"))
        .drop(columns=["page_number", "is_premium", "title", "accepted", "submission", "solution", "discussion_count", "likes", "dislikes"])
        # Keep only the last path segment (the slug). Stripping a trailing "/"
        # first means ".../two-sum/" yields "two-sum" rather than "".
        .assign(problem_URL=lambda df: df["problem_URL"].str.rstrip("/").str.split("/").str[-1])
        .rename(columns={"id": "question_id",
                         "problem_description": "question",
                         "problem_URL": "formatted_title",
                         "topic_tags": "topics"})
    )
    # Original columns: id,page_number,is_premium,title,problem_description,topic_tags,difficulty,similar_questions,no_similar_questions,acceptance,accepted,submission,solution,discussion_count,likes,dislikes,problem_URL,solution_URL.
    # New columns: question_id,question,topics,difficulty,similar_questions,no_similar_questions,acceptance,formatted_title,solution_URL.

    # NOTE(review): "leetcode_problems&solutions_links.csv" is not loaded here;
    # nothing from it takes part in the merge or the saved output.

    # Merge datasets
    # NOTE(review): "question" is a merge key, so a problem whose text differs
    # between the two files ends up on two separate rows - confirm this is intended.
    leetcode_combined = pd.merge(leetcode_problem_content, leetcode_problem_meta,
                                 on=["question_id", "formatted_title", "question", "difficulty"],
                                 how="outer")

    # Add a column for the solution
    leetcode_combined["solution"] = None

    # Fetch solutions. The same question id can sit on more than one row of
    # the outer merge, so each id is downloaded once and the result reused.
    solutions_by_id = {}
    for index, problem_id in leetcode_combined["question_id"].items():
        if pd.isna(problem_id):
            continue
        problem_number = int(problem_id)
        if problem_number not in solutions_by_id:
            solutions_by_id[problem_number] = fetch_solution(get_solution_url(problem_number))
        leetcode_combined.at[index, "solution"] = solutions_by_id[problem_number]

    # Save the updated dataset
    leetcode_combined.to_csv(os.path.join(questions_data_path, "all_code_problems_with_solutions.csv"), index=False)
    print("Unified code questions dataset saved with solutions included.")

    return leetcode_combined

# Build (and save) the unified LeetCode dataset, then render it.
leetcode_with_solutions = unify_leetcode_datasets()
# pandas DataFrames have no .display() method; use the notebook's display() function.
display(leetcode_with_solutions)

In [0]:
import glob

def unify_open_questions_datasets():
    """
    Combine the data-science and general open-ended question CSVs into one
    table, save it as "all_open_questions.csv" and return it.
    """
    # Data-science questions
    # Original columns: ID,DESCRIPTION
    # New columns: question_id,question,category,topics
    ds_frame = pd.read_csv(os.path.join(questions_data_path, "open_questions_data_science.csv"))
    ds_frame = ds_frame.rename(columns={"ID": "question_id", "DESCRIPTION": "question"})
    ds_frame = ds_frame.assign(category="Data Science", topics="Data Science")
    ds_count = len(ds_frame)

    # General questions carry no id of their own, so they are numbered
    # consecutively starting right after the data-science row count
    # Original columns: question
    # New columns: question,question_id,category,topics
    general_frame = pd.read_csv(os.path.join(questions_data_path, "general_open_questions.csv"))
    general_frame = general_frame.assign(
        question_id=lambda frame: ds_count + frame.index + 1,
        category="General",
        topics="Soft Skills",
    )

    # Stack both sources into a single table with a fresh 0..n-1 index
    combined = pd.concat([ds_frame, general_frame], ignore_index=True)

    # Persist the unified table
    output_file = os.path.join(questions_data_path, "all_open_questions.csv")
    combined.to_csv(output_file, index=False)
    print("Unified database created and saved.")

    return combined

# Build (and save) the unified open-questions dataset, then render it.
open_questions_df = unify_open_questions_datasets()
# pandas DataFrames have no .display() method; use the notebook's display() function.
display(open_questions_df)

In [0]:
# Folder holding the job-posting CSVs (inputs and the unified output).
jobs_path = os.path.join(data_path, "jobs/")

In [0]:
# Numeric rank for each seniority label used to map the LinkedIn seniority
# columns. Labels are grouped by rank: a group's position in the outer tuple
# is the level assigned to every label in it (English labels plus localized
# variants of the same level).
seniority_mapping = {
    label: level
    for level, labels in enumerate((
        ("Not Applicable", "Non pertinent", "Di-angkop"),  # 0
        ("Stagiaire / Alternant", "Internship"),           # 1
        ("Entry level", "Premier emploi"),                 # 2
        ("Associate",),                                    # 3
        ("Mid-Senior level", "Confirmé"),                  # 4
        ("Manager",),                                      # 5
        ("Director",),                                     # 6
        ("Executive",),                                    # 7
    ))
    for label in labels
}

def unify_jobpostings_datasets():
    """
    Normalise five job-posting CSVs to shared column names, outer-merge them
    into one table, save it as "all_jobpostings.csv" and return it.

    All files are read from and written to jobs_path. Seniority labels in the
    two LinkedIn files are converted to a numeric "level" via
    seniority_mapping; labels missing from the mapping become NaN.

    Returns:
        The merged DataFrame.
    """
    job_descriptions_and_skills = (
        pd.read_csv(os.path.join(jobs_path, "job_descriptions_and_skills.csv"))
        .rename(columns={"category": "field", "job_description": "job_summary", "job_skill_set": "skills"})
    )
    # Original columns: job_id,category,job_title,job_description,job_skill_set
    # New columns: job_id,field,job_title,job_summary,skills

    linkedin_hightech_jobs = (
        pd.read_csv(os.path.join(jobs_path, "linkedin_hightech_jobs.csv"))
        .drop(columns=["url", "company_id", "job_location", "job_employment_type", "job_base_pay_range", "company_url", "job_posted_time", "job_num_applicants", "discovery_input"])
        .rename(columns={"job_posting_id": "job_id",
                         "job_function": "field",
                         "job_industries": "company_industry"})
        .assign(level=lambda df: df["job_seniority_level"].map(seniority_mapping))
        # Drop the text label on the frame itself; calling .drop(columns=...)
        # on the mapped Series does nothing and leaves the column in place.
        .drop(columns=["job_seniority_level"])
    )
    # Original columns: url,job_posting_id,job_title,company_name,company_id,job_location,job_summary,apply_link,job_seniority_level,job_function,job_employment_type,job_industries,job_base_pay_range,company_url,job_posted_time,job_num_applicants,discovery_input
    # New columns: job_id,job_title,company_name,job_summary,apply_link,field,company_industry,level

    indeed_jobs = (
        pd.read_csv(os.path.join(jobs_path, "indeed_jobs.csv"))
        .drop(columns=["JOB_URL", "DATE_OF_POSTING", "WEBSITE", "SALARY", "REMOTE", "CITIES", "STATE", "COUNTRY", "JOB_TYPE", "ZIPCODE", "WEBSITEPOSTING"])
        .rename(columns={"JOB_TITLE": "job_title",
                         "COMPANY": "company_name",
                         "INDUSTRY": "company_industry",
                         "JOB_DESCRIPTION": "job_summary"})
    )
    # Original columns: JOB_URL,DATE_OF_POSTING,JOB_TITLE,COMPANY,WEBSITE,INDUSTRY,SALARY,REMOTE,CITIES,STATE,COUNTRY,JOB_TYPE,ZIPCODE,JOB_DESCRIPTION,WEBSITEPOSTING
    # New columns: job_title,company_name,company_industry,job_summary

    glassdoor_data_jobs = (
        pd.read_csv(os.path.join(jobs_path, "glassdoor_data_jobs_and_company_info.csv"))
        .drop(columns=["Salary Estimate", "Rating", "Location", "Size", "Founded", "Type of ownership", "Revenue"])
        .rename(columns={"Job Title": "job_title",
                         "Job Description": "job_summary",
                         "Company Name": "company_name",
                         "Industry": "company_industry",
                         "Sector": "field"})
        # "-1" marks a missing value. The dict form replaces it with None on
        # every pandas version; replace("-1", None) forward-fills on older ones.
        .replace({"-1": None})
        # Keep only the first line of the company name; the .str accessor
        # passes missing names through instead of raising on None.
        .assign(company_name=lambda df: df["company_name"].str.split("\n").str[0])
    )
    # Original columns: Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue
    # New columns: job_title,job_summary,company_name,company_industry,field

    linkedin_data_jobs = (
        pd.read_csv(os.path.join(jobs_path, "linkedin_data_jobs.csv"))
        .drop(columns=["Employment type", "company_id", "context", "date", "education", "location", "months_experience", "sal_high", "sal_low", "salary"])
        .rename(columns={"Industries": "company_industry",
                         "Job function": "field",
                         "company": "company_name",
                         "description": "job_summary",
                         "post_id": "job_id",
                         "post_url": "post_link",
                         "title": "job_title"})
        .assign(level=lambda df: df["Seniority level"].map(seniority_mapping))
        # The text label is replaced by the numeric level, as in the other LinkedIn file
        .drop(columns=["Seniority level"])
    )
    # Original columns: Employment type, Industries, Job function, Seniority level, company, company_id, context, date, description, education, location, months_experience, post_id, post_url, sal_high, sal_low, salary, title
    # New columns: company_industry,field,company_name,job_summary,job_id,post_link,job_title,level

    # Each merge joins on every column the two sides share, so rows are only
    # combined when all of those values agree; otherwise both rows are kept.
    merge1 = pd.merge(job_descriptions_and_skills, linkedin_hightech_jobs,
                      on=["job_id", "field", "job_title", "job_summary"],
                      how="outer")

    merge2 = pd.merge(merge1, indeed_jobs,
                      on=["job_title", "company_name", "company_industry", "job_summary"],
                      how="outer")

    merge3 = pd.merge(merge2, glassdoor_data_jobs,
                      on=["job_title", "job_summary", "company_name", "company_industry", "field"],
                      how="outer")

    merge4 = pd.merge(merge3, linkedin_data_jobs,
                      on=["company_industry", "field", "level", "company_name", "job_summary", "job_id", "job_title"],
                      how="outer")

    # Save the unified database
    merge4.to_csv(os.path.join(jobs_path, "all_jobpostings.csv"), index=False)
    print("Unified database created and saved.")

    return merge4

# Build (and save) the unified job-postings dataset, then render it in the notebook.
all_jobpostings = unify_jobpostings_datasets()
display(all_jobpostings)

You are creating a job interview simulator in Python. Before applying to an open position, an applicant can use your simulator to practice an interview for that job. This means the interview questions should be related to the job description and requirements. The candidate should also receive feedback on their performance in the simulation.

You have several datasets to train on; their schemas are:
"all_code_problems_with_solutions.csv": 
question_id,question,difficulty,formatted_title,topics,similar_questions,no_similar_questions,acceptance,solution_URL,solution.
"all_open_questions.csv":
question_id,question,category,topics.

