In [0]:
import os
# Data directories are resolved relative to the kernel's current working directory,
# so the notebook must be started from the folder that contains "data/".
curr_dir = os.getcwd()
data_path = os.path.join(curr_dir, "data/")
# Folder the question CSVs are read from (and the combined LeetCode CSV is written to).
questions_data_path = os.path.join(data_path, "questions_and_answers/")

In [0]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_solution_url(problem_number):
    """
    Build the GitHub page URL of the Java solution for a LeetCode problem.

    Solutions are grouped into per-thousand folders: numbers below 1000 map to
    "firstthousand", below 2000 to "secondthousand", below 3000 to
    "thirdthousand", and everything else to "fourththousand".
    """
    solutions_root = "https://github.com/fishercoder1534/Leetcode/blob/master/src/main/java/com/fishercoder/solutions/"

    # (exclusive upper bound, folder) pairs, checked in ascending order.
    folder_bounds = (
        (1000, "firstthousand"),
        (2000, "secondthousand"),
        (3000, "thirdthousand"),
    )

    # Anything that clears every bound lands in the last folder.
    folder = "fourththousand"
    for upper_bound, candidate in folder_bounds:
        if problem_number < upper_bound:
            folder = candidate
            break

    return f"{solutions_root}{folder}/_{problem_number}.java"


def fetch_solution(url, timeout=10):
    """
    Fetch the raw content of the solution from the GitHub file URL.

    Args:
        url: GitHub "blob" page URL of the solution file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the caller forever.

    Returns:
        The file content as text when the server answers 200, otherwise None
        (non-200 status, timeout, or any other error while fetching).
    """
    try:
        # Convert the GitHub page URL to the raw content URL
        raw_url = url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")

        # Fetch the solution content, bounded by the timeout
        response = requests.get(raw_url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        # Failed to fetch solution (e.g. no solution file for this problem) - ignore
        return None
    except Exception as e:
        # Best-effort by design: one bad download must not abort a long scraping run.
        print(f"Error fetching solution from {url}: {e}")
        return None


def _slug_from_url(urls):
    """Return the last non-empty path segment of each URL in a string Series.

    A trailing slash is stripped first, so ".../two-sum" and ".../two-sum/"
    both yield "two-sum". Missing values stay missing.
    """
    return urls.str.rstrip("/").str.split("/").str[-1]


def process_leetcode_solutions():
    """
    Combine the LeetCode problem CSVs and attach a solution to every problem.

    Reads "leetcode_problems_data.csv" and "leetcode_problems_metadata.csv" from
    `questions_data_path`, outer-merges them, downloads each problem's solution
    via `get_solution_url` / `fetch_solution`, and writes the result to
    "leetcode_problems_with_solutions.csv" in the same folder.

    Returns:
        pandas.DataFrame: the merged problems with an extra "solution" column
        holding the downloaded text, or None where there is no question id or
        the download failed.
    """
    # Load the LeetCode problems dataset
    # Original columns: question_id,title,content,difficulty,likes,dislikes,slug.
    # New columns: question_id,formatted_title,content,difficulty.
    # TODO: Need to remove html tags from Content column
    leetcode_problem_content = (
        pd.read_csv(os.path.join(questions_data_path, "leetcode_problems_data.csv"))
        .drop(columns=["title", "likes", "dislikes"])
        .rename(columns={"slug": "formatted_title"})
        .assign(formatted_title=lambda df: df["formatted_title"].str.lower())
    )

    # Original columns: id,page_number,is_premium,title,problem_description,topic_tags,difficulty,similar_questions,no_similar_questions,acceptance,accepted,submission,solution,discussion_count,likes,dislikes,problem_URL,solution_URL.
    # New columns: question_id,content,topic_tags,difficulty,similar_questions,no_similar_questions,acceptance,formatted_title,solution_URL.
    # TODO: May need to remove prefix numbers from the Title column.
    leetcode_problem_meta = (
        pd.read_csv(os.path.join(questions_data_path, "leetcode_problems_metadata.csv"))
        .drop(columns=["page_number", "is_premium", "title", "accepted", "submission", "solution", "discussion_count", "likes", "dislikes"])
        .assign(problem_URL=lambda df: _slug_from_url(df["problem_URL"]))
        .rename(columns={"id": "question_id", "problem_description": "content", "problem_URL": "formatted_title"})
    )

    # TODO: "leetcode_problems&solutions_links.csv" (columns: name,link,difficulty,solution)
    # is not merged in yet, so it is not loaded here.

    # Merge datasets
    # NOTE(review): "content" is one of the merge keys. If the two CSVs format the
    # problem statement differently (see the HTML TODO above), the same problem will
    # not match and the outer merge keeps both rows - confirm this is intended.
    leetcode_combined = pd.merge(
        leetcode_problem_content,
        leetcode_problem_meta,
        on=["question_id", "formatted_title", "content", "difficulty"],
        how="outer",
    )

    # Fetch solutions, downloading each problem id only once even when the merge
    # produced several rows for it.
    solution_by_id = {}
    solutions = []
    for problem_id in leetcode_combined["question_id"]:
        if pd.isna(problem_id):
            solutions.append(None)
            continue
        key = int(problem_id)
        if key not in solution_by_id:
            solution_by_id[key] = fetch_solution(get_solution_url(key))
        solutions.append(solution_by_id[key])
    leetcode_combined["solution"] = solutions

    # Save the updated dataset
    leetcode_combined.to_csv(os.path.join(questions_data_path, "leetcode_problems_with_solutions.csv"), index=False)
    print("Updated LeetCode dataset saved with solutions included.")

    return leetcode_combined

# Run the function (downloads one solution file per problem, so this cell is slow)
leetcode_with_solutions = process_leetcode_solutions()
# pandas DataFrames have no .display() method; end the cell on a small preview
# so the notebook renders it as a table.
leetcode_with_solutions.head()

In [0]:
import pandas as pd
import glob

def _open_questions_frame(path, category, skills, source):
    """Load an open-ended question CSV and tag every row with fixed labels.

    Only the question text is kept (renamed from "Question" to "content").
    NOTE(review): assumes the CSV has a "Question" column - confirm against the files.
    """
    return (
        pd.read_csv(path)
        .rename(columns={"Question": "content"})[["content"]]
        .assign(category=category, skills=skills, source=source)
    )


def load_and_preprocess_datasets(data_dir=None, output_path="unified_questions_database.csv"):
    """
    Build one question table from the LeetCode and open-ended question CSVs.

    Args:
        data_dir: Folder containing the input CSVs. Defaults to the notebook-level
            `questions_data_path`, the folder `process_leetcode_solutions` reads the
            same LeetCode CSVs from.
            NOTE(review): assumes the open-ended question CSVs live there too - confirm.
        output_path: Where the unified table is written as CSV.

    Returns:
        pandas.DataFrame with columns question_id, title, content, difficulty,
        category, skills, source. LeetCode rows come first; open-ended rows have
        no question_id, title or difficulty.
    """
    if data_dir is None:
        data_dir = questions_data_path

    def _csv(name):
        return os.path.join(data_dir, name)

    unified_schema_columns = ["question_id", "title", "content", "difficulty", "category", "skills", "source"]

    # --- LeetCode datasets ---
    # Columns: question_id,title,content,difficulty,likes,dislikes,slug
    # TODO: Need to remove html tags from Content column
    leetcode_problem_content = pd.read_csv(_csv("leetcode_problems_data.csv"))
    # Columns: id,page_number,is_premium,title,problem_description,topic_tags,difficulty,...
    leetcode_meta = pd.read_csv(_csv("leetcode_problems_metadata.csv"))

    # Only the topic tags are taken from the metadata. Its key column is "id", and
    # duplicate ids are dropped so the left merge cannot multiply problem rows.
    topic_by_question = (
        leetcode_meta.rename(columns={"id": "question_id"})[["question_id", "topic_tags"]]
        .drop_duplicates(subset="question_id")
    )
    leetcode_final = (
        leetcode_problem_content[["question_id", "title", "content", "difficulty"]]
        .merge(topic_by_question, on="question_id", how="left")
        .rename(columns={"topic_tags": "skills"})
        .assign(category="Coding", source="LeetCode")
        .reindex(columns=unified_schema_columns)
    )

    # --- Open-ended question datasets ---
    data_science_final = _open_questions_frame(
        _csv("open_questions_data_science.csv"),
        category="Data Science",
        skills="Data Science",
        source="OpenQuestions_DS",
    )
    general_final = _open_questions_frame(
        _csv("general_open_questions.csv"),
        category="General",
        skills="General Skills",
        source="OpenQuestions_General",
    )

    # TODO: the job-description CSVs (job_descriptions_and_skills, linkedin_hightech_jobs,
    # indeed_jobs, glassdoor_data_jobs_and_company_info, linkedin_data_jobs) and the
    # LeetCode links CSV are not part of the unified table yet, so they are not loaded.

    # --- Concatenate all into one database ---
    # concat fills the columns the open-ended frames lack with missing values.
    # question_id is then converted to a nullable dtype so whole-number ids are
    # not written out as floats ("1.0").
    unified_db = (
        pd.concat([leetcode_final, data_science_final, general_final], ignore_index=True)
        .reindex(columns=unified_schema_columns)
        .assign(question_id=lambda df: df["question_id"].convert_dtypes())
    )

    # Save the unified database
    unified_db.to_csv(output_path, index=False)
    print(f"Unified database created and saved as '{output_path}'")

    return unified_db

# Build the unified question table; the function also writes it to disk as CSV.
unified_database = load_and_preprocess_datasets()