In [57]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import os

# LeetCode Dataset Extraction

In [2]:
# Scrape the LeetCode problem list with leetscrape, keep a handful of
# descriptive columns, preview them, and save them to a CSV file that the
# following cell reloads.
print("Starting LeetCode problem list scraping...")

try:
    # Imported inside the try block so that a missing or broken leetscrape
    # install is reported by the ImportError handler below; with the import
    # placed before the try, that handler could never see the failure.
    from leetscrape import GetQuestionsList

    # 1. Create the scraper object.
    ls = GetQuestionsList()

    # 2. Scrape the list of all questions.
    ls.scrape()
    print("Scraping complete.")

    # 3. The scraped table is read from the `questions` attribute.
    df = ls.questions

    if df is not None and not df.empty:
        # 4. Columns to keep, mapped from the wanted fields:
        #    Problem ID     -> QID
        #    Question Name  -> title
        #    Difficulty     -> difficulty
        #    Category       -> categorySlug
        #    Problem Tags   -> topicTags
        requested_columns = [
            'QID',
            'title',
            'difficulty',
            'categorySlug',
            'topicTags'
        ]

        # 5. Keep only the requested columns that actually exist, and say so
        #    when some are missing instead of dropping them silently (a
        #    missing column would otherwise only show up later as a KeyError
        #    when the saved CSV is analysed).
        available_columns = [col for col in requested_columns if col in df.columns]
        missing_columns = [col for col in requested_columns if col not in df.columns]
        if missing_columns:
            print(f"Warning: requested columns not found and skipped: {missing_columns}")
        filtered_df = df[available_columns]

        # 6. Display the first 5 rows of the filtered data.
        print("\n--- Filtered LeetCode Problem Data (First 5 Rows) ---")
        print(filtered_df.head())

        # 7. Save the filtered data (without the index) to a new CSV.
        filtered_csv_path = "leetcode_filtered_problems.csv"
        filtered_df.to_csv(filtered_csv_path, index=False)
        print(f"\nFiltered data saved to '{filtered_csv_path}'")

    else:
        print("Error: The DataFrame 'ls.questions' is empty or was not created.")

except ImportError as e:
    print(f"Import Error: {e}")
    print("Please ensure 'leetscrape' and 'pandas' are installed correctly.")
except Exception as e:
    # NOTE(review): failures are only printed, so a later cell that reads
    # "leetcode_filtered_problems.csv" may pick up a stale file from an
    # earlier run. Re-raise here if that should stop the notebook.
    print(f"An error occurred: {e}")

Starting LeetCode problem list scraping...
Scraping companies ... Done
Scraping questions list ... Done
Extracting question topics ... Done
Getting Categories ... Done
Scraping Topic Tags ... Done
Extracting question category ... Done
Scraping complete.

--- Filtered LeetCode Problem Data (First 5 Rows) ---
  QID                                           title difficulty categorySlug   
0   1                                         Two Sum       Easy   algorithms  \
1   2                                 Add Two Numbers     Medium   algorithms   
2   3  Longest Substring Without Repeating Characters     Medium   algorithms   
3   4                     Median of Two Sorted Arrays       Hard   algorithms   
4   5                   Longest Palindromic Substring     Medium   algorithms   

                                 topicTags  
0                         array,hash-table  
1               linked-list,math,recursion  
2         hash-table,string,sliding-window  
3   array,binary-search,

In [None]:
import pandas as pd

# Reload the filtered LeetCode problem list from the CSV written by the
# scraping cell, then show it as the cell's output.
lc = pd.read_csv(filepath_or_buffer="leetcode_filtered_problems.csv")
lc

Unnamed: 0,QID,title,difficulty,categorySlug,topicTags
0,1,Two Sum,Easy,algorithms,"array,hash-table"
1,2,Add Two Numbers,Medium,algorithms,"linked-list,math,recursion"
2,3,Longest Substring Without Repeating Characters,Medium,algorithms,"hash-table,string,sliding-window"
3,4,Median of Two Sorted Arrays,Hard,algorithms,"array,binary-search,divide-and-conquer"
4,5,Longest Palindromic Substring,Medium,algorithms,"two-pointers,string,dynamic-programming"
...,...,...,...,...,...
3730,3731,Find Missing Elements,Easy,algorithms,"array,hash-table,sorting"
3731,3732,Maximum Product of Three Elements After One Re...,Medium,algorithms,"array,math,greedy,sorting"
3732,3733,Minimum Time to Complete All Deliveries,Medium,algorithms,"math,binary-search"
3733,3734,Lexicographically Smallest Palindromic Permuta...,Hard,algorithms,"two-pointers,string,enumeration"


In [21]:
# Frequency of each distinct `topicTags` value. Every value is the whole
# comma-separated tag combination of a problem, so this counts combinations
# (e.g. "array,dynamic-programming"), not individual tags.
lc.loc[:, "topicTags"].value_counts()

topicTags
database                                                        310
array,dynamic-programming                                        94
string                                                           62
array                                                            61
math                                                             49
                                                               ... 
array,math,dynamic-programming,greedy,sorting                     1
array,hash-table,string,sorting,counting                          1
linked-list,tree,depth-first-search,binary-tree                   1
array,stack,design                                                1
string,binary-search,rolling-hash,suffix-array,hash-function      1
Name: count, Length: 1278, dtype: int64

# Codeforces Dataset Extraction

In [39]:
import requests
import pandas as pd
import os

# Download the Codeforces problem set from the public API, merge each problem
# with its statistics, drop problems lacking a rating or tags, preview the
# result, and save it to "codeforces_problems_cleaned.csv".
print("Starting the process: Fetching and Cleaning Codeforces Data...")

try:
    # 1. Make the API request
    url = "https://codeforces.com/api/problemset.problems"
    print("Fetching data from Codeforces API...")
    # requests applies no timeout unless one is given, so a stalled
    # connection would hang this cell forever. A timeout raises
    # requests.exceptions.Timeout, which the RequestException handler
    # below already covers.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Raises an exception for bad responses

    data = response.json()

    if data['status'] == 'OK':
        # 2. Get the list of problems and problem statistics
        problems = data['result']['problems']
        problem_stats = data['result']['problemStatistics']

        # 3. Convert to pandas DataFrames for easy handling
        df_problems = pd.DataFrame(problems)
        df_stats = pd.DataFrame(problem_stats)

        # 4. Merge the two dataframes on the (contestId, index) pair that
        #    both tables carry, so each problem row gains its statistics.
        df_full = pd.merge(df_problems, df_stats, on=['contestId', 'index'])

        print(f"Successfully fetched {len(df_full)} total problems.")

        # 5. --- START CLEANING ---
        print("\n--- Cleaning Data ---")

        original_count = len(df_full)

        # Step 1: Drop rows with missing 'rating' (NaN values)
        df_cleaned = df_full.dropna(subset=['rating'])
        ratings_removed_count = original_count - len(df_cleaned)
        print(f"Step 1: Removed {ratings_removed_count} problems with missing 'rating'.")

        # Step 2: Drop rows with empty 'tags'
        # The 'tags' column from the API is a list, so a length of 0 means
        # the problem has no tags at all.
        count_before_tags_clean = len(df_cleaned)
        df_cleaned = df_cleaned[df_cleaned['tags'].apply(len) > 0]
        tags_removed_count = count_before_tags_clean - len(df_cleaned)
        print(f"Step 2: Removed {tags_removed_count} problems with empty 'tags'.")

        # 6. --- END CLEANING ---

        print("\n--- Summary ---")
        print(f"Original problem count: {original_count}")
        print(f"Final cleaned problem count: {len(df_cleaned)}")

        # 7. Display the head of the cleaned data, limited to the columns of
        #    interest that are actually present.
        print("\n--- Head of Cleaned Data ---")
        columns_to_show = ['contestId', 'index', 'name', 'rating', 'tags', 'solvedCount']
        available_columns = [col for col in columns_to_show if col in df_cleaned.columns]
        print(df_cleaned[available_columns].head())

        # 8. Save the cleaned data to a new CSV
        # NOTE(review): 'tags' holds Python lists, so the CSV presumably
        # stores their text representation; code that reloads this file
        # will need to parse that column back into lists. Confirm against
        # the cells that read it.
        cleaned_file_name = "codeforces_problems_cleaned.csv"
        df_cleaned[available_columns].to_csv(cleaned_file_name, index=False)
        print(f"\nCleaned data has been saved to '{cleaned_file_name}'")

    else:
        print(f"API Error: {data.get('comment', 'Unknown error')}")

except requests.exceptions.RequestException as e:
    print(f"An error occurred during the request: {e}")
except KeyError:
    print("Error: Could not parse the API response. 'result' or 'problems' key might be missing.")
except Exception as e:
    print(f"An error occurred: {e}")

Starting the process: Fetching and Cleaning Codeforces Data...
Fetching data from Codeforces API...
Successfully fetched 10770 total problems.

--- Cleaning Data ---
Step 1: Removed 285 problems with missing 'rating'.
Step 2: Removed 159 problems with empty 'tags'.

--- Summary ---
Original problem count: 10770
Final cleaned problem count: 10326

--- Head of Cleaned Data ---
    contestId index                   name  rating   
11       2162     H      Beautiful Problem  2900.0  \
12       2162     G         Beautiful Tree  2200.0   
13       2162     F    Beautiful Intervals  2100.0   
14       2162     E  Beautiful Palindromes  1600.0   
15       2162     D  Beautiful Permutation  1400.0   

                                                 tags  solvedCount  
11                                               [dp]          185  
12  [constructive algorithms, math, probabilities,...         2055  
13                  [constructive algorithms, greedy]         2130  
14       [constructiv