In [1]:
# Let's read the uploaded CSV file and take a look at its structure to determine if we can use it for building a recommender system.
import pandas as pd

# Load the CSV file
data = pd.read_csv('leetcode.csv')

# Display the first few rows of the dataset to understand its structure
data.head(), data.columns


(   id  page_number  is_premium  \
 0   1            1       False   
 1   2            1       False   
 2   3            1       False   
 3   4            1       False   
 4   5            1       False   
 
                                                title  \
 0                                         1. Two Sum   
 1                                 2. Add Two Numbers   
 2  3. Longest Substring Without Repeating Characters   
 3                     4. Median of Two Sorted Arrays   
 4                   5. Longest Palindromic Substring   
 
                                  problem_description  \
 0  Given an array of integers nums and an integer...   
 1  You are given two non-empty linked lists repre...   
 2  Given a string s, find the length of the longe...   
 3  Given two sorted arrays nums1 and nums2 of siz...   
 4  Given a string s, return the longest palindrom...   
 
                                        topic_tags difficulty  \
 0                           'Array

In [2]:
data.describe()

Unnamed: 0,id,page_number,no_similar_questions,acceptance,accepted,submission,solution,discussion_count,likes,dislikes
count,3000.0,3000.0,2160.0,3000.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0
mean,1500.5,30.5,1.448611,55.610367,247887.8,475256.5,1653.675926,27.246296,2774.90463,403.934259
std,866.169729,17.320989,1.810409,15.702667,537300.6,1118383.0,2515.823036,40.675622,4186.567949,986.693398
min,1.0,1.0,0.0,11.3,1200.0,2400.0,24.0,0.0,15.0,1.0
25%,750.75,15.75,0.0,44.0,23500.0,48975.0,286.0,6.0,456.0,44.0
50%,1500.5,30.5,1.0,55.0,66300.0,116900.0,706.0,12.0,1100.0,132.0
75%,2250.25,45.25,2.0,66.7,215150.0,385075.0,1900.0,33.0,3200.0,354.25
max,3000.0,60.0,21.0,94.2,11300000.0,22100000.0,26800.0,638.0,52700.0,17200.0


In [3]:
data.columns

Index(['id', 'page_number', 'is_premium', 'title', 'problem_description',
       'topic_tags', 'difficulty', 'similar_questions', 'no_similar_questions',
       'acceptance', 'accepted', 'submission', 'solution', 'discussion_count',
       'likes', 'dislikes', 'problem_URL', 'solution_URL'],
      dtype='object')

In [4]:
# import pandas as pd
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

# # Preprocess the topic_tags to remove quotes and spaces, and handle NaN values
# data['topic_tags'] = data['topic_tags'].fillna('').str.replace('[', '').str.replace(']', '').str.replace('\'', '').str.replace(' ', '')

# # Convert topic_tags into a count matrix
# vectorizer = CountVectorizer(tokenizer=lambda x: x.split(','))
# tag_matrix = vectorizer.fit_transform(data['topic_tags'])

# # Calculate the cosine similarity between problems based on topic_tags
# cosine_sim = cosine_similarity(tag_matrix, tag_matrix)

# # Function to recommend problems based on topic tags
# def recommend_by_topic_tags(problem_id, cosine_sim, data, n_recommendations=3):
#     # Get the index of the problem that matches the problem_id
#     idx = data.index[data['id'] == problem_id].tolist()[0]
    
#     # Get similarity scores for all problems with the given problem
#     sim_scores = list(enumerate(cosine_sim[idx]))
    
#     # Sort problems based on the similarity scores
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
#     # Get the indices of the top n similar problems (excluding the first one which is the problem itself)
#     sim_scores = sim_scores[1:n_recommendations+1]
    
#     # Get the corresponding problem IDs and details
#     problem_indices = [i[0] for i in sim_scores]
#     recommended_problems = data.iloc[problem_indices][['title', 'difficulty', 'problem_URL']]
    
#     return recommended_problems

# # Example usage: Recommend problems similar to problem with id 1
# problem_id = 512
# recommended_problems = recommend_by_topic_tags(problem_id, cosine_sim, data)
# print("Recommended Problems:")
# print(recommended_problems)


In [17]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

# Preprocess the topic_tags to remove quotes and spaces, and handle NaN values
data['topic_tags'] = data['topic_tags'].fillna('').str.replace('[', '').str.replace(']', '').str.replace('\'', '').str.replace(' ', '')

# Choose the vectorization method (CountVectorizer or TfidfVectorizer)
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(','))
# vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(','))  # Alternative vectorization

# Convert topic_tags into a vectorized matrix
tag_matrix = vectorizer.fit_transform(data['topic_tags'])

# Cosine Similarity Model
cosine_sim = cosine_similarity(tag_matrix, tag_matrix)

# Nearest Neighbors Model
nn_model = NearestNeighbors(n_neighbors=5, metric='cosine').fit(tag_matrix)

# K-Means Clustering Model
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(tag_matrix)
data['cluster'] = kmeans.labels_

# Function to recommend problems based on topic tags
def recommend_problems_by_tags(topic_tags_input, n_recommendations=5, method='cosine'):
    # Vectorize the input topic tags
    input_vector = vectorizer.transform([topic_tags_input])
    
    if method == 'cosine':
        # Calculate cosine similarity with all problems
        sim_scores = cosine_similarity(input_vector, tag_matrix).flatten()
        # Get the indices of the most similar problems
        problem_indices = sim_scores.argsort()[-n_recommendations:][::-1]
    elif method == 'knn':
        # Use Nearest Neighbors model to find similar problems
        distances, indices = nn_model.kneighbors(input_vector)
        problem_indices = indices.flatten()[:n_recommendations]
    elif method == 'kmeans':
        # Use K-Means clustering to find problems in the same cluster
        cluster_label = kmeans.predict(input_vector)[0]
        problem_indices = data[data['cluster'] == cluster_label].index[:n_recommendations]
    else:
        raise ValueError("Invalid method. Choose from 'cosine', 'knn', 'kmeans'.")
    
    # Return the recommended problems
    recommended_problems = data.iloc[problem_indices][['title', 'difficulty', 'problem_URL']]
    return recommended_problems

# Example usage: Recommend problems for a given set of topic tags
q=int(input('Enter ID'))
topic_tags_input = data['topic_tags'][q]  # Example input
recommended_problems = recommend_problems_by_tags(topic_tags_input, method='cosine')

print("Recommended Problems:")
print(recommended_problems)


  data['topic_tags'] = data['topic_tags'].fillna('').str.replace('[', '').str.replace(']', '').str.replace('\'', '').str.replace(' ', '')
  super()._check_params_vs_input(X, default_n_init=10)


Enter ID 1


Recommended Problems:
                                title difficulty  \
1                  2. Add Two Numbers     Medium   
205          206. Reverse Linked List       Easy   
59           60. Permutation Sequence       Hard   
49                      50. Pow(x, n)     Medium   
202  203. Remove Linked List Elements       Easy   

                                           problem_URL  
1        https://leetcode.com/problems/add-two-numbers  
205  https://leetcode.com/problems/reverse-linked-list  
59   https://leetcode.com/problems/permutation-sequ...  
49                https://leetcode.com/problems/powx-n  
202  https://leetcode.com/problems/remove-linked-li...  
