In [1]:
import nltk
# Fetch the NLTK resources this notebook asks for: the 'punkt' tokenizer models
# and the WordNet corpus (WordNet backs the wordnet.synsets lookup and the
# WordNet lemmatizer used in the cleaning cell).
nltk.download('punkt')
# Last expression of the cell, so its return value is what the notebook displays.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
import pandas as pd
# Source table of projects; later cells read its 'description', 'title' and
# 'open_state' columns and rely on the default 0..n-1 row labels.
input_csv = "drive/MyDrive/Recommendation System/devpost_data_update4.csv"
df = pd.read_csv(input_csv)

In [3]:
import requests

# URL of the stop words file (parsed below as comma-separated, double-quoted words)
url = 'https://gist.githubusercontent.com/ZohebAbai/513218c3468130eacff6481f424e4e64/raw/b70776f341a148293ff277afa0d0302c8c38f7e2/gist_stopwords.txt'

# Seconds to wait for the server. Without a timeout, requests.get() can block
# the notebook indefinitely on a stalled connection.
request_timeout = 10

# Download the list. Every failure mode (network error, timeout, non-200 status)
# takes the same fallback: report it and continue with an empty stop-word list,
# so the rest of the notebook still runs, just without stop-word filtering.
try:
    response = requests.get(url, timeout=request_timeout)
except requests.RequestException as error:
    print(f"Failed to download the file. Error: {error}")
    stopwords = []
else:
    # Check if the request was successful
    if response.status_code == 200:
        # Split the content by commas to get the stopwords
        stopwords = response.text.split(',')
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")
        stopwords = []

# Strip the surrounding double quotes and whitespace from each entry.
stop_words = [i.replace('"', "").strip() for i in stopwords]

In [4]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet

def is_english_word(word):
  """Return True when WordNet has at least one synset for ``word``.

  The word is lowercased before the lookup, so the check is case-insensitive.
  """
  matching_synsets = wordnet.synsets(word.lower())
  return len(matching_synsets) > 0

lemma = nltk.wordnet.WordNetLemmatizer()

# Tokenizer that keeps runs of word characters only, so punctuation is dropped.
word_tokenize_nopunct = RegexpTokenizer(r'\w+').tokenize

# Set copy of the stop words: membership is tested for every token, and
# scanning a list each time is needlessly slow.
stop_word_set = set(stop_words)

# Memo of lowercased token -> lemma, or None when the token is filtered out.
# Both the filter and the lemma depend only on the token text, so each distinct
# word needs the WordNet lookup and lemmatization only once.
lemma_cache = {}

# One cleaned string per row of df['description'], in the same order.
clean_description = []

for d in df['description']:
  # A missing description would arrive from pandas as NaN (a float), which the
  # regexp tokenizer cannot process. Treat any non-string as empty text so the
  # row still gets an entry and stays aligned with df.
  text = d if isinstance(d, str) else ''
  filtered_sentence = []
  for w in word_tokenize_nopunct(text):
    token = w.lower()
    if token not in lemma_cache:
      keep = token not in stop_word_set and is_english_word(token)
      lemma_cache[token] = lemma.lemmatize(token) if keep else None
    cleaned = lemma_cache[token]
    if cleaned is not None:
      filtered_sentence.append(cleaned)
  clean_description.append(' '.join(filtered_sentence))

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the cleaned descriptions; doc_vec has one row per description.
vectorizer = TfidfVectorizer()
doc_vec = vectorizer.fit_transform(clean_description)

# Dense document-term table with one column per vocabulary term.
vocabulary_terms = vectorizer.get_feature_names_out()
dtm = pd.DataFrame(data=doc_vec.toarray(), columns=vocabulary_terms)

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all TF-IDF document vectors
# (entry [i, j] compares description i with description j).
cosine_sim = cosine_similarity(X=doc_vec)

# Wrap in a DataFrame; rows and columns get the default 0..n-1 labels.
cos_sim_data = pd.DataFrame(data=cosine_sim)

In [11]:
# Test the recommended results for a single project
query_index = 0      # row label of the project to inspect
n_recommended = 5    # how many neighbours to show

# Remove the project's own score by label before ranking. Slicing off position 0
# after sorting only works when the project is the strict top match of its own
# row; with tied scores (e.g. a duplicate description) it could otherwise
# recommend itself.
scores = cos_sim_data.loc[query_index].drop(labels=query_index)
index_recomm = scores.sort_values(ascending=False).index.tolist()[:n_recommended]
projects_recomm = df['title'].loc[index_recomm].values
result = {'Project': df['title'].loc[query_index],'Projects Recommended':projects_recomm,'Index':index_recomm}
print(result)

{'Project': 'Google’s Immersive Geospatial Challenge', 'Projects Recommended': array(['ARCore Geospatial API Challenge',
       'Google Maps Platform Hackathon',
       'GDSC Minerva DevFest on Campus | Hackathon',
       'Inspire2Dev Summer Hackathon 2022',
       'Google DSC PSBA - Ideathon 2021'], dtype=object), 'Index': [1489, 1420, 87, 1384, 3795]}


In [8]:
# Number of recommendations stored per project.
top_n = 5

# Initialize the 'similar_projects' column if it doesn't exist
if 'similar_projects' not in df.columns:
  df['similar_projects'] = None

# Loop-invariant: row labels of projects that are not marked "closed".
# Computed once here instead of looking up df['open_state'] for every candidate
# inside the loop.
open_labels = set(df.index[df['open_state'] != "closed"].tolist())

for index, item in cos_sim_data.iterrows():
  # Candidate labels for this project, most similar first.
  ranked = item.sort_values(ascending=False).index.tolist()
  similar_items = []

  for idx in ranked:
    if len(similar_items) == top_n:
      break
    # Skip the project itself and anything that is closed.
    if index != idx and idx in open_labels:
      similar_items.append(idx)

  # Stored as the string form of a list of row labels, e.g. "[12, 7, 3]".
  df.at[index, 'similar_projects'] = str(similar_items)

In [9]:
# Save the table, including 'similar_projects', to a new CSV (row index not written)
output_csv = 'drive/MyDrive/Recommendation System/devpost_data_update5.csv'
df.to_csv(output_csv, index=False)