In [327]:
import json
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [328]:
data = []
for i in range(0, 9):
    url = "https://researchops.web.illinois.edu/?page=" + str(i)
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')  # Parse HTML content of the page

        table_rows = soup.find_all('tr')  # Find all table rows if the data is in a table

        for row in table_rows:
            columns = row.find_all('td')
            row_data = [column.get_text(separator = " ").strip() for column in columns]
            if row_data:  # Avoid appending empty rows
                data.append(row_data)
research_opps_w_duplicates = pd.DataFrame(data, columns=["Description", "Research Area", "Opportunity Timing", "Deadline Date"])
research_opps_w_duplicates

Unnamed: 0,Description,Research Area,Opportunity Timing,Deadline Date
0,Naval Research Enterprise Intern Program (NREI...,"Medicine & Health, Natural Sciences, Science &...",Summer,11/1/24
1,Junior Fellows Summer Intern Program Through t...,Humanities & Arts,Summer,Anticipated 11/26/24
2,Research Internships in Science and Engineerin...,"Medicine & Health, Natural Sciences, Science &...",Summer,Anticipated 12/6/24
3,Watershed Management Research and Outreach Und...,"Agriculture & Food Sciences, Natural Sciences,...",Summer,Anticipated 12/8/24
4,Brain Research Institute Summer Undergraduate ...,Medicine & Health,Summer,Anticipated 12/14/24
...,...,...,...,...
204,MEMOs Internship Scheme The MEMOs Internship S...,Humanities & Arts,"Fall, Spring, Summer, Winter",Rolling
205,The Mentoring Undergraduates in Science and En...,Science & Technology,"Fall, Spring",Rolling
206,Northwestern University Minority Health and He...,Medicine & Health,Summer,Rolling
207,Undergraduate Complexity Research (UCR) REU U...,"Natural Sciences, Science & Technology, Social...",Summer,TBA


In [329]:
#data cleaning
research_opps = research_opps_w_duplicates.drop_duplicates(ignore_index=True)
research_opps.insert(0, "Id", research_opps.index, False)
research_opps

Unnamed: 0,Id,Description,Research Area,Opportunity Timing,Deadline Date
0,0,Naval Research Enterprise Intern Program (NREI...,"Medicine & Health, Natural Sciences, Science &...",Summer,11/1/24
1,1,Junior Fellows Summer Intern Program Through t...,Humanities & Arts,Summer,Anticipated 11/26/24
2,2,Research Internships in Science and Engineerin...,"Medicine & Health, Natural Sciences, Science &...",Summer,Anticipated 12/6/24
3,3,Watershed Management Research and Outreach Und...,"Agriculture & Food Sciences, Natural Sciences,...",Summer,Anticipated 12/8/24
4,4,Brain Research Institute Summer Undergraduate ...,Medicine & Health,Summer,Anticipated 12/14/24
...,...,...,...,...,...
192,192,MEMOs Internship Scheme The MEMOs Internship S...,Humanities & Arts,"Fall, Spring, Summer, Winter",Rolling
193,193,The Mentoring Undergraduates in Science and En...,Science & Technology,"Fall, Spring",Rolling
194,194,Northwestern University Minority Health and He...,Medicine & Health,Summer,Rolling
195,195,Undergraduate Complexity Research (UCR) REU U...,"Natural Sciences, Science & Technology, Social...",Summer,TBA


In [330]:
import pickle
import pandas as pd
import numpy as np
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec, Phrases
from gensim.parsing.preprocessing import STOPWORDS as stop_words
from gensim.utils import simple_preprocess
from sklearn.feature_extraction import text
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer

In [331]:
# make words to exclude from processing
def make_stop_words():
    global stop_words
    words = ['program', 'undergraduate', 'students', 'intern', 'experience', 'internship', \
        'university', 'college', 'research', 'opportunity', \
            'fellows', 'scholars', 'undergrad']
    stopwords = stop_words.union(set(words))
    
    my_stop_words = text.ENGLISH_STOP_WORDS.union(stopwords)
    return my_stop_words

In [332]:
def preprocessor(text, my_stop_words):
    """uses gensim simple_preprocess and then removes stop words
    -> used in the tag_docs function
    """
    # Instantiate a LancasterStemmer object, use gensim simple_preprocess to tokenize/lowercase
    # and then removes stop words
    simple = simple_preprocess(text)
    result = [word for word in simple if not word in my_stop_words]
    return result

def stem_tag_docs(docs, my_stop_words):
    ls = LancasterStemmer()
    results = docs.apply(lambda r: TaggedDocument(words=preprocessor(r['Description'], my_stop_words), tags=[str(r['Id'])]), axis=1)
    return results.tolist()

In [333]:
my_stop_words = make_stop_words()

In [334]:
tagged_research_opps = stem_tag_docs(research_opps, my_stop_words)
tagged_research_opps

[TaggedDocument(words=['naval', 'enterprise', 'nreip', 'nreip', 'offers', 'week', 'summer', 'internships', 'stem', 'providing', 'notch', 'mentoring', 'hands', 'navy', 'lab'], tags=['0']),
 TaggedDocument(words=['junior', 'summer', 'summer', 'exposed', 'broad', 'spectrum', 'library', 'work', 'copyright', 'preservation', 'reference', 'access', 'information', 'technology', 'working', 'direction', 'library', 'curators', 'specialists', 'divisions', 'explore', 'digital', 'initiatives', 'increase', 'access', 'institution', 'unparalleled', 'collections', 'resources'], tags=['1']),
 TaggedDocument(words=['internships', 'science', 'engineering', 'matched', 'host', 'institute', 'according', 'area', 'biology', 'chemistry', 'physics', 'earth', 'sciences', 'engineering', 'closely', 'related', 'field', 'daad', 'provides', 'monthly', 'stipend', 'months', 'help', 'cover', 'living', 'expenses', 'host', 'universities', 'universities', 'applied', 'sciences', 'uas', 'institutes', 'provide', 'housing', 'ass

In [335]:
model = Doc2Vec(vector_size=50, min_count=1, epochs=40)
model.build_vocab(tagged_research_opps)
model.train(tagged_research_opps, total_examples=model.corpus_count, epochs=model.epochs)

In [336]:
vector = model.infer_vector(research_opps.iloc[0, 1].split())

In [337]:
vec = model['chemistry']
d2v_test = model.docvecs.most_similar([vec], topn=5000)
d2v_test[:10]

  d2v_test = model.docvecs.most_similar([vec], topn=5000)


[('173', 0.9987130165100098),
 ('186', 0.9985602498054504),
 ('112', 0.9982492327690125),
 ('5', 0.9982089996337891),
 ('100', 0.9981591701507568),
 ('18', 0.9981147646903992),
 ('25', 0.9980730414390564),
 ('63', 0.9980043768882751),
 ('152', 0.99795001745224),
 ('47', 0.997944712638855)]

In [338]:
research_opps.iloc[173,1]


'Research in Chemistry at WVU The Research in Chemistry at West Virginia University REU is accepting applications from undergraduates who 1) are majoring in chemistry or biochemistry (pre-majors also considered), 2) are citizens, nationals, or permanent residents of the U.S. or its territories, and 3) are interested in a graduate student aligned research and training experience. Selected participants research for 10-weeks on projects with the potential to benefit society, directly or indirectly, and that address fundamental questions in the fields of health care, forensics/criminology, energy, sustainability, and transportation.'