In [28]:
import pandas as pd
from gensim import corpora, models
from googlesearch import search
import re
import nltk
import chardet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import requests

In [4]:
# Lets check what encoding we have for our NSFdata files
NSF_csv_files = ['NSFdata/NSF_CCF.csv', 'NSFdata/NSF_CICI.csv', 'NSFdata/NSF_CSSI.csv', 'NSFdata/NSF_DIBBS.csv', 'NSFdata/NSF_MRI.csv', 'NSFdata/NSF_OAC.csv', 'NSFdata/NSF_SI2.csv']
for file in NSF_csv_files:
    with open(file, 'rb') as f:
        result = chardet.detect(f.read())
        print(file, result)

NSFdata/NSF_CCF.csv {'encoding': 'ISO-8859-1', 'confidence': 0.7299916171744574, 'language': ''}
NSFdata/NSF_CICI.csv {'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
NSFdata/NSF_CSSI.csv {'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
NSFdata/NSF_DIBBS.csv {'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
NSFdata/NSF_MRI.csv {'encoding': 'ISO-8859-1', 'confidence': 0.7299962504897843, 'language': ''}
NSFdata/NSF_OAC.csv {'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
NSFdata/NSF_SI2.csv {'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


In [15]:
nltk.download('stopwords')
nltk.download('wordnet')
# 1. Read the CSV file and load it into a DataFrame
data = pd.read_csv('NSFdata/NSF_DIBBS.csv', encoding='ISO-8859-1')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eduar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eduar\AppData\Roaming\nltk_data...


In [9]:
projects = data[["AwardNumber", "Title", "NSFOrganization", "PrincipalInvestigator", "PIEmailAddress", "Abstract"]]
print(projects["Abstract"])

0      This project would automate the creation and d...
1      The growing number of cyber attacks on the Int...
2      Uranium-series geochronology plays a critical ...
3      Cybersecurity has become a significant issue t...
4      CIF21 DIBBs: Conceptualization of the Social a...
                             ...                        
125    ABSTRACT<br/><br/>OPP-9813312   OPP-9813442   ...
126    ABSTRACT<br/><br/>OPP-9813312   OPP-9813442   ...
127    ABSTRACT<br/><br/>OPP-9907197    OPP-9907469  ...
128    Current general circulation models (GCMs) have...
129    ABSTRACT<br/><br/>OPP-9907197    OPP-9907469  ...
Name: Abstract, Length: 130, dtype: object


In [16]:
def preprocess_abstract(abstract):
    abstract = re.sub('<[^<]+?>', '', abstract)  # Remove HTML tags
    abstract = abstract.lower()  # Convert to lowercase
    abstract = re.sub(r'\W+', ' ', abstract)  # Remove special characters and numbers
    words = abstract.split()  # Tokenize
    words = [word for word in words if word not in stopwords.words('english')]  # Remove stopwords
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatize words
    return words

In [17]:
# 3. Process the Abstract column
abstracts = projects["Abstract"].apply(preprocess_abstract)
print(abstracts)

0      [project, would, automate, creation, data, ana...
1      [growing, number, cyber, attack, internet, cri...
2      [uranium, series, geochronology, play, critica...
3      [cybersecurity, become, significant, issue, pr...
4      [cif21, dibbs, conceptualization, social, inno...
                             ...                        
125    [abstractopp, 9813312, opp, 9813442, opp, 9813...
126    [abstractopp, 9813312, opp, 9813442, opp, 9813...
127    [abstractopp, 9907197, opp, 9907469, opp, 9907...
128    [current, general, circulation, model, gcms, d...
129    [abstractopp, 9907197, opp, 9907469, opp, 9907...
Name: Abstract, Length: 130, dtype: object


In [20]:
# 4. Create a dictionary representation of the documents.
dictionary = corpora.Dictionary(abstracts)
print(dictionary)
# 5. Corpus is a list of bags of words. Each bag-of-words is a list of tuples (term_id, term_frequency).
corpus = [dictionary.doc2bow(text) for text in abstracts]

# 6. Define the LDA model
lda_model = models.LdaModel(corpus, num_topics=8, id2word=dictionary, passes=15)

Dictionary<4137 unique tokens: ['000', '1440753', '24', '50', '500']...>


In [21]:
# Create a new colunm LDA_ABSTRACTS to be classified into keywords based on the LDA model
projects["LDA_abstract_keywords"] = projects["Abstract"].apply(lambda x: lda_model[dictionary.doc2bow(preprocess_abstract(x))])

In [23]:
print(projects["LDA_abstract_keywords"])

0       [(3, 0.45789737), (5, 0.5392573)]
1        [(4, 0.5909081), (5, 0.4042163)]
2                       [(0, 0.99528813)]
3      [(1, 0.9782481), (5, 0.017256107)]
4       [(1, 0.8115505), (5, 0.18670286)]
                      ...                
125                      [(6, 0.9942036)]
126                      [(6, 0.9942036)]
127                      [(7, 0.9928834)]
128     [(2, 0.9078556), (7, 0.08861986)]
129                     [(7, 0.99288344)]
Name: LDA_abstract_keywords, Length: 130, dtype: object
0      This project would automate the creation and d...
1      The growing number of cyber attacks on the Int...
2      Uranium-series geochronology plays a critical ...
3      Cybersecurity has become a significant issue t...
4      CIF21 DIBBs: Conceptualization of the Social a...
                             ...                        
125    ABSTRACT<br/><br/>OPP-9813312   OPP-9813442   ...
126    ABSTRACT<br/><br/>OPP-9813312   OPP-9813442   ...
127    ABSTRACT<br/><br/>O

In [24]:
BING_API_KEY = "your_bing_search_api_key_here"
# 7. Defining Search for news articles and other online sources
def search_news(title):
    headers = {"Ocp-Apim-Subscription-Key": BING_API_KEY}
    params = {
        "q": f'"{title}"',
        "count": 3,
        "offset": 0,
        "mkt": "en-US",
        "safesearch": "Moderate",
    }
    response = requests.get("https://api.cognitive.microsoft.com/bing/v7.0/search", headers=headers, params=params)
    response.raise_for_status()
    search_results = response.json()
    news_links = [result["url"] for result in search_results["webPages"]["value"] if "news" in result["url"] or "article" in result["url"]]
    return ', '.join(news_links)

In [27]:
projects["News"] = projects["Title"].apply(search_news)
print(projects["News"])

HTTPError: 429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3D%2522CIF21%252BDIBBs%253A%252BPD%253A%252BEnhancing%252Band%252BPersonalizing%252BEducational%252BResources%252Bthrough%252BTools%252Bfor%252BExperimentation%2522%26num%3D12%26hl%3Den%26start%3D0&hl=en&q=EgSBc8MrGNPPm6IGIjBbxxQ_lYzXivQv17H2nj_drNflwrGW5vyu_FAJKfnUP6DenYaYvl4cvJeIKQkqYUcyAXI

In [None]:
# 8. Create a new DataFrame with the desired columns
output = projects.rename(columns={"Title": "Project_title",
                                   "NSFOrganization": "Funding_agency",
                                   "AwardNumber": "Award_number",
                                   "PrincipalInvestigator": "PI_Name",
                                   "PIEmailAddress": "PI_contact",
                                   "Abstract": "Abstract"})