In [1]:
import pandas as pd
import re
import string
import nltk

from Keywords_Extractor_Factory import keyword_extractor_factory, stop_words, lemmatizer


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cagatay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/cagatay/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/cagatay/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Load the election-news CSV and keep only the first occurrence of each
# (date, title, text) combination.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which suggests
# the CSV carries a saved index — consider index_col=0 after confirming.
data = pd.read_csv('src/datas/us_elections_news.csv').drop_duplicates(
    subset=['date', 'title', 'text']
)

In [3]:
def preprocess_news(text: str) -> str:
    """Normalize one news article into a space-separated string of lemmas.

    Steps, in order: lowercase, delete ASCII punctuation, delete digit runs,
    split on whitespace, drop tokens found in ``stop_words``, lemmatize the
    remaining tokens with ``lemmatizer.lemmatize`` and re-join them with
    single spaces.

    Args:
        text: Raw article body. Non-string values (e.g. ``None`` or the float
            NaN that pandas uses for missing cells) are treated as empty text.

    Returns:
        The cleaned text; ``''`` when no token survives the filters.
    """
    # Missing CSV cells reach this function as NaN (a float), which has no
    # .lower(); return empty text instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # string.punctuation is ASCII-only, so typographic quotes/dashes survive.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)

    # NOTE(review): punctuation is removed before the stop-word check, so any
    # stop_words entries containing an apostrophe can no longer match —
    # confirm against the contents of stop_words.
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]

    # Join the cleaned tokens back into a single string.
    return ' '.join(words)

In [4]:
# Show the de-duplicated frame as the cell's output.
data 

Unnamed: 0,date,key_word,title,text,description,publisher,url
0,"Tue, 19 Nov 2024 08:00:00 GMT",US elections,U.S. Elections Analysis 2024: Key Outcomes & I...,"Last updated Nov. 19, 2024.\n\nExecutive Summa...",U.S. Elections Analysis 2024: Key Outcomes & I...,National Association of Counties,https://www.naco.org/resource/us-elections-ana...
1,"Sat, 09 Nov 2024 08:00:00 GMT",US elections,I spent hours trying to persuade US voters to ...,It has been an extraordinary week for US polit...,I spent hours trying to persuade US voters to ...,The Guardian,https://www.theguardian.com/commentisfree/2024...
2,"Wed, 06 Nov 2024 08:00:00 GMT",US elections,US elections showed resilience of democratic i...,"WASHINGTON D.C., 6 November 2024 – Engagement ...",US elections showed resilience of democratic i...,OSCE,https://www.osce.org/odihr/elections/usa/580111
3,"Wed, 30 Oct 2024 07:00:00 GMT",US elections,"The US elections: campaign, transition and ina...",Theme\n\nWhich are the various elements –beyon...,"The US elections: campaign, transition and ina...",Real Instituto Elcano,https://www.realinstitutoelcano.org/en/analyse...
4,"Tue, 05 Nov 2024 08:00:00 GMT",US elections,Donald Trump wins 2024 US election in historic...,Donald Trump sweeps to victory\n\nBrandon Live...,Donald Trump wins 2024 US election in historic...,BBC.com,https://www.bbc.com/news/live/czxrnw5qrprt
...,...,...,...,...,...,...,...
231,"Fri, 25 Oct 2024 07:00:00 GMT",Kemala Harris,Can Liz Cheney help Kamala Harris election cam...,Polls suggest some Republican voters are willi...,Can Liz Cheney help Kamala Harris election cam...,Al Jazeera English,https://www.aljazeera.com/news/2024/10/25/can-...
232,"Fri, 01 Nov 2024 07:00:00 GMT",Kemala Harris,Kamala Harris leads Donald Trump narrowly in Y...,The third and final release of our model estim...,Kamala Harris leads Donald Trump narrowly in Y...,YouGov US,https://today.yougov.com/politics/articles/508...
233,"Thu, 31 Oct 2024 07:00:00 GMT",Kemala Harris,Why I’m Voting for Kamala Harris - Bloomberg,I do not agree with Vice President Kamala Harr...,Why I’m Voting for Kamala Harris Bloomberg,Bloomberg,https://www.bloomberg.com/opinion/articles/202...
234,"Fri, 01 Nov 2024 07:00:00 GMT",Kemala Harris,Iowa Poll: Kamala Harris leapfrogs Donald Trum...,Democrat Kamala Harris leads Donald Trump in I...,Iowa Poll: Kamala Harris leapfrogs Donald Trum...,Des Moines Register,https://www.desmoinesregister.com/story/news/p...


In [5]:
# Word count per article: number of whitespace-delimited tokens in 'text'.
data['wc'] = data['text'].map(lambda body: len(body.split()))

In [6]:
# Average article length in words (vectorized instead of a Python-level sum).
data['wc'].mean()

1078.339207048458

In [7]:
# Plain Python list of the raw article bodies, in frame order.
news = data['text'].tolist()

In [8]:
# Cleaned counterpart of `news`: one preprocessed string per article,
# in the same order.
preprocessed_news = [preprocess_news(article) for article in news]

In [9]:
# TF-IDF extractor on the raw (unprocessed) articles; the third argument (10)
# is presumably the number of keywords to return — confirm against
# Keywords_Extractor_Factory.
tf_idf_results_raw = keyword_extractor_factory("TF-IDF", news, 10)
# Sorted alphabetically so runs are easy to compare by eye.
print(sorted(tf_idf_results_raw))

['biden', 'campaign', 'election', 'harris', 'people', 'president', 'said', 'states', 'trump', 'voters']


In [10]:
# TF-IDF extractor on the preprocessed articles; the third argument (10) is
# presumably the number of keywords to return — confirm against
# Keywords_Extractor_Factory.
tf_idf_results = keyword_extractor_factory("TF-IDF", preprocessed_news, 10)
# Sorted alphabetically so runs are easy to compare by eye.
print(sorted(tf_idf_results))

['campaign', 'election', 'harris', 'president', 'said', 'state', 'tax', 'trump', 'vote', 'voter']


In [11]:
# Word2Vec extractor on the raw (unprocessed) articles; the third argument
# (10) is presumably the number of keywords to return — confirm against
# Keywords_Extractor_Factory.
word2vec_results_raw = keyword_extractor_factory("Word2Vec", news, 10)
# Sorted alphabetically so runs are easy to compare by eye.
print(sorted(word2vec_results_raw))

['biden', 'campaign', 'election', 'harris', 'president', 'said', 'states', 'trump', 'us', 'would']


In [12]:
# Word2Vec extractor on the preprocessed articles; the third argument (10) is
# presumably the number of keywords to return — confirm against
# Keywords_Extractor_Factory.
word2vec_results = keyword_extractor_factory("Word2Vec", preprocessed_news, 10)
# Sorted alphabetically so runs are easy to compare by eye.
print(sorted(word2vec_results))

['campaign', 'election', 'harris', 'president', 'said', 'state', 'trump', 'u', 'vote', 'would']


In [13]:
# BERT extractor on the raw articles with a fourth argument of 200
# (presumably a maximum token/chunk length — TODO confirm against
# Keywords_Extractor_Factory).
# NOTE(review): the recorded output lists words unrelated to the election
# corpus, and the tokenizer warns about a 527-token sequence exceeding the
# model's 512 limit — verify the extractor's truncation and decoding.
bert_results_200 = keyword_extractor_factory("BERT", news, 10, 200)
print(sorted(bert_results_200))

Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors


['beautifully', 'consult', 'dummy', 'eldest', 'explore', 'friendships', 'harvey', 'meyer', 'railing', 'sample']


In [14]:
# BERT extractor on the raw articles with a fourth argument of 400
# (presumably a maximum token/chunk length — TODO confirm against
# Keywords_Extractor_Factory).
# NOTE(review): the recorded output lists words unrelated to the election
# corpus, and the tokenizer warns about a 527-token sequence exceeding the
# model's 512 limit — verify the extractor's truncation and decoding.
bert_results_400 = keyword_extractor_factory("BERT", news, 10, 400)
print(sorted(bert_results_400))

Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors


['alternatively', 'bleed', 'cardiac', 'concentrations', 'dread', 'fearless', 'nodded', 'ousted', 'regulating', 'witness']


In [15]:
# BERT extractor on the raw articles with a fourth argument of 500
# (presumably a maximum token/chunk length — TODO confirm against
# Keywords_Extractor_Factory).
# NOTE(review): the recorded output lists words unrelated to the election
# corpus, and the tokenizer warns about a 527-token sequence exceeding the
# model's 512 limit — verify the extractor's truncation and decoding.
bert_results_500 = keyword_extractor_factory("BERT", news, 10, 500)
print(sorted(bert_results_500))

Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors


['breeding', 'defiance', 'derrick', 'energies', 'erected', 'fearless', 'halfway', 'reacted', 'scowl', 'translators']


In [16]:
# Convert each keyword list to a set so the methods can be compared with
# set algebra.
set_tf_idf, set_word2vec = map(set, (tf_idf_results, word2vec_results))

(
    set_bert_results_200,
    set_bert_results_400,
    set_bert_results_500,
) = map(set, (bert_results_200, bert_results_400, bert_results_500))

In [17]:
# Keywords that all three BERT runs (200 / 400 / 500) agree on.
print("BERT Intersection :")
set_bert_intersection = set.intersection(
    set_bert_results_200,
    set_bert_results_400,
    set_bert_results_500,
)
print(set_bert_intersection)

BERT Intersection :
set()


In [18]:
# Every keyword produced by at least one of the three BERT runs.
# Stored under its own name so the real intersection result is not
# overwritten by the union.
print("BERT Union :")
set_bert_union = set_bert_results_200 | set_bert_results_400 | set_bert_results_500
print(set_bert_union)

BERT Union :
{'meyer', 'defiance', 'eldest', 'ousted', 'fearless', 'erected', 'harvey', 'friendships', 'reacted', 'halfway', 'dread', 'explore', 'dummy', 'cardiac', 'regulating', 'derrick', 'beautifully', 'alternatively', 'energies', 'witness', 'concentrations', 'scowl', 'nodded', 'breeding', 'bleed', 'translators', 'sample', 'railing', 'consult'}
