In [1]:
import numpy as np
import pandas as pd
import nltk

#### Task 1

Застосувати приховане семантичне індексування бібліотеки scikit-learn для моделювання тем.
Вивести документи, що зробили найбільший вклад в теми.
Вивести документи, що описують кожну з тем.

In [2]:
data_df = pd.read_csv('news.csv')

In [3]:
import re

wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')


def preproc_doc(doc):
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I | re.A)
    doc = re.sub(r'http\S+', '', doc)
    doc = doc.lower()
    doc = doc.strip()
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc


preproc_corpus = np.vectorize(preproc_doc)
documents = preproc_corpus(data_df['text'])
documents

array(['thursdays biggest analyst calls apple amazon tesla palantir docusign exxon amp',
       'buy las vegas sands travel singapore builds wells fargo says',
       'piper sandler downgrades docusign sell citing elevated risks amid ceo transition',
       ..., 'russian sells bln roubles oneyear repo auction',
       'global esg bond issuance posts h dip supranationals cut back',
       'brazils petrobras says signed billion sustainability loan'],
      dtype='<U240')

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(documents)

In [5]:
num_topics = 10

lsa_model = TruncatedSVD(n_components=num_topics)
lsa_matrix = lsa_model.fit_transform(matrix)
top_documents = lsa_matrix.argmax(axis=0)

for topic_index, document_index in enumerate(top_documents):
    print(f"Topic {topic_index}: {documents[document_index]}")

Topic 0: webcast second quarter financial results conference call
Topic 1: dividend growth stocks july stocks finance markets
Topic 2: wires wires
Topic 3: autonation inc q results earnings call presentation investing stocks stockmarket
Topic 4: c citigroup gaap eps beats revenue b beats b
Topic 5: labcorp declares quarterly dividend
Topic 6: first trust global funds plc uk regulatory announcement net asset values
Topic 7: announces date second quarter earnings release conference call
Topic 8: us cpi mom jun actual vs previous est us cpi yoy jun actual vs previous est us core cpi mom jun actual vs previous est us core cpi yoy jun actual vs previous est
Topic 9: company sues elon musk attempting back billion deal


In [6]:
num_words = 5
topics = lsa_model.components_

for topic_index, topic in enumerate(topics):
    top_words_indices = topic.argsort()[:-num_words - 1:-1]
    top_words = [vectorizer.get_feature_names_out()[i] for i in top_words_indices]
    print(f"Topic {topic_index}: {' '.join(top_words)}")

Topic 0: quarter second results earnings call
Topic 1: stocks markets economy us business
Topic 2: wires year firm investors revenue
Topic 3: investing stockmarket business stocks trading
Topic 4: beats eps revenue gaap revs
Topic 5: dividend declares quarterly share distribution
Topic 6: uk announcement regulatory plc form
Topic 7: earnings call announces conference est
Topic 8: est vs jun prev actual
Topic 9: new musk elon company billion


#### Task 2

Використати текст chesterton-thursday.txt з корпусу gutenberg бібліотеки nltk та вивести ключові біграми.

In [7]:
from nltk.corpus import gutenberg
from string import punctuation

chesterton = gutenberg.sents('chesterton-thursday.txt')
remove_terms = punctuation + '0123456789'
thursday = [[word.lower() for word in sent if word not in remove_terms] for sent in chesterton]
thursday = [' '.join(tok_sent) for tok_sent in thursday]
thursday = filter(None, preproc_corpus(thursday))
thursday = [tok_sent for tok_sent in thursday if len(tok_sent.split()) > 2]
thursday

['man thursday g k chesterton',
 'edmund clerihew bentley',
 'cloud mind men wailing went weather yea sick cloud upon soul boys together',
 'science announced nonentity art admired decay world old ended gay round us antic order crippled vices came lust lost laughter fear lost shame',
 'like white lock whistler lit aimless gloom men showed white feather proudly plume',
 'life fly faded death drone stung world old indeed young',
 'twisted even decent sin shapes named men ashamed honour ashamed',
 'weak foolish thus failed thus black baal blocked heavens hymns us children forts sand even weak eve high went piled break bitter sea',
 'fools motley jangling absurd church bells silent cap beds heard',
 'unhelped held fort tiny flags unfurled giants laboured cloud lift world',
 'find book found feel hour flings far fish shaped paumanok cry cleaner things green carnation withered forest fires pass roared wind world ten million leaves grass sane sweet sudden bird sings rain truth tusitala spoke 

In [8]:
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

bigrams_num = 10

bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_documents([item.split() for item in thursday])
finder.nbest(bigram_measures.pmi, bigrams_num)

[('abandoned', 'boardroom'),
 ('absinth', 'cocoa'),
 ('acceptance', 'decisive'),
 ('accepting', 'vein'),
 ('accidental', 'dilemmas'),
 ('acclaim', 'virtues'),
 ('accuses', 'hypocrisy'),
 ('acquainted', 'secretarial'),
 ('additional', 'twist'),
 ('admiral', 'biffin')]