# Topic Modelling Notebook 

In [1]:
# Utilities
# Session setup: parallel helpers, warning suppression, progress bars and
# environment flags, followed by the NLP / deep-learning / sentiment imports.
from joblib import Parallel, delayed
from pandarallel import pandarallel
# Start pandarallel's worker pool with its own progress bar disabled.
pandarallel.initialize(progress_bar=False)
from multiprocessing import Pool, cpu_count
import warnings 
# Silences every warning for the whole session, including deprecation and
# convergence warnings from the libraries imported below.
warnings.filterwarnings("ignore")
import os
import pandas as pd
import numpy as np
# NOTE(review): the next line rebinds the name `tqdm` from the module to the
# class, so this module import is shadowed immediately and has no effect.
import tqdm
from tqdm import tqdm
# Hooks tqdm into pandas with the label "progress bar"
# (presumably enabling `progress_apply` — confirm against the tqdm docs).
tqdm.pandas(desc="progress bar")
import gc
# Set before `transformers` is imported further down.
# NOTE(review): this notebook also sets up process-based parallelism
# (pandarallel, multiprocessing); confirm 'true' is the intended value, as
# Hugging Face tokenizers may complain about parallelism after a fork.
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# Import time packages 
import time
import datetime

# Visualization Tools
import matplotlib.pyplot as plt

# Packages needed for text pre-processing:
import nltk 
import spacy 
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.tokens import Doc
from spacy.language import Language
import re
import contractions
import emoji
#from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Deep Learning Models 
import torch
import tensorflow as tf

# Sentiment Packages
from textblob import TextBlob
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# NOTE(review): `torch` is already imported under "Deep Learning Models";
# this second import is redundant.
import torch
from scipy.special import softmax


INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


2023-05-06 16:43:05.661544: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from utils import search_folder

# Derive the thesis root from the working directory by stripping the
# repository folder name out of the path string.
# NOTE(review): str.replace removes every occurrence of the substring and
# leaves anything after it in place, so this presumably assumes the notebook
# is run from the root of "master_thesis_coding" — confirm.
current_dir = os.getcwd()
thesis_folder_path = current_dir.replace("master_thesis_coding", "")

# Look up each data folder under the thesis root, in a fixed order:
# tweets, Eikon news, stock prices.
twitter_data_path, eikon_data_path, stock_data_path = (
    search_folder(thesis_folder_path, folder_name)
    for folder_name in ("twitter_data", "eikon_news", "stock_prices")
)

# Last expression of the cell: the notebook displays gc.collect()'s return value.
gc.collect()

83

In [3]:
import bertopic
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

In [4]:
# Bare reference to the loader: the notebook only displays the function's
# repr (its signature and defaults). Nothing is called here.
# NOTE(review): looks like leftover exploration; the cell can be dropped.
fetch_20newsgroups

<function sklearn.datasets._twenty_newsgroups.fetch_20newsgroups(*, data_home=None, subset='train', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True, return_X_y=False)>

In [None]:
# Load the 20 Newsgroups posts as a list of raw strings. subset='all' requests
# the complete dataset, and `remove` strips headers, footers and quoted
# replies from each post before it is returned.
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']

# Fit BERTopic on the corpus. fit_transform expects a list of documents, so
# the model must be given `docs`, not a single sentence string. The topic
# tables shown further down are for this corpus.
# `topics` and `probs` are the two values fit_transform returns for the corpus.
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)

In [6]:
# Overview table of the fitted topics, with the columns Topic, Count and Name.
# Topic -1 is BERTopic's outlier bucket per the BERTopic docs.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,6561,-1_to_the_of_and
1,0,1828,0_game_team_games_he
2,1,511,1_key_clipper_chip_encryption
3,2,480,2_israel_israeli_jews_arab
4,3,462,3_ites_cheek_yep_huh
...,...,...,...
214,213,11,213_memory_shared_server_pixmaps
215,214,10,214_color_icon_colors_colormap
216,215,10,215_magi_zoroastrians_zoroastrian_osiris
217,216,10,216_license_export_rsa_pkp


In [7]:
# Top terms for topic 0 as a list of (word, weight) tuples
# (the weights are c-TF-IDF scores per the BERTopic docs).
topic_model.get_topic(0)

[('game', 0.010394468191049566),
 ('team', 0.009054687667607184),
 ('games', 0.007192687520488394),
 ('he', 0.0070492987960421045),
 ('players', 0.00635127993653445),
 ('season', 0.006245043651126465),
 ('hockey', 0.006145615807516175),
 ('play', 0.005801515111207944),
 ('25', 0.005665825277801316),
 ('year', 0.005628499130478387)]

In [8]:
# Per-document view of the topic assignment. The displayed columns are
# Document, Topic, Name, Top_n_words, Probability and Representative_document.
# NOTE(review): `docs` presumably has to be the same list the model was
# fitted on — confirm against the BERTopic docs.
topic_model.get_document_info(docs)

Unnamed: 0,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,\n\nI am sure some bashers of Pens fans are pr...,0,0_game_team_games_he,game - team - games - he - players - season - ...,1.000000,False
1,My brother is in the market for a high-perform...,4,4_card_monitor_video_drivers,card - monitor - video - drivers - vga - monit...,1.000000,False
2,\n\n\n\n\tFinally you said what you dream abou...,19,19_armenian_turkish_armenians_genocide,armenian - turkish - armenians - genocide - tu...,0.381696,False
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,5,5_drive_scsi_drives_ide,drive - scsi - drives - ide - disk - controlle...,1.000000,False
4,1) I have an old Jasmine drive which I cann...,99,99_tape_backup_tapes_drive,tape - backup - tapes - drive - wangdat - munr...,0.682120,False
...,...,...,...,...,...,...
18841,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,18,18_doctor_cancer_medical_patients,doctor - cancer - medical - patients - medicin...,1.000000,False
18842,\nNot in isolated ground recepticles (usually ...,175,175_ground_grounding_conductor_neutral,ground - grounding - conductor - neutral - wir...,0.780403,False
18843,I just installed a DX2-66 CPU in a clone mothe...,83,83_fan_cpu_heat_sink,fan - cpu - heat - sink - fans - cooling - chi...,0.979964,False
18844,\nWouldn't this require a hyper-sphere. In 3-...,22,22_den_polygon_points_algorithm,den - polygon - points - algorithm - xxxx - sp...,1.000000,False
