In [3]:
!pip install pandas
!pip install bertopic
!pip install nltk
!pip install cohere
!pip install tiktoken
!pip install tweet-preprocessor




In [4]:
import nltk
# Fetch the NLTK resources this notebook asks for, in the same order as before.
# Each call returns a success flag; the flag of the final download is left as
# the cell's trailing expression so it is still displayed as the cell output.
download_results = [
    nltk.download(resource)
    for resource in ('stopwords', 'wordnet', 'omw-1.4')
]
download_results[-1]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\I6240624\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\I6240624\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\I6240624\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [22]:
import os
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from bertopic import BERTopic
from nltk.corpus import stopwords
import preprocessor as p

# Configure tweet-preprocessor so that p.clean() only deals with URLs.
p.set_options(p.OPT.URL)

# English stop words from NLTK. The list is kept under its original name because
# it is also handed to TfidfVectorizer elsewhere in this cell.
standard_stop_words = stopwords.words('english')
# Frozen-set copy used for membership tests: O(1) per word instead of scanning
# the whole list for every token of every document.
_stop_word_lookup = frozenset(standard_stop_words)

def clean_text(text):
    """Normalise one raw document before topic modelling.

    Steps, in order:
      1. run tweet-preprocessor's ``p.clean`` with the options configured above,
      2. delete every run of digits,
      3. split on whitespace and drop tokens whose lower-cased form is an
         English stop word (the surviving tokens keep their original casing).

    Parameters
    ----------
    text : str or any
        The raw document. Non-string values (e.g. numbers produced by a
        mixed-dtype CSV column) are converted with ``str()`` first so that the
        regex-based steps do not raise ``TypeError``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; may be an empty string.
    """
    if not isinstance(text, str):
        text = str(text)

    text = p.clean(text)

    # Strip digit runs entirely (no replacement character is inserted).
    text = re.sub(r'\d+', '', text)

    return ' '.join(
        word for word in text.split() if word.lower() not in _stop_word_lookup
    )

# Term weighting for the topic representations: TF-IDF over 1- to 3-word
# n-grams, ignoring the NLTK English stop words.
vectorizer_model = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words=standard_stop_words,
)

# Topic model shared by every file processed below.
# NOTE(review): no random seed is set anywhere in this cell, so topic
# assignments are presumably not reproducible between runs - confirm.
model = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    vectorizer_model=vectorizer_model,
    nr_topics="auto",
    calculate_probabilities=False,
)

# Input folder (one ';'-separated CSV per dataset) and output folder for the
# per-file topic summaries.
# NOTE(review): both are absolute, machine-specific paths - consider deriving
# them from a configurable base directory.
path = "C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Generation_English/"
now_folder_path = "C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Topic_English/"

# Create the output folder up front so the final to_csv() cannot fail on a
# missing directory after the (expensive) model fit has already run.
os.makedirs(now_folder_path, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sort for a stable run order.
all_files = sorted(
    os.path.join(path, file) for file in os.listdir(path) if file.endswith('.csv')
)

for file in all_files:
    # The files have no header row; column 1 holds the document text.
    df = pd.read_csv(file, sep=';', header=None)
    # Drop rows without text; copy so the column assignments below operate on
    # an independent frame.
    df = df.dropna(subset=[1]).copy()
    df[1] = df[1].apply(clean_text)

    documents = df[1].tolist()
    # Re-fit the shared model on this file's documents; one topic id per document.
    topics, _ = model.fit_transform(documents)

    df['Topic'] = topics
    topic_counts = df['Topic'].value_counts()
    print(f"\nTopics in file: {file}")
    print(topic_counts)

    topic_info = model.get_topic_info()

    # Collect the representative words of every real topic (-1 is the outlier
    # bucket and is skipped), reporting each topic as it is collected.
    topic_words = {}
    for topic_id in topic_info['Topic']:
        topic_id = int(topic_id)
        if topic_id == -1:
            continue
        words = [word for word, _ in model.get_topic(topic_id)]
        topic_words[topic_id] = words
        print(f"Topic {topic_id}: {len(words)} words, representative words: {words}")

    # Attach to each document the words of *its own* topic. Previously the whole
    # column was overwritten on every loop pass, so every row ended up with the
    # words of the last topic. Outlier rows (-1) get an empty list.
    df["words"] = df['Topic'].map(
        lambda topic_id: str(topic_words.get(int(topic_id), []))
    )

    # "<input name without extension>_real_topics.csv" inside the output folder.
    new_file_name = os.path.splitext(os.path.basename(file))[0] + '_real_topics.csv'
    output_file_path = os.path.join(now_folder_path, new_file_name)
    # Only the per-topic summary is written; the per-document frame `df` is
    # not persisted.
    topic_info.to_csv(output_file_path, sep=";", index=False)



Topics in file: C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Generation_English/Content_Aware_t=1_P=1_Instagram_English.csv
Topic
-1     296
 0     113
 1      99
 2      77
 3      56
 4      42
 5      40
 6      38
 7      38
 8      35
 9      28
 10     24
 11     18
 12     16
 13     16
 14     14
 15     13
 16     12
Name: count, dtype: int64
Topic 0: 10 words, representative words: ['smile', 'hey', 'hi', 'service', 'girls girls', 'girls', 'time', 'love', 'smiling', 'want go']
Topic 1: 10 words, representative words: ['workout', 'gym', 'sets', 'bodybuilding', 'reps', 'gains', 'fitness', 'training', 'beastmode', 'bodybuilding motivation']
Topic 2: 10 words, representative words: ['brow', 'lashes', 'anastasiabeverlyhills', 'eyeshadow', 'palette', 'anastasiabeverlyhills brow', 'wiz', 'eyeliner', 'brow wiz', 'liquid']
Topic 3: 10 words, representative words: ['happy', 'instagram', 'birthday', 'christmas', 'weekend', 'holiday', 'dad', 'jeremy', 'new

In [23]:
# Display the topic summary table left in `topic_info` by the processing loop,
# i.e. the one belonging to the last file that was processed.
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,44,-1_procreate_draw_tutorials_brushes,"[procreate, draw, tutorials, brushes, flo, dra...",[Draw This!Anyone draw stylized landscape foll...
1,0,112,0_friendly_stop_stop mighty high_gearmy capture,"[friendly, stop, stop mighty high, gearmy capt...",[enjoyed video watch here: Friends:🦸 GEARMy Ca...
2,1,87,1_starcraft_blizzard_entertainment_blizzard en...,"[starcraft, blizzard, entertainment, blizzard ...",[Zerg versus Zerg build order: game Zerg versu...
3,2,84,2_minecraft_commons_macleod incompetech_incomp...,"[minecraft, commons, macleod incompetech, inco...",[Racing RAINBOW FRIENDS CARS! @Vynster 🛒 NEW M...
4,3,82,3_zero dawn_dawn_zero_fortnite,"[zero dawn, dawn, zero, fortnite, horizon, hor...",[Like stream? Subscribe now: (Thumbnail TaeHoo...
5,4,62,4_roblox_sure_guys_sure leave,"[roblox, sure, guys, sure leave, defild, even,...",[GOT SHINY MIMIKYU POKEMON BRICK BRONZE!? *Hal...
6,5,52,5_izecold_link track_ncstrack_ncstrack title,"[izecold, link track, ncstrack, ncstrack title...",[Today Make Portal Gun Escape Prison Virtual R...
7,6,50,6_waterslide_park_water_water park,"[waterslide, park, water, water park, watersli...",[trapdoor waterslide named Turbo Rocket top at...
8,7,42,7_makeup_makeup tutorials_eye makeup_eye makeu...,"[makeup, makeup tutorials, eye makeup, eye mak...",[♡ SUBSCRIBE weekly video's → Cuties! ♡ look h...
9,8,40,8_vr_nathie_oculus_virtual reality,"[vr, nathie, oculus, virtual reality, virtual,...",[Let's play Battlefield Virtual Reality using ...
