In [3]:
!pip install pandas
!pip install bertopic
!pip install nltk
!pip install cohere
!pip install tiktoken
!pip install tweet-preprocessor




In [4]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\I6240624\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\I6240624\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\I6240624\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [7]:
import os
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from bertopic import BERTopic
from nltk.corpus import stopwords
import preprocessor as p

p.set_options(p.OPT.URL)

standard_stop_words = stopwords.words('english')

def clean_text(text):
    text = p.clean(text)

    text = re.sub(r'\d+', '', text)

    words = text.split()
    filtered_words = [word for word in words if word.lower() not in standard_stop_words]

    return ' '.join(filtered_words)

vectorizer_model = TfidfVectorizer(stop_words=standard_stop_words, ngram_range=(1, 3))

model = BERTopic(embedding_model="all-MiniLM-L6-v2", vectorizer_model=vectorizer_model, calculate_probabilities=False, nr_topics="auto")

path = "C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Generation_Dutch/"
all_files = [os.path.join(path, file) for file in os.listdir(path) if file.endswith('.csv')]

for file in all_files:
    df = pd.read_csv(file ,sep=';', header=None)
    df = df.dropna(subset=[1])
    df[1] = df[1].apply(clean_text)

    documents = df[1].tolist()
    topics, _ = model.fit_transform(documents)

    df['Topic'] = topics
    topic_counts = df['Topic'].value_counts()
    print(f"\nTopics in file: {file}")
    print(topic_counts)

    topic_info = model.get_topic_info()
    for index, row in topic_info.iterrows():
        if row['Topic'] != -1:
            words = model.get_topic(row['Topic'])
            df["words"]=str([word for word, _ in words])
            print(f"Topic {row['Topic']}: {len(words)} words, representative words: {[word for word, _ in words]}")
    
            

    new_file_name = os.path.basename(file)[:-4] + '_real_topics.csv'
    now_folder_path = "C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Topic_Dutch/"
    output_file_path = os.path.join(now_folder_path, new_file_name)
    #df.to_csv(new_file_path, sep=";", index=False)
    #print(f"Processed file saved: {new_file_path}")
    topic_info.to_csv(output_file_path, sep=";", index=False)



Topics in file: C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Generation_Dutch/Content_Aware_t=1_P=1_Instagram_Dutch.csv
Topic
-1     296
 0     169
 1      71
 2      64
 3      38
 4      34
 5      27
 6      27
 7      26
 8      24
 9      21
 10     17
 11     16
 12     13
 13     12
Name: count, dtype: int64
Topic 0: 10 words, representative words: ['ik', 'foto', 'een', 'het', 'de', 'dat', 'en', 'je', 'voor', 'van']
Topic 1: 10 words, representative words: ['video', 'nieuwe video', 'spotify', 'nieuwe', 'ik', 'bio', 'de', 'mijn', 'op spotify', 'youtube']
Topic 2: 10 words, representative words: ['stuktv', 'teamstuk', 'stuktv teamstuk', 'stuktv stuktv', 'stuktv stuktv teamstuk', 'opdracht', 'de', 'het', 'morgen stuktv', 'morgen stuktv stuktv']
Topic 3: 10 words, representative words: ['mijn', 'de', 'zijn', 'oma', 'zonnatura', 'zijn mijn', 'opa', 'zou', 'post', 'mij']
Topic 4: 10 words, representative words: ['ik', 'en', 'een', 'de', 'dat', 'het', '

In [8]:
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,156,-1_je_en_ik_de,"[je, en, ik, de, video, supergaande, een, mijn...",[► Abonneer: Vorige Aflevering: Twitter: Insta...
1,0,97,0_ops_black ops_black_yarasky,"[ops, black ops, black, yarasky, warfare, ops ...",[Vandaag spelen wij opnieuw Fortnite: Battle R...
2,1,50,1_logitech_samenwerking deze vermelding_samenw...,"[logitech, samenwerking deze vermelding, samen...",[⭐ VOLG MIJ OP INSTAGRAM ► BEZOEK mijn WEBSHOP...
3,2,42,2_videos_beach_minecraft_mick,"[videos, beach, minecraft, mick, videos reacti...",[IDC/Games: : voor het kijken van deze video!A...
4,3,41,3_fifa_te veranderen_random shit_bankzitters,"[fifa, te veranderen, random shit, bankzitters...",[Koop hier je coins : : MickYo mensen van Play...
5,4,35,4_video van_video van mij_je op de_ik,"[video van, video van mij, je op de, ik, video...",[• Vorige aflevering: kleding: Volgende afleve...
6,5,35,5_je_mijn kanaal_leuk_mijn,"[je, mijn kanaal, leuk, mijn, op mijn kanaal, ...",[Yay weer een nieuwe video op mijn kanaal! Als...
7,6,32,6_royalistiq_royalistiq army_army_pc,"[royalistiq, royalistiq army, army, pc, gb, co...",[Meer 'Politie en boef' check de afspeellijst:...
8,7,25,7_hier mijn_naar deze_plaats alblasserdampostc...,"[hier mijn, naar deze, plaats alblasserdampost...",[♦ Bedankt voor het kijken naar deze video! Be...
9,8,25,8_één miljoen abonnees_maak filmpjes op_één mi...,"[één miljoen abonnees, maak filmpjes op, één m...",[KOOP HIER JE LINKTIJGER MERCH! ► ► ► ► ► ► pr...
