In [3]:
!pip install pandas
!pip install bertopic
!pip install nltk
!pip install cohere
!pip install tiktoken
!pip install tweet-preprocessor




In [4]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\I6240624\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\I6240624\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\I6240624\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [12]:
import os
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from bertopic import BERTopic
from nltk.corpus import stopwords
import preprocessor as p

p.set_options(p.OPT.URL)

standard_stop_words = stopwords.words('english')

def clean_text(text):
    text = p.clean(text)

    text = re.sub(r'\d+', '', text)

    words = text.split()
    filtered_words = [word for word in words if word.lower() not in standard_stop_words]

    return ' '.join(filtered_words)

vectorizer_model = TfidfVectorizer(stop_words=standard_stop_words, ngram_range=(1, 3))

model = BERTopic(embedding_model="all-MiniLM-L6-v2", vectorizer_model=vectorizer_model, calculate_probabilities=False, nr_topics="auto")

path = "C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Generation/"
all_files = [os.path.join(path, file) for file in os.listdir(path) if file.endswith('.csv')]

for file in all_files:
    df = pd.read_csv(file ,sep=';', header=None)
    df = df.dropna(subset=[0])
    df[0] = df[0].apply(clean_text)

    documents = df[0].tolist()
    topics, _ = model.fit_transform(documents)

    df['Topic'] = topics
    topic_counts = df['Topic'].value_counts()
    print(f"\nTopics in file: {file}")
    print(topic_counts)

    topic_info = model.get_topic_info()
    for index, row in topic_info.iterrows():
        if row['Topic'] != -1:
            words = model.get_topic(row['Topic'])
            df["words"]=str([word for word, _ in words])
            print(f"Topic {row['Topic']}: {len(words)} words, representative words: {[word for word, _ in words]}")
    
            

    new_file_name = os.path.basename(file)[:-4] + '_with_topics.csv'
    new_file_path = os.path.join(path, new_file_name)
    df.to_csv(new_file_path, sep=";", index=False)
    print(f"Processed file saved: {new_file_path}")



Topics in file: C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Generation/Content_Aware_t=1_P=1_Instagram_Dutch.csv
Topic
-1     337
 0     109
 1      57
 2      56
 3      45
 4      38
 5      28
 6      25
 7      24
 8      22
 9      21
 10     21
 11     20
 12     15
 13     14
 14     13
 15     10
Name: count, dtype: int64
Topic 0: 10 words, representative words: ['het', 'ik', 'de', 'een', 'en', 'van', 'mijn', 'te', 'ze', 'aan']
Topic 1: 10 words, representative words: ['video', 'youtube', 'vlog', 'bellingatv com', 'familievloggers', 'de', 'link', 'mijn', 'op', 'bellingatv']
Topic 2: 10 words, representative words: ['recept', 'foodie', 'een', 'vegan', 'smoothie', 'heerlijke', 'de dag', 'met', 'met een', 'het recept']
Topic 3: 10 words, representative words: ['day', 'feeling', 'adventures', 'new', 'happy', 'love', 'amazing', 'together', 'days', 'winter']
Topic 4: 10 words, representative words: ['squad', 'met', 'squadgoals', 'feesten', 'quality t

In [13]:
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,337,-1_de_een_mijn_van,"[de, een, mijn, van, het, voor, en, met, ik, op]",[De dozen beginnen zich op te stapelen en het ...
1,0,109,0_het_ik_de_een,"[het, ik, de, een, en, van, mijn, te, ze, aan]",[OMG ik kan het haast niet geloven - hebben ee...
2,1,57,1_video_youtube_vlog_bellingatv com,"[video, youtube, vlog, bellingatv com, familie...",[Mijn broer en ik hebben iets grappigs uitgepr...
3,2,56,2_recept_foodie_een_vegan,"[recept, foodie, een, vegan, smoothie, heerlij...",[Zin iets hartigs? Probeer deze gehakttaart! P...
4,3,45,3_day_feeling_adventures_new,"[day, feeling, adventures, new, happy, love, a...",[Excited upcoming event @zaraofficial next Sat...
5,4,38,4_squad_met_squadgoals_feesten,"[squad, met, squadgoals, feesten, quality time...",[Quality time met oom Piet en onze neefjes het...
6,5,28,5_outfit_fashionista_fashion_giveaway,"[outfit, fashionista, fashion, giveaway, en, k...","[OMGGG, ik heb zo'n toffe giveaway voor jullie..."
7,6,25,6_workout_sportschool_de sportschool_de,"[workout, sportschool, de sportschool, de, spo...","[Wat denk jij, bij wie zal de geheimzinnige be..."
8,7,24,7_week_weekend_de_deze week,"[week, weekend, de, deze week, weekendvibes, t...",[Het weer zo'n productieve week! Ik heb eindel...
9,8,22,8_netflix_netflixandchill_serie_film,"[netflix, netflixandchill, serie, film, en net...",[Overal om heen zie ik mensen floreren en hier...
