In [1]:
#topic modeling with lda
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [2]:
npo = pd.read_csv("NPOPlayer.csv", sep=";")

npo[npo["longSummary"].isna()] = ""

In [3]:
npo['thumbnail']

0         https://images.npo.nl/header/2560x1440/1369174...
1         https://images.npo.nl/header/2560x1440/752519.jpg
2                                                          
3                                                          
4         https://images.npo.nl/header/2560x1440/557420.jpg
                                ...                        
132008    https://images.npo.nl/header/2560x1440/890863.jpg
132009                                                     
132010    https://images.npo.nl/header/2560x1440/1210745...
132011                                                     
132012                                                  NaN
Name: thumbnail, Length: 132013, dtype: object

In [4]:
description = npo['longSummary']

In [5]:
print(description)

0         In de achtste aflevering van onze nieuwe dagel...
1         Van alle inschrijvingen zijn er slechts 28 ges...
2                                                          
3                                                          
4         Wat gebeurt er met de inwoners van een klein Z...
                                ...                        
132008    Van vernielingen en diefstal tot aanrandingen,...
132009                                                     
132010    Sjezen om je vliegtuig te halen en dan zulke t...
132011                                                     
132012    Spectaculaire beelden uit Zwolle waar dit week...
Name: longSummary, Length: 132013, dtype: object


In [6]:
#create function to clean the text
import spacy
import nltk
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
#nlp = spacy.load("nl_core_news_sm")
nltk.download('stopwords')
nltk.download('wordnet')
sw = set(stopwords.words('dutch'))

import re
import string
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def clean_text(x):
    # remove punctutation
    x = x.translate(str.maketrans('','',string.punctuation))
    #remove digits
    x = x.translate(str.maketrans('','',string.digits))
    # tokenize for stopwords
    x = word_tokenize(x.lower())
    x = [lemmatizer.lemmatize(token) for token in x if token not in sw]
    x = ' '.join([w for w in x])
    return x

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\evand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\evand\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
description_cleaned = description.progress_apply(clean_text)

100%|██████████| 132013/132013 [00:47<00:00, 2795.61it/s]


In [None]:
print(description_cleaned)

0         achtste aflevering onze nieuwe dagelijkse seri...
1         alle inschrijvingen slechts geselecteerd audit...
2                                                          
3                                                          
4         gebeurt inwoners klein zeeuws dorp plotseling ...
                                ...                        
132008    vernielingen diefstal aanrandingen niemand vei...
132009                                                     
132010    sjezen vliegtuig halen zulke tsunami zweten ze...
132011                                                     
132012    spectaculaire beelden zwolle waar weekend twee...
Name: longSummary, Length: 132013, dtype: object


In [None]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer


In [10]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=40)
tf_idf = vectorizer.fit_transform(description_cleaned)
columns=vectorizer.get_feature_names_out()

In [15]:
from sklearn.decomposition import LatentDirichletAllocation
n_topics = 10
lda_npo = LatentDirichletAllocation(n_components=n_topics, learning_method='online', n_jobs=24, verbose=1)
lda_npo.fit(tf_idf)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [16]:
from joblib import dump, load
dump(lda_npo, 'lda_npo.joblib') 

['lda_npo.joblib']

In [21]:
lda_npo.transform(tf_idf)

array([[0.01464322, 0.01464322, 0.01464322, ..., 0.01464322, 0.01464965,
        0.01464412],
       [0.23977512, 0.0266702 , 0.0266702 , ..., 0.0266702 , 0.0266702 ,
        0.0266702 ],
       [0.1       , 0.1       , 0.1       , ..., 0.1       , 0.1       ,
        0.1       ],
       ...,
       [0.02375913, 0.02375913, 0.02375913, ..., 0.02375913, 0.02375913,
        0.02375913],
       [0.1       , 0.1       , 0.1       , ..., 0.1       , 0.1       ,
        0.1       ],
       [0.0244851 , 0.0244851 , 0.0244851 , ..., 0.0244851 , 0.0244851 ,
        0.0244851 ]])

In [25]:
terms = vectorizer.get_feature_names_out()
def df_lda(model,terms,n_topics):
    df = pd.DataFrame(model.components_, columns=terms)
    df = df.T
    for topic in range(n_topics):
        df_topic = df[topic]
        print('topic',topic,'top words are:\n',df[topic].sort_values(ascending=False)[:15])

In [26]:
print(df_lda(lda_npo, terms,n_topics))

topic 0 top words are:
 animatieserie    284.727280
kid              270.818997
online           123.786365
populairste      120.234795
weerreportage    112.247597
stijgers         107.386394
ieniemienie       94.643754
dier              80.619046
paradeplaat       74.548310
samson            70.581383
timmy             68.373913
eend              64.511986
grappige          62.487640
daagt             61.496344
geschikte         55.147630
Name: 0, dtype: float64
topic 1 top words are:
 zappie               1063.709460
knuffel               539.962252
vrolijk               439.924360
evenementen           327.789263
kleuterprogramma      327.120300
sportwedstrijden      313.858756
enof                  283.668532
hoep                  212.133756
hoela                 171.187528
expert                135.376070
quiz                  120.666432
kortste               115.382054
jackie                102.002067
tentoonstellingen      88.165372
jasper                 85.264516
Name: 1, dtyp

In [None]:
# topics
"""
1: culture
2: 
3: 
4: children programming
5: 
6: 
7:
8:
9: 
10: 
"""
