In [2]:
#topic modeling with lda
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [1]:
min(1,0)

0

In [3]:
npo = pd.read_csv("NPOPlayer.csv", sep=";")

npo[npo["longSummary"].isna()] = ""

In [4]:
npo['thumbnail']

0         https://images.npo.nl/header/2560x1440/1369174...
1         https://images.npo.nl/header/2560x1440/752519.jpg
2                                                          
3                                                          
4         https://images.npo.nl/header/2560x1440/557420.jpg
                                ...                        
132008    https://images.npo.nl/header/2560x1440/890863.jpg
132009                                                     
132010    https://images.npo.nl/header/2560x1440/1210745...
132011                                                     
132012                                                  NaN
Name: thumbnail, Length: 132013, dtype: object

In [5]:
description = npo['longSummary']

In [6]:
print(description)

0         In de achtste aflevering van onze nieuwe dagel...
1         Van alle inschrijvingen zijn er slechts 28 ges...
2                                                          
3                                                          
4         Wat gebeurt er met de inwoners van een klein Z...
                                ...                        
132008    Van vernielingen en diefstal tot aanrandingen,...
132009                                                     
132010    Sjezen om je vliegtuig te halen en dan zulke t...
132011                                                     
132012    Spectaculaire beelden uit Zwolle waar dit week...
Name: longSummary, Length: 132013, dtype: object


In [7]:
#create function to clean the text
import spacy
import nltk
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
#nlp = spacy.load("nl_core_news_sm")
nltk.download('stopwords')
nltk.download('wordnet')
sw = set(stopwords.words('dutch'))

import re
import string
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def clean_text(x):
    # remove punctutation
    x = x.translate(str.maketrans('','',string.punctuation))
    #remove digits
    x = x.translate(str.maketrans('','',string.digits))
    # tokenize for stopwords
    x = word_tokenize(x.lower())
    x = [lemmatizer.lemmatize(token) for token in x if token not in sw]
    x = ' '.join([w for w in x])
    return x

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mike\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mike\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
description_cleaned = description.progress_apply(clean_text)

100%|████████████████████████████████████████████████████████████████████████| 132013/132013 [00:44<00:00, 2965.11it/s]


In [9]:
print(description_cleaned)

0         achtste aflevering onze nieuwe dagelijkse seri...
1         alle inschrijvingen slechts geselecteerd audit...
2                                                          
3                                                          
4         gebeurt inwoners klein zeeuws dorp plotseling ...
                                ...                        
132008    vernielingen diefstal aanrandingen niemand vei...
132009                                                     
132010    sjezen vliegtuig halen zulke tsunami zweten ze...
132011                                                     
132012    spectaculaire beelden zwolle waar weekend twee...
Name: longSummary, Length: 132013, dtype: object


In [10]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=25)
tf_idf = vectorizer.fit_transform(description_cleaned)
columns=vectorizer.get_feature_names_out()

In [11]:
from sklearn.decomposition import LatentDirichletAllocation
n_topics = 15
lda_npo = LatentDirichletAllocation(n_components=n_topics, learning_method='online', n_jobs=16, verbose=1)
lda_npo.fit(tf_idf)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [12]:
terms = columns
def df_lda(model,terms,n_topics):
    df = pd.DataFrame(model.components_, columns=terms)
    df = df.T
    print(df)
    for topic in range(n_topics):
        df_topic = df[topic]
        print('topic',topic,'top words are:\n',df[topic].sort_values(ascending=False)[:15])

In [13]:
print(df_lda(lda_npo, terms,n_topics))

                    0         1         2         3          4           5  \
aa           0.100000  0.100003  0.100000  0.100019   0.100040    9.129469   
aad          0.100006  0.100001  0.100000  0.100056   0.100045   11.970907   
aanbevolen   0.100000  0.100000  0.100000  0.100011   0.100062    0.100002   
aanbieden    0.100030  0.100003  0.100003  0.100079   0.100126    0.100068   
aanbod       0.100047  0.100002  0.100029  0.100011  21.084758    0.100041   
...               ...       ...       ...       ...        ...         ...   
één          0.100007  0.100046  0.100004  0.100011  15.684806  481.629337   
óf           0.100019  0.100032  0.100004  0.100034   0.100139    0.100119   
óók          0.100002  0.100001  0.100061  0.100008  14.761070    0.100097   
özcan       32.364754  0.100002  0.100003  0.100010   0.100011    0.100009   
überhaupt    0.100004  0.100001  0.100000  0.100022   0.100041   11.307490   

                   6           7         8         9  
aa      

In [14]:
# topics
"""
1: culture
2: 
3: 
4: children programming
5: 
6: 
7:
8:
9: 
10: 
"""


'\n1: culture\n2: \n3: \n4: children programming\n5: \n6: \n7:\n8:\n9: \n10: \n'

In [15]:
# find the most likely topic for each instance in the dataframe
topic_dist = lda_npo.transform(tf_idf)

clusters = topic_dist.argmax(axis=1)

data = {
  'description': description_cleaned,
  'topic': clusters
}
cluster_df = pd.DataFrame(data)

In [16]:
cluster_df['subtopic'] = None
print(cluster_df)

                                              description  topic subtopic
0       achtste aflevering onze nieuwe dagelijkse seri...      5     None
1       alle inschrijvingen slechts geselecteerd audit...      5     None
2                                                              0     None
3                                                              0     None
4       gebeurt inwoners klein zeeuws dorp plotseling ...      4     None
...                                                   ...    ...      ...
132008  vernielingen diefstal aanrandingen niemand vei...      4     None
132009                                                         0     None
132010  sjezen vliegtuig halen zulke tsunami zweten ze...      5     None
132011                                                         0     None
132012  spectaculaire beelden zwolle waar weekend twee...      5     None

[132013 rows x 3 columns]


In [17]:
data_df = {
    'topic': [-1,-1],
    'subtopic': [-1,-1],
    'words': [['letter','car'],['letter','car']]
}
words_subtopic = pd.DataFrame(data_df)

def sub_topic(cluster_df,max_sub_topic_amount):
    for i in range(n_topics):
        # get the right instances
        cluster = cluster_df[cluster_df['topic']==i]
        sub_topic_amount = min(len(cluster)/20,max_sub_topic_amount)
        description = cluster['description']
        # create tf_idf
        vectorizer = TfidfVectorizer(min_df=10)
        tf_idf = vectorizer.fit_transform(description)
        columns=vectorizer.get_feature_names_out()
        # make lda and the clusters
        lda = LatentDirichletAllocation(n_components=sub_topic_amount, learning_method='online', n_jobs=16, verbose=1)
        lda.fit(tf_idf)
        df_subtopic = pd.DataFrame(lda.components_, columns=columns)
        df_subtopic = df_subtopic.T
        for topic in range(sub_topic_amount):
            df_topic = df_subtopic[topic]
            most_words = df_topic.sort_values(ascending=False)[:10].index.tolist()
            entry = [i,topic,most_words]
            words_subtopic.loc[len(words_subtopic)] = entry
        topic_dist = lda.transform(tf_idf)
        clusters = topic_dist.argmax(axis=1)
        #input the cluster to the right topic cluster
        cluster_df['subtopic'][cluster_df['topic']==i] = clusters
        

In [18]:
sub_topic(cluster_df,10)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_df['subtopic'][cluster_df['topic']==i] = clusters


iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
iteration: 1 of max_iter: 10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_df['subtopic'][cluster_df['topic']==i] = clusters


iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_df['subtopic'][cluster_df['topic']==i] = clusters


iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_df['subtopic'][cluster_df['topic']==i] = clusters


iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_df['subtopic'][cluster_df['topic']==i] = clusters


iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_df['subtopic'][cluster_df['topic']==i] = clusters


iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_df['subtopic'][cluster_df['topic']==i] = clusters


iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_df['subtopic'][cluster_df['topic']==i] = clusters


iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_df['subtopic'][cluster_df['topic']==i] = clusters


iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_df['subtopic'][cluster_df['topic']==i] = clusters


In [19]:
print(words_subtopic)

     topic  subtopic                                              words
0       -1        -1                                      [letter, car]
1       -1        -1                                      [letter, car]
2        0         0  [kleianimatieserie, kleine, pinguïn, pingu, co...
3        0         1  [pingu, pinguïn, kleine, kleianimatieserie, ho...
4        0         2  [compilatie, fragmenten, hoogtepunten, afgelop...
..     ...       ...                                                ...
97       9         5  [beam, joram, uitzending, aansluitend, kaat, o...
98       9         6  [liveverslag, songfestival, junior, finale, ze...
99       9         7  [dag, interview, belangrijkste, sportwedstrijd...
100      9         8  [latenight, talkshow, muziek, parijs, open, fr...
101      9         9  [nieuwskwartier, clip, nieuwe, sterrennl, hit,...

[102 rows x 3 columns]


In [25]:
words_subtopic.to_csv('topic_words.csv')

In [20]:
print(cluster_df)

                                              description  topic subtopic
0       achtste aflevering onze nieuwe dagelijkse seri...      5        9
1       alle inschrijvingen slechts geselecteerd audit...      5        9
2                                                              0        0
3                                                              0        0
4       gebeurt inwoners klein zeeuws dorp plotseling ...      4        4
...                                                   ...    ...      ...
132008  vernielingen diefstal aanrandingen niemand vei...      4        4
132009                                                         0        0
132010  sjezen vliegtuig halen zulke tsunami zweten ze...      5        4
132011                                                         0        0
132012  spectaculaire beelden zwolle waar weekend twee...      5        9

[132013 rows x 3 columns]


In [21]:
sum(cluster_df['subtopic'].isna())

0

In [22]:
npo['topic'] = cluster_df['topic']

In [23]:
npo['subtopic'] = cluster_df['subtopic']

In [24]:
npo.to_csv('NPO_topics.csv')