# Koder brugt i video 1-6 til Flipped Classroom 2

## Alternative tokenizers

### stanza - med det hele

In [1]:
# Pakker

import stanza
import pandas as pd
from nltk.corpus import stopwords

# Download ressourcer
#nltk.download('stopwords')
#stanza.download('da')

In [2]:
# Indlæs data

redditdata_url = "https://raw.githubusercontent.com/CALDISS-AAU/course_ndms-I/master/datasets/reddit_rdenmark_q=danmark_01012020-15032021_long.zip"
reddit_df = pd.read_csv(redditdata_url)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Filtrer data

from datetime import datetime

filter_start = int(datetime(2020,1,1,0,0).timestamp())
filter_end = int(datetime(2020,7,1,0,0).timestamp())

reddit_df = reddit_df.loc[reddit_df['post_num_comments'].astype(int) > 5, :]
reddit_df = reddit_df.loc[(reddit_df['post_created_utc'] >= filter_start) & (reddit_df['post_created_utc'] < filter_end), :]
reddit_df = reddit_df.loc[reddit_df['comment_body'].str.len() > 30, :]

reddit_df.shape

(27918, 52)

In [76]:
# Definer tokenizer

nlp = stanza.Pipeline('da')

def tokenizer_stanza(text): # Definerer funktion ud fra koden fra tidligere    
    
    stop_words = list(stopwords.words('danish'))
    pos_tags = ['PROPN', 'ADJ', 'NOUN']

    doc = nlp(text)

    tokens = []

    for sentence in doc.sentences:
        for word in sentence.words:
            if (len(word.lemma) < 2):
                continue
            if (word.pos in pos_tags) and (word.lemma not in stop_words):
                tokens.append(word.lemma)
                
    return(tokens)

2021-03-18 11:07:18 INFO: Loading these models for language: da (Danish):
| Processor | Package |
-----------------------
| tokenize  | ddt     |
| pos       | ddt     |
| lemma     | ddt     |
| depparse  | ddt     |

2021-03-18 11:07:18 INFO: Use device: cpu
2021-03-18 11:07:18 INFO: Loading: tokenize
2021-03-18 11:07:18 INFO: Loading: pos
2021-03-18 11:07:19 INFO: Loading: lemma
2021-03-18 11:07:19 INFO: Loading: depparse
2021-03-18 11:07:20 INFO: Done loading processors!


In [77]:
reddit_sample = reddit_df.sample(100, random_state = 142)
reddit_sample['tokens'] = reddit_sample['comment_body'].apply(tokenizer_stanza)

In [78]:
reddit_sample['tokens'].head()

15382      [&gt, sjov, spøg, spøg, alvor, del, Piet, Hein]
19594                                               [Dude]
8085     [lillebrors, forhold, enig, kraftig, forholdsr...
2113     [bund, artikel, sag, sjælden, Danmark, sen, sa...
16826                                   [glad, BIG, These]
Name: tokens, dtype: object

### stanza - kun tokenizer

In [79]:
# Definer tokenizer

nlp = stanza.Pipeline('da', processors = 'tokenize')

def tokenizer_stanza_simple(text): # Definerer funktion ud fra koden fra tidligere
    
    stop_words = list(stopwords.words('danish'))

    doc = nlp(text)

    tokens = []

    for sentence in doc.sentences:
        for word in sentence.words:
            if (len(word.text) < 2):
                continue
            if word.text.lower() not in stop_words:
                tokens.append(word.text.lower())
                
    return(tokens)

2021-03-18 11:07:38 INFO: Loading these models for language: da (Danish):
| Processor | Package |
-----------------------
| tokenize  | ddt     |

2021-03-18 11:07:38 INFO: Use device: cpu
2021-03-18 11:07:38 INFO: Loading: tokenize
2021-03-18 11:07:38 INFO: Done loading processors!


In [80]:
reddit_sample = reddit_df.sample(100, random_state = 142)
reddit_sample['tokens'] = reddit_sample['comment_body'].apply(tokenizer_stanza_simple)

In [81]:
reddit_sample['tokens'].head()

15382    [&gt, ingen, virker, sjove, kun, tar, spøg, sp...
19594    [dude, you’re, such, huge, influence, and, fac...
8085     [wow, fejrer, lige, lillebrors, ødelagte, forh...
2113     [læser, bunden, artiklen, så, sager, sjældne, ...
16826    [glad, you, like, it, here, agree, on, your, t...
Name: tokens, dtype: object

### Spacy - Kun tokenizer

In [82]:
import spacy

#!python -m spacy download da_core_news_sm # download sprogmodel

In [83]:
# Definer tokenizer funktion 

nlp = spacy.load("da_core_news_sm")

def tokenizer_spacy_simple(text):
    
    stop_words = list(nlp.Defaults.stop_words)

    doc = nlp.tokenizer(text)

    tokens = []

    for word in doc:
        if (len(word.text) < 2):
            continue
        if word.text.lower() not in stop_words:
            tokens.append(word.text.lower())

    return(tokens)

In [84]:
reddit_sample = reddit_df.sample(100, random_state = 142)
reddit_sample['tokens'] = reddit_sample['comment_body'].apply(tokenizer_spacy_simple)

In [85]:
reddit_sample['tokens'].head()

15382    [gt, ingen, virker, sjove, \n\n, kun,  \n, tar...
19594    [dude, you, re, such, huge, influence, and, fa...
8085     [wow, fejrer, lige, lillebrors, ødelagte, forh...
2113     [læser, bunden, artiklen, så, sager, sjældne, ...
16826    [glad, you, like, it, here, agree, on, your, t...
Name: tokens, dtype: object

### sklearn tokenizer (fra CountVectorizer)

In [86]:
# Definer tokenizer funktion

from sklearn.feature_extraction.text import CountVectorizer

tokenizer = CountVectorizer().build_tokenizer()

def tokenizer_sklearn(text):
    stop_words = list(nlp.Defaults.stop_words)
    
    words = tokenizer(text)
    
    tokens = []
    
    for word in words:
        if (len(word) < 2):
            continue
        if word.lower() not in stop_words:
            tokens.append(word.lower())
    
    return(tokens)

In [87]:
reddit_sample = reddit_df.sample(100, random_state = 142)
reddit_sample['tokens'] = reddit_sample['comment_body'].apply(tokenizer_sklearn)

In [88]:
reddit_sample['tokens'].head()

15382    [gt, ingen, virker, sjove, kun, tar, spøg, spø...
19594    [dude, you, re, such, huge, influence, and, fa...
8085     [wow, fejrer, lige, lillebrors, ødelagte, forh...
2113     [læser, bunden, artiklen, så, sager, sjældne, ...
16826    [glad, you, like, it, here, agree, on, your, t...
Name: tokens, dtype: object

### Sammenligning af tokenizers

Nedenstående kode opretter test-funktion for hver tokenizer. Test-funktionen kører tokenizeren på 100 reddit posts og viser, hvor lang tid tokenization tager i sekunder.

In [89]:
import stanza
import spacy
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import time

nlp_stanza = stanza.Pipeline('da')
nlp_stanza_simple = stanza.Pipeline('da', processors = 'tokenize')
nlp_spacy_simple = spacy.load("da_core_news_sm")
sklearn_tokenizer = CountVectorizer().build_tokenizer()

def tokenizer_stanza(text, nlp = nlp_stanza): # Definerer funktion ud fra koden fra tidligere    

    stop_words = list(nlp.Defaults.stop_words)
    pos_tags = ['PROPN', 'ADJ', 'NOUN']

    doc = nlp(text)

    tokens = []

    for sentence in doc.sentences:
        for word in sentence.words:
            if (len(word.lemma) < 2):
                continue
            if (word.pos in pos_tags) and (word.lemma not in stop_words):
                tokens.append(word.lemma)

    return(tokens)


def tokenizer_stanza_simple(text, nlp = nlp_stanza_simple): # Definerer funktion ud fra koden fra tidligere
    
    stop_words = list(nlp.Defaults.stop_words)

    doc = nlp(text)

    tokens = []

    for sentence in doc.sentences:
        for word in sentence.words:
            if (len(word.text) < 2):
                continue
            if word.text.lower() not in stop_words:
                tokens.append(word.text.lower())
                
    return(tokens)

def tokenizer_spacy_simple(text, nlp = nlp_spacy_simple): # Definerer funktion ud fra koden fra tidligere
    
    stop_words = list(nlp.Defaults.stop_words)

    doc = nlp.tokenizer(text)

    tokens = []

    for word in doc:
        if (len(word.text) < 2):
            continue
        if word.text.lower() not in stop_words:
            tokens.append(word.text.lower())

    return(tokens)

def tokenizer_sklearn(text, tokenizer = sklearn_tokenizer):
    stop_words = list(nlp.Defaults.stop_words)
    
    words = tokenizer(text)
    
    tokens = []
    
    for word in words:
        if (len(word) < 2):
            continue
        if word.lower() not in stop_words:
            tokens.append(word.lower())
    
    return(tokens)

def stanza_full_tester():
    start_time = time.time()
    
    reddit_sample = reddit_df.sample(100, random_state = 142)
    reddit_sample['tokens'] = reddit_sample['comment_body'].apply(tokenizer_stanza)
    
    print("stanza full: {0:.2f} seconds".format(time.time()-start_time))
    
def stanza_simple_tester():
    start_time = time.time()
    
    reddit_sample = reddit_df.sample(100, random_state = 142)
    reddit_sample['tokens'] = reddit_sample['comment_body'].apply(tokenizer_stanza_simple)
    
    print("stanza simple: {0:.2f} seconds".format(time.time()-start_time))
    
def spacy_simple_tester():
    start_time = time.time()
    
    reddit_sample = reddit_df.sample(100, random_state = 142)
    reddit_sample['tokens'] = reddit_sample['comment_body'].apply(tokenizer_spacy_simple)
    
    print("spacy simple: {0:.2f} seconds".format(time.time()-start_time))
    
def sklearn_tester():
    start_time = time.time()
          
    reddit_sample = reddit_df.sample(100, random_state = 142)
    reddit_sample['tokens'] = reddit_sample['comment_body'].apply(tokenizer_sklearn)
    
    print("sklearn: {0:.2f} seconds".format(time.time()-start_time))

2021-03-18 11:07:46 INFO: Loading these models for language: da (Danish):
| Processor | Package |
-----------------------
| tokenize  | ddt     |
| pos       | ddt     |
| lemma     | ddt     |
| depparse  | ddt     |

2021-03-18 11:07:46 INFO: Use device: cpu
2021-03-18 11:07:46 INFO: Loading: tokenize
2021-03-18 11:07:46 INFO: Loading: pos
2021-03-18 11:07:46 INFO: Loading: lemma
2021-03-18 11:07:46 INFO: Loading: depparse
2021-03-18 11:07:47 INFO: Done loading processors!
2021-03-18 11:07:47 INFO: Loading these models for language: da (Danish):
| Processor | Package |
-----------------------
| tokenize  | ddt     |

2021-03-18 11:07:47 INFO: Use device: cpu
2021-03-18 11:07:47 INFO: Loading: tokenize
2021-03-18 11:07:47 INFO: Done loading processors!


In [90]:
stanza_full_tester()
stanza_simple_tester()
spacy_simple_tester()
sklearn_tester()

stanza full: 17.05 seconds
stanza simple: 5.80 seconds
spacy simple: 0.10 seconds
sklearn: 0.03 seconds


## Ordoptælling med vectorizers

In [91]:
# Lagr kommentarer i objekt for sig

comments = list(reddit_df['comment_body'])

len(comments)

### CountVectorizer

In [93]:
# Countvectorizer på kommentarer - rå tekst

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
transformed_documents = vectorizer.fit_transform(comments)

# Konverter fittet vectorizer til array
transformed_documents_as_array = transformed_documents.toarray()

len(transformed_documents_as_array)

27918

In [94]:
# Konverter array til document-term matrix

df = pd.DataFrame(transformed_documents_as_array, columns = vectorizer.get_feature_names())

In [95]:
df.head()

Unnamed: 0,00,000,0000001,0000034,000008,000008_da,00001,0001,000km,000kr,...,światową,ƨi,ʇɉw,μm,ϱniʜɉγɿɘvǝ,ಠ_ಠ,加油,老外,ﾟヮﾟ,𝓷𝓲𝓬𝓮
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
# Optælling af ord på tværs af dokumenter

word_count = df.sum()
word_count.sort_values(ascending = False)[0:20]

er      43383
det     41008
at      38863
og      26005
ikke    23544
en      20117
jeg     19798
der     17374
på      15883
har     15509
til     15215
for     15075
så      14923
de      14857
af      12804
med     12352
du      12218
som     11294
kan     10505
den     10003
dtype: int64

### ConuntVectorizer med stopord og dokumentgrænser

In [109]:
# Indlæser spacy for at bruge spacy stopordsliste

import spacy
nlp = spacy.load("da_core_news_sm")

custom_stops = ['gt', 'bare', 'the', 'to', 'når', 'https', 'helt', 'of', 'se', 'in', 'www', 'is', 'you', 'dk', 'får', 'com', 'ret', 'it', 'that', 'år', 'siger',
               'hele', 'går', 'ting', 'ser', 'del', 'vel', 'tage', 'set', 'are', 'be', 'not', 'but', 'amp']

stops = list(nlp.Defaults.stop_words) + custom_stops

# Indstiller vectorizer - stopord og maksimalt antal dokumenter, ord må indgå i (max. 70%)
vectorizer = CountVectorizer(stop_words = stops, max_df = 0.7)
transformed_documents = vectorizer.fit_transform(comments)

transformed_documents_as_array = transformed_documents.toarray()

# Konverter array til document-term matrix
df = pd.DataFrame(transformed_documents_as_array, columns = vectorizer.get_feature_names())

# Ordoptælling
word_count = df.sum()
word_count.sort_values(ascending = False)[0:50]

danmark         3436
folk            3159
godt            2764
and             2219
tror            2110
ja              1740
mennesker       1293
mener           1241
usa             1118
nej             1097
danske          1089
racisme         1078
altså           1042
faktisk          996
bedre            988
tid              963
mod              926
dag              925
denmark          922
måde             908
giver            904
lande            896
langt            869
dansk            864
gå               843
penge            828
arbejde          823
komme            809
selvfølgelig     793
tager            790
blevet           786
eu               764
bruge            738
gang             730
gerne            730
hvilket          718
dit              716
finde            711
10               688
reddit           686
corona           682
stor             680
land             677
kina             638
står             634
on               629
virker           626
tak          

## Alternative vægtning af ord: Tf-idf

In [110]:
# Tf-idf vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
nlp = spacy.load("da_core_news_sm")

custom_stops = ['gt', 'bare', 'the', 'to', 'når', 'https', 'helt', 'of', 'se', 'in', 'www', 'is', 'you', 'dk', 'får', 'com', 'ret', 'it', 'that', 'år', 'siger',
               'hele', 'går', 'ting', 'ser', 'del', 'vel', 'tage', 'set', 'are', 'be', 'not', 'but', 'amp']

stops = list(nlp.Defaults.stop_words) + custom_stops

# Indstil tfidf vectorizer - samme indstillinger som før
vectorizer = TfidfVectorizer(stop_words = stops, max_df = 0.7, norm = False)
transformed_documents = vectorizer.fit_transform(comments)

transformed_documents_as_array = transformed_documents.toarray()

# Konverter array til document-term matrix
df = pd.DataFrame(transformed_documents_as_array, columns = vectorizer.get_feature_names())

# Ordoptælling
word_tfidfsum = df.sum()
word_tfidfsum.sort_values(ascending = False)[0:50]

danmark         11813.412720
folk            10874.034857
and              9584.653471
godt             9530.333742
tror             7848.233386
ja               6703.267100
mennesker        5559.575015
mener            5289.866531
racisme          5163.357913
usa              5010.086488
danske           4865.021211
nej              4738.334771
altså            4555.847990
faktisk          4414.718427
bedre            4413.811714
denmark          4320.475777
tid              4308.782903
mod              4258.206324
dag              4236.810999
lande            4170.913690
måde             4096.922420
giver            4095.301164
dansk            4057.372288
langt            3976.967716
eu               3940.046972
penge            3926.208068
arbejde          3900.016426
gå               3859.059115
komme            3725.494318
blevet           3669.822075
selvfølgelig     3669.776803
tager            3667.692931
bruge            3490.581350
reddit           3484.937455
gang          

## Tf-idf vectorizer på eksisterende tokens

In [125]:
# Funktion brugt til at tokenize data

import spacy
nlp = spacy.load("da_core_news_sm", disable = ['parser', 'ner', 'textcat'])

def tokenizer_spacy(text):
    custom_stops = ['gt', 'bare', 'the', 'to', 'når', 'https', 'helt', 'of', 'se', 'in', 'www', 'is', 'you', 'dk', 'får', 'com', 'ret', 'it', 'that', 'år', 'siger',
               'hele', 'går', 'ting', 'ser', 'del', 'vel', 'tage', 'set', 'are', 'be', 'not', 'but', 'amp']
    stop_words = list(nlp.Defaults.stop_words) + custom_stops
    pos_tags = ['PROPN', 'ADJ', 'NOUN']

    doc = nlp(text)

    tokens = []

    for word in doc:
        if (len(word.lemma_) == 1):
            continue
        if (word.pos_ in pos_tags) and (word.lemma_.lower() not in stop_words):
            tokens.append(word.lemma_.lower())
                
    return(tokens)

In [127]:
reddit_df['comment_tokens'] = reddit_df['comment_body'].apply(tokenizer_spacy)

In [143]:
# Danner kopi af data

reddit_df_tokenized = reddit_df.copy()

In [4]:
# Evt. indlæs allerede eksisterende tokenized data
#import ast
#reddit_df_tokenized = pd.read_csv("https://raw.githubusercontent.com/CALDISS-AAU/course_ndms-I/master/datasets/reddit_rdenmark_q=danmark_01012020-30062020_long_filtered_tokenized.zip")
#reddit_df_tokenized['tokens'] = reddit_df_tokenized['tokens'].apply(ast.literal_eval)

In [133]:
# Tokenize data
reddit_df_tokenized = reddit_df_tokenized.loc[reddit_df_tokenized['comment_tokens'].apply(lambda tokens: len(tokens) > 1), :]

# Lagr kommentarer for sig
comments_tokens = list(reddit_df_tokenized['comment_tokens'])

In [141]:
# Tfidfvectorizer på tokens
from sklearn.feature_extraction.text import TfidfVectorizer

# Dummyfunktion - bruges som tokenizer-funktion i vectorizer
def return_tokens(tokens):
    return tokens

# Indstiller vectorizer med brug af dummyfunktion (returnerer blot tokens, da data allerede er tokenized)
vectorizer = TfidfVectorizer(
    tokenizer=return_tokens,
    preprocessor=return_tokens,
    token_pattern=None,
    norm = False)

# Fitter vectorizer
transformed_documents = vectorizer.fit_transform(comments_tokens)

# Konverter til array
transformed_documents_as_array = transformed_documents.toarray()

# Konverter til document-term matrix
df = pd.DataFrame(transformed_documents_as_array, columns = vectorizer.get_feature_names())

# Ordoptælling
word_tfidfsum = df.sum().sort_values(ascending = False)
word_tfidfsum[0:50]

danmark       11864.229736
folk          11332.328445
mangen         8754.827562
land           8188.696520
stor           7813.605367
dansk          7267.325863
dag            6757.897987
menneske       6211.115318
tid            5665.105386
sted           5524.909331
problem        5522.579739
gang           5410.667171
racisme        5068.948012
usa            4853.015416
måde           4584.348719
megen          4562.519161
samfund        4542.027532
penge          4400.695436
dansker        4239.988622
person         4221.863569
denmark        3829.568910
mand           3779.783923
enig           3599.114109
verden         3389.727459
kina           3357.457118
grund          3345.828030
barn           3317.060691
rette          3262.258413
uge            3246.863997
arbejde        3198.537319
virksomhed     3134.775243
høj            3120.887727
eu             3098.205092
mening         3092.826105
side           3031.313043
tak            2992.252299
stat           2981.849240
p

## Fra tekst til features

### Dummy-variabel for upvoted/ikke upvoted

In [157]:
# Danner kopi af data

reddit_df_rf = reddit_df_tokenized.copy()

In [158]:
# Tjekker indhold af variabel "comment_score"
reddit_df_rf['comment_score'].head()

3    1
4    1
5    1
6    1
7    1
Name: comment_score, dtype: int64

In [159]:
# Danner dummy for upvoted
reddit_df_rf['comment_upvoted'] = reddit_df_rf['comment_score'] > 1

In [160]:
# Tjekker indhold af ny variabel
reddit_df_rf['comment_upvoted'].head()

3    False
4    False
5    False
6    False
7    False
Name: comment_upvoted, dtype: bool

In [162]:
# Optælling på ny variabel
reddit_df_rf['comment_upvoted'].value_counts()

False    23283
True      2827
Name: comment_upvoted, dtype: int64

In [163]:
# Variabel for downvoted
reddit_df_rf['comment_downvoted'] = reddit_df_rf['comment_score'] < 1

# Optælling
reddit_df_rf['comment_downvoted'].value_counts()

False    25442
True       668
Name: comment_downvoted, dtype: int64

### Fra tekst til dummies

In [164]:
# Danner ordliste af 50 mest hyppige ord baseret på tfidf fra tidligere
top_words = list(word_tfidfsum.index[0:50])

In [165]:
top_words

['danmark',
 'folk',
 'mangen',
 'land',
 'stor',
 'dansk',
 'dag',
 'menneske',
 'tid',
 'sted',
 'problem',
 'gang',
 'racisme',
 'usa',
 'måde',
 'megen',
 'samfund',
 'penge',
 'dansker',
 'person',
 'denmark',
 'mand',
 'enig',
 'verden',
 'kina',
 'grund',
 'barn',
 'rette',
 'uge',
 'arbejde',
 'virksomhed',
 'høj',
 'eu',
 'mening',
 'side',
 'tak',
 'stat',
 'parre',
 'situation',
 'sort',
 'forhold',
 'eksempel',
 'this',
 'sidste',
 'skatte',
 'sverige',
 'system',
 'lov',
 'by',
 'spørgsmål']

In [167]:
# Loop igennem hvert ord i topwords og dan dummyvariabel for hvorvidt ord indgår i kommentar eller ej (ud fra token-liste)
for word in top_words:
    colname = "token_{}".format(word) # Denne linje giver dummyvariabel for ord præfix "token_"
    reddit_df_rf[colname] = reddit_df_rf['comment_tokens'].apply(lambda tokens: int(word in tokens))

In [171]:
reddit_df_rf.head()

Unnamed: 0,post_author,post_created_utc,post_domain,post_full_link,post_gildings,post_id,post_is_original_content,post_is_reddit_media_domain,post_locked,post_media_only,...,token_forhold,token_eksempel,token_this,token_sidste,token_skatte,token_sverige,token_system,token_lov,token_by,token_spørgsmål
3,LotusLemmedasker,1577901072,self.Denmark,https://www.reddit.com/r/Denmark/comments/eil6...,{},eil6k2,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
4,LotusLemmedasker,1577901072,self.Denmark,https://www.reddit.com/r/Denmark/comments/eil6...,{},eil6k2,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
5,LotusLemmedasker,1577901072,self.Denmark,https://www.reddit.com/r/Denmark/comments/eil6...,{},eil6k2,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
6,LotusLemmedasker,1577901072,self.Denmark,https://www.reddit.com/r/Denmark/comments/eil6...,{},eil6k2,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
7,LotusLemmedasker,1577901072,self.Denmark,https://www.reddit.com/r/Denmark/comments/eil6...,{},eil6k2,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0


In [170]:
# Tjek dummyvariable for tekst
[column for column in reddit_df_rf.columns if column.startswith('token_')]

['token_danmark',
 'token_folk',
 'token_mangen',
 'token_land',
 'token_stor',
 'token_dansk',
 'token_dag',
 'token_menneske',
 'token_tid',
 'token_sted',
 'token_problem',
 'token_gang',
 'token_racisme',
 'token_usa',
 'token_måde',
 'token_megen',
 'token_samfund',
 'token_penge',
 'token_dansker',
 'token_person',
 'token_denmark',
 'token_mand',
 'token_enig',
 'token_verden',
 'token_kina',
 'token_grund',
 'token_barn',
 'token_rette',
 'token_uge',
 'token_arbejde',
 'token_virksomhed',
 'token_høj',
 'token_eu',
 'token_mening',
 'token_side',
 'token_tak',
 'token_stat',
 'token_parre',
 'token_situation',
 'token_sort',
 'token_forhold',
 'token_eksempel',
 'token_this',
 'token_sidste',
 'token_skatte',
 'token_sverige',
 'token_system',
 'token_lov',
 'token_by',
 'token_spørgsmål']