# Alternative vægtning af ord: Tf-idf

In [1]:
# prep
import pandas as pd

redditdata_url = "https://raw.githubusercontent.com/CALDISS-AAU/course_ndms-I/master/datasets/reddit_rdenmark-comments_01032021-08032021_long.csv"
reddit_df = pd.read_csv(redditdata_url)

# Keep the comment texts in a separate object
comments = reddit_df['comment_body'].tolist()

In [2]:
# Tf-idf vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
nlp = spacy.load("da_core_news_sm")

# Extra stop words added on top of the pipeline's default stop-word list
custom_stops = ['gt', 'bare', 'the', 'to', 'når', 'https', 'helt', 'of', 'se', 'in', 'www', 'is', 'you', 'dk', 'får', 'com', 'ret', 'it', 'that', 'år', 'siger',
               'hele', 'går', 'ting', 'ser', 'del', 'vel', 'tage', 'set', 'are', 'be', 'not', 'but', 'amp']

stops = list(nlp.Defaults.stop_words) + custom_stops

# Configure the tf-idf vectorizer - same settings as before.
# norm=None switches off row normalisation; the documented values are
# 'l1', 'l2' or None, and newer scikit-learn releases reject norm=False.
vectorizer = TfidfVectorizer(stop_words = stops, max_df = 0.7, norm = None)
transformed_documents = vectorizer.fit_transform(comments)

# NOTE: toarray() materialises the full dense documents x terms matrix
transformed_documents_as_array = transformed_documents.toarray()

# Convert the array to a document-term matrix.
# get_feature_names_out() replaces get_feature_names(), which was removed
# in scikit-learn 1.2.
df = pd.DataFrame(transformed_documents_as_array, columns = vectorizer.get_feature_names_out())

# Word weighting: sum the tf-idf values of every term over all documents
word_tfidfsum = df.sum()
word_tfidfsum.sort_values(ascending = False).head(50)



and             1030.215344
godt             956.913994
0a               889.633384
folk             824.060511
tror             784.250825
danmark          702.441635
ja               673.972109
mener            576.901141
reddit           521.875816
post             521.061035
mennesker        508.005891
altså            506.789185
this             506.301846
10               505.611269
your             493.331144
was              460.064083
they             451.199899
nej              444.399787
gerne            444.041004
hudfarve         434.692586
with             433.242678
can              430.311353
tid              416.148416
or               401.557440
faktisk          397.597595
store            395.447939
gå               394.361830
gang             389.137133
finde            387.448525
youtube          386.872070
spørgsmål        383.697482
blevet           380.980331
penge            380.020513
on               379.686488
giver            364.751838
removed          364

## Tf-idf vectorizer på eksisterende tokens

In [3]:
# Function used to tokenize the data

import spacy
# Load the da_core_news_sm pipeline with the 'parser', 'ner' and 'textcat'
# components disabled; the tokenizer function below only reads each token's
# lemma_ and pos_ attributes.
nlp = spacy.load("da_core_news_sm", disable = ['parser', 'ner', 'textcat'])

def tokenizer_spacy(text):
    """Tokenize ``text`` with spaCy and return the lower-cased lemmas that are kept.

    The text is run through the module-level ``nlp`` pipeline. A token is
    kept when all of the following hold:

    * its lemma is not a single character,
    * its part-of-speech tag is PROPN, ADJ or NOUN,
    * its lower-cased lemma is not a stop word (the pipeline's default
      stop words plus the custom list defined below).

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Lower-cased lemmas of the kept tokens, in document order.
    """
    custom_stops = ['gt', 'bare', 'the', 'to', 'når', 'https', 'helt', 'of', 'se', 'in', 'www', 'is', 'you', 'dk', 'får', 'com', 'ret', 'it', 'that', 'år', 'siger',
               'hele', 'går', 'ting', 'ser', 'del', 'vel', 'tage', 'set', 'are', 'be', 'not', 'but', 'amp']
    # Sets give constant-time membership tests in the per-token loop below
    # (the list used previously was scanned linearly for every token).
    stop_words = set(nlp.Defaults.stop_words).union(custom_stops)
    pos_tags = {'PROPN', 'ADJ', 'NOUN'}

    tokens = []

    for word in nlp(text):
        lemma = word.lemma_
        # Drop single-character lemmas
        if len(lemma) == 1:
            continue
        lemma = lemma.lower()
        if word.pos_ in pos_tags and lemma not in stop_words:
            tokens.append(lemma)

    return tokens

In [4]:
# Tokenize every comment; each row gets the list of tokens returned by tokenizer_spacy
reddit_df['comment_tokens'] = reddit_df['comment_body'].map(tokenizer_spacy)

In [5]:
# Create a copy of the data to work on

reddit_df_tokenized = reddit_df.copy()

In [6]:
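# Optionally load already tokenized data instead of running the tokenizer
# NOTE(review): this snippet parses a column named 'tokens', while the following cells read 'comment_tokens' - align the column name before enabling it.
#import ast
#reddit_df_tokenized = pd.read_csv("https://raw.githubusercontent.com/CALDISS-AAU/course_ndms-I/master/datasets/reddit_rdenmark_q=danmark_01012020-30062020_long_filtered_tokenized.zip")
#reddit_df_tokenized['tokens'] = reddit_df_tokenized['tokens'].apply(ast.literal_eval)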

In [7]:
# Keep only the comments that have more than one token
reddit_df_tokenized = reddit_df_tokenized[reddit_df_tokenized['comment_tokens'].map(len) > 1]

# Store the token lists in a separate object
comments_tokens = reddit_df_tokenized['comment_tokens'].tolist()

In [8]:
# Tfidfvectorizer på tokens
from sklearn.feature_extraction.text import TfidfVectorizer

# Dummy function - used as the tokenizer function in the vectorizer
def return_tokens(tokens):
    """Return ``tokens`` unchanged (identity function)."""
    return tokens

# Configure the vectorizer with the dummy function as both tokenizer and
# preprocessor (it just returns the tokens, since the data is already tokenized).
# norm=None switches off row normalisation; the documented values are
# 'l1', 'l2' or None, and newer scikit-learn releases reject norm=False.
vectorizer = TfidfVectorizer(
    tokenizer=return_tokens,
    preprocessor=return_tokens,
    token_pattern=None,
    norm=None)

# Fit the vectorizer
transformed_documents = vectorizer.fit_transform(comments_tokens)

# Convert to array
# NOTE: toarray() materialises the full dense documents x terms matrix
transformed_documents_as_array = transformed_documents.toarray()

# Convert to a document-term matrix.
# get_feature_names_out() replaces get_feature_names(), which was removed
# in scikit-learn 1.2.
df = pd.DataFrame(transformed_documents_as_array, columns=vectorizer.get_feature_names_out())

# Word weighting: sum the tf-idf values of every term over all documents
word_tfidfsum = df.sum().sort_values(ascending=False)
word_tfidfsum.head(50)



stor           901.406928
mangen         838.125205
folk           837.013857
danmark        669.056656
gang           578.774272
dag            555.953102
tid            544.925037
menneske       533.505473
sted           526.474988
problem        514.573896
land           508.198903
dansk          501.083232
post           489.244964
this           485.154182
megen          437.335389
penge          433.956820
:)             423.219724
måde           392.845121
spørgsmål      381.216330
hudfarve       377.147660
your           376.522772
barn           374.841476
can            363.060509
enig           362.971874
person         353.184309
they           346.897431
kommentar      338.380978
racisme        330.677148
woke           322.475365
rette          317.033151
parre          313.326193
with           310.564095
side           309.919958
tak            304.622722
sidste         302.260032
måned          294.094532
parti          292.922775
forhold        291.092509
if          