# Koder brugt i video 1-6 til Flipped Classroom 2

## Alternative tokenizers

### stanza - med det hele

In [None]:
# Pakker

import stanza
import pandas as pd
from nltk.corpus import stopwords

# Download ressourcer
#nltk.download('stopwords')
#stanza.download('da')

In [None]:
# Load data
# The URL points at a zipped CSV; no compression argument is passed, so
# read_csv is left to work out the compression on its own.

redditdata_url = "https://raw.githubusercontent.com/CALDISS-AAU/course_ndms-I/master/datasets/reddit_rdenmark_q=danmark_01012020-15032021_long.zip"
reddit_df = pd.read_csv(redditdata_url)

In [None]:
# Filter data

from datetime import datetime, timezone

# The bounds are compared against 'post_created_utc' (assumed, from its name,
# to hold epoch seconds in UTC), so they are built as UTC-aware datetimes.
# A naive datetime's .timestamp() is interpreted in the machine's local time
# zone, which would shift the window depending on where the notebook runs.
filter_start = int(datetime(2020, 1, 1, 0, 0, tzinfo=timezone.utc).timestamp())
filter_end = int(datetime(2020, 7, 1, 0, 0, tzinfo=timezone.utc).timestamp())

# Keep rows whose post has more than 5 comments
reddit_df = reddit_df.loc[reddit_df['post_num_comments'].astype(int) > 5, :]
# Keep rows whose post was created inside [filter_start, filter_end)
reddit_df = reddit_df.loc[(reddit_df['post_created_utc'] >= filter_start) & (reddit_df['post_created_utc'] < filter_end), :]
# Keep rows whose comment body is longer than 30 characters. The explicit
# .copy() makes the result an independent frame, so adding columns to
# reddit_df later does not operate on a slice of the unfiltered data.
reddit_df = reddit_df.loc[reddit_df['comment_body'].str.len() > 30, :].copy()

reddit_df.shape

In [None]:
# Definer tokenizer

nlp = stanza.Pipeline('da')

def tokenizer_stanza(text): # Defines the function from the code shown earlier
    """Return the lemmas of proper nouns, adjectives and nouns in ``text``.

    Runs the module-level ``nlp`` pipeline on the text (it must be bound to
    the stanza pipeline when this is called), skips lemmas shorter than two
    characters and lemmas found in NLTK's Danish stop-word list.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lemma strings in document order.
    """
    # Sets give constant-time membership tests; the stop words used to be a
    # list that was scanned linearly for every single word.
    stop_words = set(stopwords.words('danish'))
    pos_tags = {'PROPN', 'ADJ', 'NOUN'}

    doc = nlp(text)

    return [
        word.lemma
        for sentence in doc.sentences
        for word in sentence.words
        if len(word.lemma) >= 2
        and word.pos in pos_tags
        and word.lemma not in stop_words
    ]

In [None]:
# Tokenize a sample of 100 rows; the fixed random_state makes the draw repeatable
reddit_sample = reddit_df.sample(100, random_state = 142)
reddit_sample['tokens'] = reddit_sample['comment_body'].apply(tokenizer_stanza)

In [None]:
reddit_sample['tokens'].head()

### stanza - kun tokenizer

In [None]:
# Definer tokenizer

nlp = stanza.Pipeline('da', processors = 'tokenize')

def tokenizer_stanza_simple(text): # Defines the function from the code shown earlier
    """Return the lower-cased words of ``text`` with stop words removed.

    Runs the module-level ``nlp`` pipeline on the text (it must be bound to
    the stanza pipeline when this is called). Words shorter than two
    characters and words in NLTK's Danish stop-word list are left out.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-cased word strings in document order.
    """
    danish_stops = list(stopwords.words('danish'))

    parsed = nlp(text)

    # The length check is made on the word as written, before lower-casing.
    lowered_words = (
        w.text.lower()
        for sent in parsed.sentences
        for w in sent.words
        if len(w.text) >= 2
    )

    return [candidate for candidate in lowered_words if candidate not in danish_stops]

In [None]:
# Tokenize a sample of 100 rows; the fixed random_state makes the draw repeatable
reddit_sample = reddit_df.sample(100, random_state = 142)
reddit_sample['tokens'] = reddit_sample['comment_body'].apply(tokenizer_stanza_simple)

In [None]:
reddit_sample['tokens'].head()

### Spacy - Kun tokenizer

In [1]:
import spacy

#!python -m spacy download da_core_news_sm # download sprogmodel

In [2]:
nlp = spacy.load("da_core_news_sm")

list(nlp.Defaults.stop_words)

['jer',
 'efter',
 'hvorefter',
 'gør',
 'være',
 'her',
 'for',
 'lavet',
 'vi',
 'ved',
 'derfra',
 'men',
 'vær',
 'det',
 'ham',
 'ses',
 'samme',
 'blive',
 'sammen',
 'hvornår',
 'ene',
 'jo',
 'jeg',
 'mest',
 'hvilken',
 'anden',
 'allerede',
 'skal',
 'selv',
 'min',
 'mange',
 'alle',
 'dens',
 'en',
 'ingen',
 'forrige',
 'bliver',
 'ligesom',
 'under',
 'mere',
 'i',
 'vil',
 'har',
 'lav',
 'over',
 'egen',
 'tidligere',
 'du',
 'tilbage',
 'måske',
 'henover',
 'den',
 'dermed',
 'hvem',
 'havde',
 'ny',
 'mine',
 'end',
 'derpå',
 'kom',
 'via',
 'hendes',
 'dine',
 'flere',
 'nogensinde',
 'jeres',
 'ud',
 'var',
 'flest',
 'kan',
 'langs',
 'mig',
 'nogle',
 'god',
 'alligevel',
 'kommer',
 'derefter',
 'derfor',
 'og',
 'hans',
 'mit',
 'herefter',
 'derved',
 'hver',
 'dem',
 'mindre',
 'næste',
 'lad',
 'lidt',
 'begge',
 'mens',
 'fordi',
 'hermed',
 'intet',
 'gøre',
 'de',
 'kun',
 'lille',
 'nær',
 'lige',
 'eneste',
 'gennem',
 'heller',
 'enten',
 'hvor',
 'så',
 ...]

In [None]:
# Definer tokenizer funktion 

nlp = spacy.load("da_core_news_sm")

def tokenizer_spacy_simple(text):
    """Return the lower-cased tokens of ``text`` with stop words removed.

    Uses only the tokenizer of the module-level ``nlp`` object (it must be
    bound to the spaCy pipeline when this is called). Tokens shorter than two
    characters and tokens in ``nlp.Defaults.stop_words`` are left out.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-cased token strings in document order.
    """
    # Keep the stop words in a set: membership tests are constant-time,
    # whereas the list used before was scanned linearly for every token.
    stop_words = set(nlp.Defaults.stop_words)

    doc = nlp.tokenizer(text)

    tokens = []

    for word in doc:
        # The length check is made on the token as written, before lower-casing.
        if len(word.text) < 2:
            continue
        lowered = word.text.lower()
        if lowered not in stop_words:
            tokens.append(lowered)

    return tokens

In [None]:
# Tokenize a sample of 100 rows; the fixed random_state makes the draw repeatable
reddit_sample = reddit_df.sample(100, random_state = 142)
reddit_sample['tokens'] = reddit_sample['comment_body'].apply(tokenizer_spacy_simple)

In [None]:
reddit_sample['tokens'].head()

### sklearn tokenizer (fra CountVectorizer)

In [None]:
# Definer tokenizer funktion

from sklearn.feature_extraction.text import CountVectorizer

tokenizer = CountVectorizer().build_tokenizer()

def tokenizer_sklearn(text):
    """Return the lower-cased tokens of ``text`` with stop words removed.

    Splits the text with the module-level ``tokenizer`` callable and filters
    against ``nlp.Defaults.stop_words``, so the module-level ``nlp`` must be
    bound to the spaCy pipeline when this is called. Tokens shorter than two
    characters are left out.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-cased token strings in the order they were produced.
    """
    spacy_stops = list(nlp.Defaults.stop_words)

    kept = []
    for raw_token in tokenizer(text):
        lowered = raw_token.lower()
        # The length check is made on the token as produced, before lower-casing.
        if len(raw_token) >= 2 and lowered not in spacy_stops:
            kept.append(lowered)

    return kept

In [None]:
# Tokenize a sample of 100 rows; the fixed random_state makes the draw repeatable
reddit_sample = reddit_df.sample(100, random_state = 142)
reddit_sample['tokens'] = reddit_sample['comment_body'].apply(tokenizer_sklearn)

In [None]:
reddit_sample['tokens'].head()

### Sammenligning af tokenizers

Nedenstående kode opretter en test-funktion for hver tokenizer. Hver test-funktion kører tokenizeren på 100 reddit-kommentarer og udskriver, hvor lang tid tokenization tager i sekunder.

In [None]:
import stanza
import spacy
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import time

# Build every pipeline/tokenizer once, under its own name, so the functions
# below can take them as default arguments instead of sharing a single `nlp`.
nlp_stanza = stanza.Pipeline('da')  # stanza, no processors argument given
nlp_stanza_simple = stanza.Pipeline('da', processors = 'tokenize')  # stanza, tokenization only
nlp_spacy_simple = spacy.load("da_core_news_sm")  # also the source of the stop-word list used below
sklearn_tokenizer = CountVectorizer().build_tokenizer()

def tokenizer_stanza(text, nlp = nlp_stanza): # Defines the function from the code shown earlier
    """Return the lemmas of proper nouns, adjectives and nouns in ``text``.

    Lemmas shorter than two characters and lemmas found in the stop-word list
    of ``nlp_spacy_simple`` are left out.

    Args:
        text: The string to tokenize.
        nlp: Callable applied to the text; its result is read through
            ``.sentences`` and ``.words``. Defaults to ``nlp_stanza``.

    Returns:
        A list of lemma strings in document order.
    """
    excluded = list(nlp_spacy_simple.Defaults.stop_words)
    wanted_pos = ['PROPN', 'ADJ', 'NOUN']

    parsed = nlp(text)

    return [
        w.lemma
        for sent in parsed.sentences
        for w in sent.words
        if len(w.lemma) >= 2 and w.pos in wanted_pos and w.lemma not in excluded
    ]


def tokenizer_stanza_simple(text, nlp = nlp_stanza_simple): # Defines the function from the code shown earlier
    """Return the lower-cased words of ``text`` with stop words removed.

    Words shorter than two characters and words found in the stop-word list
    of ``nlp_spacy_simple`` are left out.

    Args:
        text: The string to tokenize.
        nlp: Callable applied to the text; its result is read through
            ``.sentences`` and ``.words``. Defaults to ``nlp_stanza_simple``.

    Returns:
        A list of lower-cased word strings in document order.
    """
    spacy_stops = list(nlp_spacy_simple.Defaults.stop_words)

    parsed = nlp(text)

    kept = []
    for sent in parsed.sentences:
        for w in sent.words:
            lowered = w.text.lower()
            # The length check is made on the word as written, before lower-casing.
            if len(w.text) >= 2 and lowered not in spacy_stops:
                kept.append(lowered)

    return kept

def tokenizer_spacy_simple(text, nlp = nlp_spacy_simple): # Defines the function from the code shown earlier
    """Return the lower-cased tokens of ``text`` with stop words removed.

    Tokens shorter than two characters and tokens found in the stop-word list
    of ``nlp_spacy_simple`` are left out. Note that the stop words always come
    from ``nlp_spacy_simple``, not from the ``nlp`` argument.

    Args:
        text: The string to tokenize.
        nlp: Object whose ``.tokenizer`` is called on the text. Defaults to
            ``nlp_spacy_simple``.

    Returns:
        A list of lower-cased token strings in document order.
    """
    spacy_stops = list(nlp_spacy_simple.Defaults.stop_words)

    # The length check is made on the token as written, before lower-casing.
    long_enough = (tok.text for tok in nlp.tokenizer(text) if len(tok.text) >= 2)

    return [raw.lower() for raw in long_enough if raw.lower() not in spacy_stops]

def tokenizer_sklearn(text, tokenizer = sklearn_tokenizer):
    """Return the lower-cased tokens of ``text`` with stop words removed.

    Tokens shorter than two characters and tokens found in the stop-word list
    of ``nlp_spacy_simple`` are left out.

    Args:
        text: The string to tokenize.
        tokenizer: Callable that splits the text into an iterable of strings.
            Defaults to ``sklearn_tokenizer``.

    Returns:
        A list of lower-cased token strings in the order they were produced.
    """
    spacy_stops = list(nlp_spacy_simple.Defaults.stop_words)

    # The length check is made on the token as produced, before lower-casing.
    lowered_words = [piece.lower() for piece in tokenizer(text) if len(piece) >= 2]

    return [piece for piece in lowered_words if piece not in spacy_stops]

def stanza_full_tester():
    """Time ``tokenizer_stanza`` on 100 sampled comments and print the result.

    Reads the module-level ``reddit_df``. Prints the elapsed seconds with two
    decimals and returns nothing.
    """
    # Draw the sample before starting the clock so only tokenization is timed.
    reddit_sample = reddit_df.sample(100, random_state = 142)

    # perf_counter() is the clock intended for measuring short durations;
    # time.time() follows the wall clock and can jump if it is adjusted.
    start_time = time.perf_counter()
    reddit_sample['comment_body'].apply(tokenizer_stanza)
    elapsed = time.perf_counter() - start_time

    print("stanza full: {0:.2f} seconds".format(elapsed))
    
def stanza_simple_tester():
    """Time ``tokenizer_stanza_simple`` on 100 sampled comments and print the result.

    Reads the module-level ``reddit_df``. Prints the elapsed seconds with two
    decimals and returns nothing.
    """
    # Draw the sample before starting the clock so only tokenization is timed.
    reddit_sample = reddit_df.sample(100, random_state = 142)

    # perf_counter() is the clock intended for measuring short durations;
    # time.time() follows the wall clock and can jump if it is adjusted.
    start_time = time.perf_counter()
    reddit_sample['comment_body'].apply(tokenizer_stanza_simple)
    elapsed = time.perf_counter() - start_time

    print("stanza simple: {0:.2f} seconds".format(elapsed))
    
def spacy_simple_tester():
    """Time ``tokenizer_spacy_simple`` on 100 sampled comments and print the result.

    Reads the module-level ``reddit_df``. Prints the elapsed seconds with two
    decimals and returns nothing.
    """
    # Draw the sample before starting the clock so only tokenization is timed.
    reddit_sample = reddit_df.sample(100, random_state = 142)

    # perf_counter() is the clock intended for measuring short durations;
    # time.time() follows the wall clock and can jump if it is adjusted.
    start_time = time.perf_counter()
    reddit_sample['comment_body'].apply(tokenizer_spacy_simple)
    elapsed = time.perf_counter() - start_time

    print("spacy simple: {0:.2f} seconds".format(elapsed))
    
def sklearn_tester():
    """Time ``tokenizer_sklearn`` on 100 sampled comments and print the result.

    Reads the module-level ``reddit_df``. Prints the elapsed seconds with two
    decimals and returns nothing.
    """
    # Draw the sample before starting the clock so only tokenization is timed.
    reddit_sample = reddit_df.sample(100, random_state = 142)

    # perf_counter() is the clock intended for measuring short durations;
    # time.time() follows the wall clock and can jump if it is adjusted.
    start_time = time.perf_counter()
    reddit_sample['comment_body'].apply(tokenizer_sklearn)
    elapsed = time.perf_counter() - start_time

    print("sklearn: {0:.2f} seconds".format(elapsed))

In [None]:
stanza_full_tester()
stanza_simple_tester()
spacy_simple_tester()
sklearn_tester()

## Ordoptælling med vectorizers

In [None]:
# Store the comment texts in a separate list

comments = list(reddit_df['comment_body'])

len(comments)

### CountVectorizer

In [None]:
# CountVectorizer on the comments - raw text

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
transformed_documents = vectorizer.fit_transform(comments)

# Convert the result of fit_transform to an array (via its .toarray() method)
transformed_documents_as_array = transformed_documents.toarray()

# Number of rows in the array
len(transformed_documents_as_array)

In [None]:
# Convert the array to a document-term matrix

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when it exists and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

df = pd.DataFrame(transformed_documents_as_array, columns = feature_names)

In [None]:
df.head()

In [None]:
# Count each word across all documents (column sums of the document-term matrix)

word_count = df.sum()

# Show the 20 largest counts
word_count.sort_values(ascending = False).head(20)

### CountVectorizer med stopord og dokumentgrænser

In [None]:
# Load spaCy in order to use its stop-word list

import spacy
nlp = spacy.load("da_core_news_sm")

custom_stops = ['gt', 'bare', 'the', 'to', 'når', 'https', 'helt', 'of', 'se', 'in', 'www', 'is', 'you', 'dk', 'får', 'com', 'ret', 'it', 'that', 'år', 'siger',
               'hele', 'går', 'ting', 'ser', 'del', 'vel', 'tage', 'set', 'are', 'be', 'not', 'but', 'amp']

stops = list(nlp.Defaults.stop_words) + custom_stops

# Configure the vectorizer: stop words, and the maximum share of documents a word may occur in (max. 70%)
vectorizer = CountVectorizer(stop_words = stops, max_df = 0.7)
transformed_documents = vectorizer.fit_transform(comments)

transformed_documents_as_array = transformed_documents.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when it exists and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Convert the array to a document-term matrix
df = pd.DataFrame(transformed_documents_as_array, columns = feature_names)

# Word counts
word_count = df.sum()
word_count.sort_values(ascending = False)[0:50]

## Alternativ vægtning af ord: Tf-idf

In [None]:
# Tf-idf vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
nlp = spacy.load("da_core_news_sm")

custom_stops = ['gt', 'bare', 'the', 'to', 'når', 'https', 'helt', 'of', 'se', 'in', 'www', 'is', 'you', 'dk', 'får', 'com', 'ret', 'it', 'that', 'år', 'siger',
               'hele', 'går', 'ting', 'ser', 'del', 'vel', 'tage', 'set', 'are', 'be', 'not', 'but', 'amp']

stops = list(nlp.Defaults.stop_words) + custom_stops

# Configure the tf-idf vectorizer - same settings as before.
# norm must be 'l1', 'l2' or None; None switches row normalization off.
# The previous value False is not an accepted option and is rejected by
# scikit-learn releases that validate constructor parameters.
vectorizer = TfidfVectorizer(stop_words = stops, max_df = 0.7, norm = None)
transformed_documents = vectorizer.fit_transform(comments)

transformed_documents_as_array = transformed_documents.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when it exists and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Convert the array to a document-term matrix
df = pd.DataFrame(transformed_documents_as_array, columns = feature_names)

# Sum of tf-idf weights per word
word_tfidfsum = df.sum()
word_tfidfsum.sort_values(ascending = False)[0:50]

## Tf-idf vectorizer på eksisterende tokens

In [None]:
# Funktion brugt til at tokenize data

import spacy
nlp = spacy.load("da_core_news_sm", disable = ['parser', 'ner', 'textcat'])

def tokenizer_spacy(text):
    """Return lower-cased lemmas of proper nouns, adjectives and nouns in ``text``.

    Runs the module-level ``nlp`` object on the text (it must be bound to the
    spaCy pipeline when this is called). Lemmas shorter than two characters
    are skipped, as are lemmas found in the spaCy stop-word list or in the
    custom stop-word list below.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-cased lemma strings in document order.
    """
    custom_stops = ['gt', 'bare', 'the', 'to', 'når', 'https', 'helt', 'of', 'se', 'in', 'www', 'is', 'you', 'dk', 'får', 'com', 'ret', 'it', 'that', 'år', 'siger',
               'hele', 'går', 'ting', 'ser', 'del', 'vel', 'tage', 'set', 'are', 'be', 'not', 'but', 'amp']
    # Sets give constant-time membership tests; the combined stop words used
    # to be a list that was scanned linearly for every token.
    stop_words = set(nlp.Defaults.stop_words).union(custom_stops)
    pos_tags = {'PROPN', 'ADJ', 'NOUN'}

    doc = nlp(text)

    tokens = []

    for word in doc:
        # "< 2" matches the other tokenizers in this file; the earlier
        # "== 1" test let empty lemmas through.
        if len(word.lemma_) < 2:
            continue
        lemma = word.lemma_.lower()
        if word.pos_ in pos_tags and lemma not in stop_words:
            tokens.append(lemma)

    return tokens

In [None]:
reddit_df['comment_tokens'] = reddit_df['comment_body'].apply(tokenizer_spacy)

In [None]:
# Danner kopi af data

reddit_df_tokenized = reddit_df.copy()

In [None]:
# Evt. indlæs allerede eksisterende tokenized data
#import ast
#reddit_df_tokenized = pd.read_csv("https://raw.githubusercontent.com/CALDISS-AAU/course_ndms-I/master/datasets/reddit_rdenmark_q=danmark_01012020-30062020_long_filtered_tokenized.zip")
#reddit_df_tokenized['tokens'] = reddit_df_tokenized['tokens'].apply(ast.literal_eval)

In [None]:
# Keep only the rows whose token list has more than one token
reddit_df_tokenized = reddit_df_tokenized.loc[reddit_df_tokenized['comment_tokens'].apply(lambda tokens: len(tokens) > 1), :]

# Store the token lists in a separate object
comments_tokens = list(reddit_df_tokenized['comment_tokens'])

In [None]:
# TfidfVectorizer on existing tokens
from sklearn.feature_extraction.text import TfidfVectorizer

# Dummy function - used as both tokenizer and preprocessor in the vectorizer
def return_tokens(tokens):
    """Return ``tokens`` unchanged."""
    return tokens

# Configure the vectorizer with the dummy function (it just hands the tokens
# back, since the data is already tokenized).
# norm must be 'l1', 'l2' or None; None switches row normalization off.
# The previous value False is not an accepted option and is rejected by
# scikit-learn releases that validate constructor parameters.
vectorizer = TfidfVectorizer(
    tokenizer=return_tokens,
    preprocessor=return_tokens,
    token_pattern=None,
    norm=None)

# Fit the vectorizer
transformed_documents = vectorizer.fit_transform(comments_tokens)

# Convert to array
transformed_documents_as_array = transformed_documents.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when it exists and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Convert to document-term matrix
df = pd.DataFrame(transformed_documents_as_array, columns = feature_names)

# Sum of tf-idf weights per word, largest first
word_tfidfsum = df.sum().sort_values(ascending = False)
word_tfidfsum[0:50]

## Fra tekst til features

### Dummy-variabel for upvoted/ikke upvoted

In [None]:
# Danner kopi af data

reddit_df_rf = reddit_df_tokenized.copy()

In [None]:
# Tjekker indhold af variabel "comment_score"
reddit_df_rf['comment_score'].head()

In [None]:
# Create the "upvoted" dummy: True when the score is above 1
# (a score of exactly 1 is therefore not counted as upvoted)
reddit_df_rf['comment_upvoted'] = reddit_df_rf['comment_score'] > 1

In [None]:
# Tjekker indhold af ny variabel
reddit_df_rf['comment_upvoted'].head()

In [None]:
# Optælling på ny variabel
reddit_df_rf['comment_upvoted'].value_counts()

In [None]:
# Create the "downvoted" dummy: True when the score is below 1
# (a score of exactly 1 is therefore not counted as downvoted)
reddit_df_rf['comment_downvoted'] = reddit_df_rf['comment_score'] < 1

# Count the values
reddit_df_rf['comment_downvoted'].value_counts()

### Fra tekst til dummies

In [None]:
# Danner ordliste af 50 mest hyppige ord baseret på tfidf fra tidligere
top_words = list(word_tfidfsum.index[0:50])

In [None]:
top_words

In [None]:
# For each word in top_words, add a 0/1 column telling whether the word occurs
# in the comment's token list
token_lists = reddit_df_rf['comment_tokens']
for word in top_words:
    # The column name is the word with the prefix "token_"
    colname = "token_{}".format(word)
    # The lambda is called within this iteration, so it sees the current word
    reddit_df_rf[colname] = token_lists.apply(lambda tokens: int(word in tokens))

In [None]:
reddit_df_rf.head()

In [None]:
# Tjek dummyvariable for tekst
[column for column in reddit_df_rf.columns if column.startswith('token_')]