# 1. Data scraping

In [None]:
# To begin with, we install and import the twint library which we'll use to scrape Twitter data (tweets):

%pip install --upgrade git+https://github.com/kevctae/twint.git 
import twint

In [None]:
# A notebook already runs its own asyncio event loop, so twint's attempt to start
# another one fails with a RuntimeError. nest_asyncio patches asyncio to allow this:

import nest_asyncio
nest_asyncio.apply()

In [None]:
# We set the parameters of the search, specifying that we're interested 
# in Danish (da) tweets containing the hashtag "#metoo"


config = twint.Config()
config.Pandas = True      # keep the results in twint's pandas storage (read back later via twint.storage.panda.Tweets_df)
config.Search = "#metoo"  # the search term: tweets containing this hashtag
config.Lang = "da"        # restrict the search to Danish ("da") tweets

In [None]:
# Here we carry out the actual search for Danish tweets containing the hashtag "#metoo":

twint.run.Search(config)

In [None]:
# We install and import the pandas library in order to be able to save the tweets in a dataframe:

%pip install pandas

import pandas as pd

In [None]:
# This stores all the tweets + metainformation (username, time and date of tweets etc.) in a pandas dataframe:

df = twint.storage.panda.Tweets_df  

In [None]:
# Here we save the dataframe as a csv file:

df.to_csv('twitter_corpus.csv')

# 2. Data cleaning

In [None]:
# We install and import the necessary libraries needed to clean the tweets, including the 
# Danish lemmatization module "da_core_news_lg":

%pip install pandas
%pip install spacy

import pandas as pd
import spacy
import re
!python3 -m spacy download da_core_news_lg 

In [2]:
# df is the dataframe containing (among other things) the uncleaned tweets.
# The csv file was saved together with the dataframe index, so the first column is
# read back as the index instead of showing up as an extra "Unnamed: 0" column:

df = pd.read_csv('twitter_corpus.csv', index_col=0)

In [6]:
# We define a function that uses regular expressions to remove everything from the tweets 
# that we won't need in the actual topic modelling:


def remove_unnecessary_stuff(tweet):
    """Strip everything from a tweet that is not needed for the topic modelling.

    Mentions, links and hashtags are removed first, then every remaining
    character that is neither a word character (letters, digits, underscore)
    nor whitespace. Runs of spaces are collapsed and the result is trimmed.

    Args:
        tweet: The raw tweet text as a string.

    Returns:
        The cleaned tweet text. May be an empty string if nothing is left.
    """
    # The patterns are raw strings so that \S and \w reach the regex engine as
    # written; in a normal string literal they are invalid escape sequences.
    tweet = re.sub(r'@\S+', '', tweet)     # Removes all tags (where other Twitter users have been tagged)
    tweet = re.sub(r'http\S+', '', tweet)  # Removes all links
    tweet = re.sub(r'#\S+', '', tweet)     # Removes all hashtags
    tweet = re.sub(r'[^\w\s]', '', tweet)  # Removes punctuation, emojis etc. (keeps letters, digits, "_" and whitespace)
    tweet = re.sub(r' +', ' ', tweet)      # Collapses runs of spaces into one space (tabs and newlines are left alone)

    # Drop whitespace at the beginning and end of the tweet
    return tweet.strip()


# We define a function that lemmatizes all words in all tweets. 
# To do so, we need to first load the Danish lemmatization module "da_core_news_lg":

lemmas = spacy.load("da_core_news_lg")


def lemmatizer(tweet):
    """Return the tweet with every token replaced by its lemma, joined by single spaces."""
    doc = lemmas(tweet)
    return " ".join(token.lemma_ for token in doc)



# We now apply the function that removes all unnecessary stuff to the dataframe column containing the tweets:

df['tweet'] = df['tweet'].apply(remove_unnecessary_stuff)


# Then we'll apply the lemmatizing function to the tweets:

df['tweet'] = df['tweet'].apply(lemmatizer)


# Finally, we make all tweets lowercase and remove all Danish stopwords from them.
# To do so, we first need to import a set of Danish stopwords from the spaCy library.
# The tweets are lowercased BEFORE the stopword check: the comparison is
# case-sensitive, so capitalised stopwords (e.g. at the start of a sentence) would
# otherwise not be recognised and would stay in the corpus.

from spacy.lang.da.stop_words import STOP_WORDS

df['tweet'] = df['tweet'].apply(
    lambda x: ' '.join(word for word in x.lower().split() if word not in STOP_WORDS)
)

In [8]:
# This will save the new, cleaned dataset as a csv file

df.to_csv('cleaned_twitter_corpus_lemmatized_without_stopwords.csv')

# 3. Creating a pipeline

In [None]:
# We install and import the necessary libraries needed to make our pipeline

%pip install tweetopic
%pip install pandas
%pip install numpy

from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

In [98]:
# We make a vectorizer to turn our corpus of tweets into a doc-term matrix
# min_df=5 means that words that appear in fewer than 5 tweets will be ignored
# max_df=0.5 means that words that appear in more than 50 % of the tweets will be ignored

from tweetopic import DMM
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=5, max_df=0.5)


# We specify that we want a topic model with 6 components (topics).
# NOTE(review): alpha and beta are passed straight to tweetopic's DMM; see its
# documentation for their exact meaning before tuning them.

dmm = DMM(n_components=6, alpha=0.1, beta=0.1)

In [99]:
# Here we create the actual topic pipeline

# Step 1 turns the tweets into a doc-term matrix, step 2 fits the topic model on it
pipeline = Pipeline(steps=[("vectorizer", vectorizer), ("dmm", dmm)])

In [68]:
# We load the cleaned twitter corpus and save it in the variable "df".
# The csv file was saved together with the dataframe index, so the first column is
# read back as the index instead of showing up as an extra "Unnamed: 0" column:

df = pd.read_csv('cleaned_twitter_corpus_lemmatized_without_stopwords.csv', index_col=0)

In [69]:
# Here we first convert the dataframe column with the tweets into a list,
# then we convert the list into a NumPy array, which is what we pass to the pipeline
# (instead of a list or a dataframe column).
# Tweets that ended up empty after cleaning are read back from the csv file as NaN.
# np.array() would silently turn those into the literal string "nan" and thereby add
# a bogus "nan" word to the corpus, so we turn them back into empty strings first:

tweets = np.array(df['tweet'].fillna('').astype(str).tolist())

In [None]:
# Now we can run the actual pipeline on our cleaned tweets:

pipeline.fit(tweets)

# 4. Visualization of topic modelling


In [None]:
# In order to be able to visualize our topic modelling via topic-wizard, we must
# install and import a specific, pinned version (1.10.1) of the SciPy library:

!pip install scipy==1.10.1
import scipy

In [None]:
# We install and import the topic-wizard library

%pip install topic-wizard
import topicwizard

In [None]:
# Finally, we're now able to visualize our topic modelling:

topicwizard.visualize(pipeline=pipeline, corpus=tweets,port=1131)


# Notice that this code should be run in Anaconda as the visualization 
# will likely not work when the code is run in Ucloud or Google Colab.