## Preprocess the new MEMO following the exact same pre-processing steps

Load Libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")

#essentials
import pandas as pd

#for cleaning and text-preprocessing
import nltk
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

Read in the data

In [None]:
#load the tab-separated MEMO export
df = pd.read_csv(
    '/Users/carljohanson/Desktop/Speciale - Code Project/Code/data/InfoDesk/daily_MEMO_data',
    delimiter='\t',
)  #adjust path to your file

#give the three source columns generic names
source_columns = ['column1', 'column2', 'column3']
df.columns = source_columns

#join the three columns into one space-separated text column
df['description'] = df['column1'] + ' ' + df['column2'] + ' ' + df['column3']

#keep only the combined column
df = df.drop(columns=source_columns)

#get shape of dataframe
df.shape

Corpus cleaning

In [None]:
#regex matching everything from a '<' up to the next '>' (non-greedy), i.e. markup tags and their attributes
pattern = re.compile(r'<.*?>')

#drop rows without a description
df = df.dropna(subset=['description'])

#replace every tag in the text with a single space
df['description'] = df['description'].map(lambda text: pattern.sub(' ', text))

#drop exact duplicate rows
df = df.drop_duplicates()

print(df.head(20))

Corpus cleaning & preprocessing

In [None]:
#clean the text

#terms removed from the description column, applied in this order (every occurrence is removed,
#also inside longer words). 'Inc' is listed twice so it is applied again after 'please' is removed.
#Add more at will if necessary
unwanted_terms = ['PRNewswire', 'NASDAQ', 'draft', 'EINPresswire', 'ResearchAndMarkets', 'Inc', 'please', 'Inc']

description = df['description']
for term in unwanted_terms:
    description = description.str.replace(term, '')
df['description'] = description

def preprocess_text(text):
    """Normalize a raw description string.

    Steps, in order: replace every non-word character with a space, delete
    digit runs, replace a single ASCII letter surrounded by whitespace with
    one space, lower-case the text, and collapse whitespace runs to one
    space. Leading/trailing whitespace is collapsed but not stripped.
    """
    substitutions = (
        (r'\W', ' '),               #punctuation and special characters
        (r'\d+', ''),               #numbers
        (r'\s+[a-zA-Z]\s+', ' '),   #single characters
    )
    for regex, replacement in substitutions:
        text = re.sub(regex, replacement, text)

    #lower-case, then squeeze repeated whitespace
    return re.sub(r'\s+', ' ', text.lower())

#run every description through preprocess_text
df['description'] = df['description'].map(preprocess_text)

#drop exact duplicate rows, then rows without a description
df = df.drop_duplicates().dropna(subset=['description'])

#print the first 20 rows
df.head(20)

In [None]:
#keep only rows/documents with at least 5 whitespace-separated words
word_counts = df['description'].str.split().str.len()
df = df[word_counts >= 5]

#print the first 20 rows
print(df.head(20))

Translate to English for uniformity

In [None]:
from langdetect import detect, DetectorFactory
from deep_translator import GoogleTranslator

#make language detection deterministic
DetectorFactory.seed = 0

#define language detection function
def detect_language(text):
    """Return the language code that langdetect's `detect` reports for `text`.

    Returns the string 'unknown' when `detect` raises, so one undetectable
    row does not abort the whole pass over the column.
    """
    try:
        return detect(text)
    except Exception:
        #Exception instead of a bare except: KeyboardInterrupt and SystemExit
        #still propagate, so a long detection run can be interrupted
        return 'unknown'

#define fallback language
#NOTE(review): not referenced in this cell - confirm whether it is still needed
fallback_language = 'en'

#define placeholder string
placeholder = 'XXXXX'

#langdetect codes that GoogleTranslator does not accept as-is, mapped to language names it accepts
langdetect_to_google = {
    'zh-cn': 'chinese (simplified)',
    'zh-tw': 'chinese (traditional)',
    'he': 'hebrew',
}

#detect language of 'description' column and translate text to English if necessary
df['language'] = df['description'].apply(detect_language)
#swap the company name for a placeholder before translating; it is swapped back afterwards
df['description_no_nn'] = df['description'].str.replace('novo nordisk', placeholder)
df['translated_text'] = ''

for index, row in df.iterrows():
    #use the mapped language name where one exists, otherwise the detected code itself
    lang = langdetect_to_google.get(row['language'], row['language'])

    if lang != 'en' and lang != 'unknown':
        try:
            translation = GoogleTranslator(source=lang, target='en').translate(row['description_no_nn'])
            translation = translation.replace(placeholder, 'novo nordisk')
        except Exception as e:
            #best effort: report the failure and leave 'translated_text' as the empty string for this row
            print(f"Translation failed for index {index} with source language {lang}. Error: {e}")
            continue
        df.at[index, 'translated_text'] = translation
    else:
        #English or undetected text is kept untranslated, with the placeholder swapped back
        df.at[index, 'translated_text'] = row['description_no_nn'].replace(placeholder, 'novo nordisk')

#print the first 20 rows
df.head(20)

In [None]:
#drop exact duplicate rows
df = df.drop_duplicates()

#turn every cell equal to 'unknown' (e.g. an undetected language) into NaN,
#then drop every row that contains a NaN in any column
df = df.replace('unknown', np.nan).dropna()

#print the first 20 rows
df.head(20)

In [None]:
#show the (rows, columns) shape of the dataframe at this point
df.shape

In [None]:
sentences = df['translated_text']

#join the documents with a space: with an empty separator the last word of one
#document fuses with the first word of the next and distorts the word counts
all_words = " ".join(str(i) for i in sentences)

# Import the wordcloud library
from wordcloud import WordCloud

# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')

# Generate a word cloud
wordcloud.generate(all_words)

# Visualize the word cloud
wordcloud.to_image()

Text preprocessing & tokenization

In [None]:
import string

#tokenization of the clean and translated text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

#tokenizer; note that this rebinds the name preprocess_text used by the earlier cleaning step
def preprocess_text(text):
    """Tokenize the lower-cased `text` and return the lemmatized tokens that pass the filter.

    A token is dropped when it is an English stopword, contains an
    apostrophe, or occurs as a substring of `string.punctuation`.
    """
    kept = []
    for token in nltk.word_tokenize(text.lower()):
        if token in stop_words or "'" in token or token in string.punctuation:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return kept

#tokenize every translated document
df['tokens_text'] = df['translated_text'].map(preprocess_text)

#show the tokens
df['tokens_text'].head(50)

Text preprocessing & tokenization with bigrams and trigrams

In [None]:
from gensim.models import phrases
from gensim.models.phrases import Phrases, Phraser

#text with bigrams, trigrams
#settings shared by the bigram and the trigram model (English connector words)
connector_words = phrases.ENGLISH_CONNECTOR_WORDS
phrase_settings = {'min_count': 2, 'threshold': 10, 'connector_words': connector_words}

#NOTE(review): both phrase models are fit on the documents in this dataframe - confirm this
#matches how the tokens behind the saved dictionary were produced

#learn bigrams from the tokenized documents
bigram = Phrases(df['tokens_text'], **phrase_settings)
bigram_phraser = Phraser(bigram)

#learn trigrams from the bigram-merged documents
trigram = Phrases(bigram_phraser[df['tokens_text']], **phrase_settings)
trigram_phraser = Phraser(trigram)

#merge detected phrases in one tokenized document
def preprocess_text_bigram_trigram(tokens):
    """Return `tokens` after applying the bigram phraser and then the trigram phraser to its output."""
    return trigram_phraser[bigram_phraser[tokens]]

#merge bigrams and trigrams in every tokenized document
df['preprocessed_text'] = df['tokens_text'].map(preprocess_text_bigram_trigram)

#show the tokens
df['preprocessed_text'].head(50)

In [None]:
#drop rows whose token list is empty
token_lists_as_text = df['preprocessed_text'].astype(str)
df = df[token_lists_as_text != '[]']

#keep only rows with more than 5 tokens
token_counts = df['preprocessed_text'].map(len)
df = df[token_counts > 5]

#keep only tokens longer than 3 characters
#(this runs after the row filter, so a row can end up with 5 or fewer tokens)
df['preprocessed_text'] = df['preprocessed_text'].map(
    lambda tokens: [token for token in tokens if len(token) > 3]
)

#show the shape of the dataset
df.shape

Load the LDA Model

In [None]:
from gensim.corpora import Dictionary
from gensim import models

#location of the saved LDA model
temp_file = '/Users/carljohanson/Desktop/Speciale - Code Project/Code/Models/lda_model'

#restore the model from disk
lda_model = models.LdaModel.load(temp_file)

Create a corpus from the new documents using the saved dictionary, pass it through the LDA model, and update the model with it

In [None]:
#one token list per document, mirroring the structure of the preprocessed text data
doc_list = list(df['preprocessed_text'])

#dictionary stored alongside the saved LDA model
dictionary = Dictionary.load('/Users/carljohanson/Desktop/Speciale - Code Project/Code/Models/lda_model.id2word')

#bag-of-words representation of every document, built with the loaded dictionary
corpus = list(map(dictionary.doc2bow, doc_list))

#topic-distribution view of the bag-of-words corpus
lda_corpus = lda_model[corpus]

#continue training the lda model on the new corpus
lda_model.update(corpus, passes=10, iterations=100)

Evaluate the LDA model

In [None]:
#evaluate using coherence score and perplexity
from gensim.models import CoherenceModel

#compute Coherence Score
#pass the bag-of-words corpus, not the topic-transformed lda_corpus: the `corpus` argument expects
#bag-of-words documents (the 'c_v' measure itself is computed from `texts`)
cm = CoherenceModel(model=lda_model, corpus=corpus, texts=doc_list, dictionary=dictionary, coherence='c_v')
coherence = cm.get_coherence()
print('\nCoherence Score: ', coherence)

#compute Perplexity
#log_perplexity returns the per-word likelihood bound (higher is better), not the perplexity itself;
#the perplexity is 2^(-bound)
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)  #The higher, the better.
print('\nPerplexity: ', 2 ** (-per_word_bound))  #The lower, the better.

Predict the dominant topic for each document and prepare the data for ML classifier

In [None]:
#topic distribution of every document in the corpus (minimum_probability=0.0 keeps all topics)
topic_distributions = [
    lda_model.get_document_topics(doc, minimum_probability=0.0)
    for doc in corpus
]

#dominant topic = id of the topic with the highest probability; stored as a new dataframe column
df['dominant_topic'] = [
    max(distribution, key=lambda pair: pair[1])[0]
    for distribution in topic_distributions
]

In [None]:
#export topic proportions and labels to ML classifier

#per document: the probabilities from its topic distribution, in the order the model returns them
topic_proportions = [
    [probability for _, probability in lda_model.get_document_topics(doc, minimum_probability=0.0)]
    for doc in corpus
]

#add topic proportions to the dataframe
df['topic_proportions'] = topic_proportions

In [None]:
#show the topic proportions of the first 50 documents
df['topic_proportions'].head(50)

In [None]:
import joblib

#feature array with one row of topic proportions per document
X = np.array(list(df['topic_proportions']))

#load the saved random forest model
rf = joblib.load('/Users/carljohanson/Desktop/Speciale - Code Project/Code/Models/rf_model')

#predict the topic for each document
predict_topic = rf.predict(X)

#display the predicted topic
predict_topic

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split

#extract feature vectors and labels from the DataFrame
X = np.array(df['topic_proportions'].tolist())  #features (document-topic distributions)
#labels are the LDA dominant topics computed above
#NOTE(review): confirm they share a label space with the classes rf predicts
y = df['dominant_topic'].values

#split the dataset 80:20; rf is not fitted here, only the 20% test part is scored
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#predict the test set
y_pred = rf.predict(X_test)

#compute each metric once and reuse it for printing and plotting
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

#evaluate the model (the accuracy is printed once)
print(conf_matrix)
print(accuracy)
print(classification_report(y_test, y_pred))

#convert to a picture
plt.figure(figsize=(10, 10))
sns.heatmap(conf_matrix, annot=True, fmt=".0f", linewidths=.5, square=True, cmap='Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(accuracy)
plt.title(all_sample_title, size=15);

Create dataframe with the predicted topic and export to csv file

In [None]:
#attach the predicted topic as a new column
df = df.assign(topic=predict_topic)

#description and predicted topic of the first 50 rows
#NOTE(review): only 50 rows are kept and later exported - confirm this limit is intended
final_df = df.loc[:, ['description', 'topic']].head(50)

In [None]:
#export the dataframe to a csv file (index=False leaves the row index out of the file)
final_df.to_csv('/Users/carljohanson/Desktop/Speciale - Code Project/Code/Results/prepared_memo.csv', index=False)

In [None]:
#display the exported dataframe
final_df