## Preprocessing of the InfoDesk dataset

Import Libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
# set the display options to show all the columns and rows of the dataframe
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import numpy as np
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt

Read the tab-separated dataset and concatenate its three columns into a single `description` column

In [None]:
# Load the tab-separated export into a dataframe.
# NOTE(review): read_csv treats the first line of the file as a header, and
# that header is overwritten just below -- confirm the file really starts
# with a header row, otherwise the first record is lost.
df = pd.read_csv(
    '/Users/carljohanson/Desktop/Speciale - Code Project/data/final_MEMO_data.csv',
    sep='\t',
)

# Give the three raw columns neutral names (this assumes exactly three columns)
df.columns = ['column1', 'column2', 'column3']

# Build one text field per row: the non-missing cells, joined with commas
df['description'] = df.apply(
    lambda row: ','.join(row.dropna().astype(str)),
    axis=1,
)

# Only the combined 'description' column is kept from here on
df = df.drop(columns=['column1', 'column2', 'column3'])

df

In [None]:
# Merge every document in 'description' into one space-separated string
text = ' '.join(map(str, df['description']))

# Configure the word cloud, then fit it on the merged text
wordcloud = WordCloud(
    background_color='white',
    width=800,
    height=800,
    min_font_size=10,
)
wordcloud.generate(text)

# Render the cloud on a square, axis-free figure
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

Clean the text: remove irrelevant information, punctuation, missing values, etc.

In [None]:
# Non-greedy pattern for anything that looks like an XML/HTML tag: '<' ... '>'
pattern = re.compile(r'<.*?>')

# Swap every tag for a single space so neighbouring words stay separated
df['description'] = df['description'].map(lambda raw: pattern.sub(' ', raw))

# Discard rows without a description, then rows that are exact duplicates
df = df.dropna(subset=['description']).drop_duplicates()

print(df.head(20))

In [None]:
# (rows, columns) of the dataframe at this point in the notebook
df.shape

In [None]:
# One long string holding all descriptions, separated by spaces
text = ' '.join(str(doc) for doc in df['description'])

# Word cloud settings: 800x800 canvas, white background, smallest font 10
wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    min_font_size=10,
)
wordcloud.generate(text)

# Draw the cloud without axes and without padding around the image
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Clean the text

# Strip newswire/boilerplate terms from the description column.
# The terms are matched as whole words (\b ... \b) so that ordinary words
# which merely contain one of them -- "Including", "Increase", "drafted",
# "pleased" -- are left intact instead of being cut into fragments.
# Matching is case-sensitive, exactly as the terms are spelled below.
boilerplate_terms = [
    'PRNewswire',
    'NASDAQ',
    'draft',
    'EINPresswire',
    'ResearchAndMarkets',
    'Inc',
    'please',
]
boilerplate_pattern = r'\b(?:' + '|'.join(map(re.escape, boilerplate_terms)) + r')\b'

# A single regex pass replaces the previous chain of substring replacements
df['description'] = df['description'].str.replace(boilerplate_pattern, '', regex=True)

def preprocess_text(text):
    """Normalise a raw document into lowercase, space-separated words.

    Steps, in order:
      1. punctuation, special characters and underscores become spaces;
      2. digits are deleted;
      3. stand-alone single ASCII letters are removed;
      4. the text is lowercased;
      5. whitespace runs collapse to one space and the ends are trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty if nothing but noise was present.
    """
    # \W alone keeps '_' (it counts as a word character), so it is listed
    # explicitly to really remove all punctuation.
    text = re.sub(r'[\W_]+', ' ', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove single characters. Lookarounds are used instead of consuming the
    # surrounding whitespace, so consecutive single letters ("a b c") and
    # letters at the very start or end of the text are all caught.
    text = re.sub(r'(?<!\S)[a-zA-Z](?!\S)', ' ', text)

    # Convert to lowercase
    text = text.lower()

    # Collapse extra spaces and trim the ends, so texts that differ only in
    # surrounding whitespace compare equal when duplicates are dropped.
    return re.sub(r'\s+', ' ', text).strip()

# Run every description through preprocess_text
df['description'] = df['description'].map(preprocess_text)

# Cleaning can make previously distinct rows identical, so de-duplicate
# again and then discard any row whose description is missing
df = df.drop_duplicates().dropna(subset=['description'])

df.head(20)

In [None]:
# (rows, columns) of the dataframe at this point in the notebook
df.shape

In [None]:
# Flatten the cleaned descriptions into a single space-delimited string
text = ' '.join(df['description'].astype(str))

# Set up the word cloud (white 800x800 canvas, fonts no smaller than 10)
# and fit it on the flattened text
wordcloud = WordCloud(
    min_font_size=10,
    background_color='white',
    height=800,
    width=800,
)
wordcloud.generate(text)

# Show the result as a bare image: no axes, no padding
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Keep only documents that contain at least 5 whitespace-separated words
word_counts = df['description'].map(lambda doc: len(doc.split()))
df = df[word_counts >= 5]

print(df.head(20))

In [None]:
# (rows, columns) of the dataframe at this point in the notebook
df.shape

Translate the text to English

In [None]:
from langdetect import detect, DetectorFactory
from deep_translator import GoogleTranslator

# Pin langdetect's seed so that repeated runs of the notebook assign the
# same language to the same text (makes language detection deterministic)
DetectorFactory.seed = 0

# Define language detection function
def detect_language(text):
    """Return the language code langdetect assigns to *text*.

    Args:
        text: The string whose language should be detected.

    Returns:
        The code returned by ``detect`` (e.g. ``'en'``), or the string
        ``'unknown'`` when detection raises an error.
    """
    try:
        return detect(text)
    except Exception:
        # Any detection error maps to 'unknown'. Unlike a bare ``except:``,
        # this lets KeyboardInterrupt / SystemExit propagate, so a long
        # ``.apply(detect_language)`` run can still be interrupted.
        return 'unknown'

# Define fallback language
# NOTE(review): not used anywhere in this cell -- kept in case another cell reads it.
fallback_language = 'en'

# Placeholder that shields the company name from being altered by the translator
placeholder = 'XXXXX'

# langdetect codes that GoogleTranslator does not accept as-is, mapped to the
# language *names* it does accept. Without an entry here the translator call
# fails and the row is left with an empty translation.
# NOTE(review): 'zh-tw' and 'he' are additions based on deep_translator's
# supported-language table -- confirm against the installed version.
translator_language_names = {
    'zh-cn': 'chinese (simplified)',
    'zh-tw': 'chinese (traditional)',
    'he': 'hebrew',
}

# Detect language of 'description' column and translate text to English if necessary
df['language'] = df['description'].apply(detect_language)
# regex=False: 'novo nordisk' is a literal string, not a pattern
df['description_no_nn'] = df['description'].str.replace('novo nordisk', placeholder, regex=False)
df['translated_text'] = ''

for index, row in df.iterrows():
    # Use the translator-friendly name when one is defined, else the raw code
    lang = translator_language_names.get(row['language'], row['language'])

    if lang == 'en' or lang == 'unknown':
        # Nothing to translate: just restore the company name
        df.at[index, 'translated_text'] = row['description_no_nn'].replace(placeholder, 'novo nordisk')
        continue

    try:
        translation = GoogleTranslator(source=lang, target='en').translate(row['description_no_nn'])
        translation = translation.replace(placeholder, 'novo nordisk')
    except Exception as e:
        # Best effort: report the failure and leave 'translated_text' empty for this row
        print(f"Translation failed for index {index} with source language {lang}. Error: {e}")
    else:
        df.at[index, 'translated_text'] = translation

In [None]:
# Remove exact duplicate rows
df = df.drop_duplicates()

# Turn every cell equal to 'unknown' (the value detect_language returns on
# failure) into NaN, then discard all rows that contain any NaN.
# NOTE(review): rows whose translation failed hold an empty string, not NaN,
# so they are not removed here -- confirm that is intended.
df = df.replace('unknown', np.nan).dropna()

df.head(20)

In [None]:
# (rows, columns) of the dataframe at this point in the notebook
df.shape

In [None]:
# Count how many rows were assigned each language code in the 'language' column
df['language'].value_counts()

In [None]:
# Run language detection again, this time on the translated text,
# and tabulate how many rows fall under each detected language
df['language_after'] = df['translated_text'].map(detect_language)
df['language_after'].value_counts()

In [None]:
# Keep only the rows whose ORIGINAL detected language is English.
# NOTE(review): the filter uses the pre-translation 'language' column, so
# every row that was translated above is discarded here. If translated rows
# are meant to be kept, filter on 'language_after' instead -- confirm intent.
is_english = df['language'] == 'en'
df = df.loc[is_english]

df['language_after'].value_counts()

In [None]:
# (rows, columns) of the dataframe at this point in the notebook
df.shape

In [None]:
sentences = df['translated_text']

# Join the documents with a space between them. Joining with an empty string
# would glue the last word of each document to the first word of the next
# one and put words in the cloud that do not exist in the data.
all_words = " ".join(str(i) for i in sentences)

# Import the wordcloud library
from wordcloud import WordCloud

# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')

# Generate a word cloud
wordcloud.generate(all_words)

# Visualize the word cloud (as the last expression of the cell, the image is displayed)
wordcloud.to_image()

Export the data

In [None]:
# Write the final dataframe to translated_text.csv (relative to the current
# working directory). With to_csv's defaults the index is written as the
# first, unnamed column.
df.to_csv("translated_text.csv")