# Text cleaning

In [2]:
import requests
import time
import tqdm
import pandas as pd
import os
import json
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import re
import nltk

# Importing DR and Berlingske texts and merging them into one

In [32]:
# Read each outlet's scraped articles and tag every row with the outlet it came from
data_DR = pd.read_csv('DR_full_data.csv').assign(source='dr.dk')

data_B = pd.read_csv('Berlingske_full_data.csv').assign(source='berlingske.dk')

In [33]:
 # Giving both frames the same column names (this only renames; no column is dropped here):
shared_column_names = {'Title': 'title', 'Date': 'date', 'Content':'article_text', 'URL':'url'}

data_B = data_B.rename(columns=shared_column_names)
data_DR = data_DR.rename(columns=shared_column_names)

**Converting the dates to be in the same format:**


*Berlingske*

In [34]:
# Converting Berlingske:
import pandas as pd

# Danish month names mapped to zero-padded month numbers
month_mapping = {
    'januar': '01', 'februar': '02', 'marts': '03', 'april': '04',
    'maj': '05', 'juni': '06', 'juli': '07', 'august': '08',
    'september': '09', 'oktober': '10', 'november': '11', 'december': '12'
}

# Step 1: Extract the day, month name and year parts into a scratch frame, so that
# data_B never carries temporary columns. The dot after "d" is escaped so it only
# matches a literal dot, and one-digit days are accepted as well (a pattern that
# requires exactly two digits leaves such dates unmatched, i.e. NaT further down).
date_parts = data_B['date'].str.extract(r'd\.\s*(\d{1,2})\.\s*(\w+)\s+(\d{4})')
date_parts.columns = ['day', 'month', 'year']

# Step 2: Zero-pad the day and convert the (lower-cased) month name to its number
date_parts['day'] = date_parts['day'].str.zfill(2)
date_parts['month'] = date_parts['month'].str.lower().map(month_mapping)

# Report dates that could not be parsed: they become NaT below and would otherwise
# disappear silently once rows with missing values are dropped.
unparsed_dates = date_parts.isna().any(axis=1) & data_B['date'].notna()
if unparsed_dates.any():
    print(f"Berlingske dates that could not be parsed: {int(unparsed_dates.sum())}")

# Step 3: Combine the parts into YYYY-MM-DD strings and convert them to datetime
data_B['date'] = pd.to_datetime(
    date_parts['year'] + '-' + date_parts['month'] + '-' + date_parts['day'],
    format='%Y-%m-%d',
)

In [35]:
# Making it datetime (values that cannot be parsed are coerced to NaT):
berlingske_dates = pd.to_datetime(data_B['date'], errors='coerce')
data_B['date'] = berlingske_dates

*DR*

In [36]:
# DR timestamps also carry a time part ("T..."), which gets in the way of the datetime
# conversion. Strip everything from the "T" onwards, then parse what is left;
# values that still cannot be parsed are coerced to NaT.
data_DR['date'] = pd.to_datetime(
    data_DR['date'].str.replace(r'T.*', '', regex=True),
    errors='coerce',
)

In [54]:
# Removing NAs: a row is dropped when ANY of its columns holds a missing value
data_DR, data_B = data_DR.dropna(), data_B.dropna()

In [55]:
# Stacking the DR rows on top of the Berlingske rows (row-wise concatenation):
frames_to_merge = [data_DR, data_B]
data = pd.concat(frames_to_merge, axis=0)

In [56]:
# Order the articles chronologically on the 'date' column, oldest first
data = data.sort_values(by='date', ascending=True)

In [57]:
# Writing the merged, sorted articles to one CSV file (the index is left out):
merged_csv_name = 'DK_full_data.csv'
data.to_csv(merged_csv_name, index=False)

# Cleaning

In [60]:
# Reading the merged DR + Berlingske file back in
full_data_csv = 'DK_full_data.csv'
data = pd.read_csv(full_data_csv)

In [61]:
# Show the loaded frame (the last expression of a cell is rendered as its output)
data

Unnamed: 0,url,article_text,date,source,title
0,https://www.berlingske.dk/business/mobilgigant...,Lykketræf: Den tyske industrigigant Siemens er...,2002-07-14,berlingske.dk,Mobilgiganten fra München
1,https://www.dr.dk/nyheder/penge/farvel-til-kfx...,"I dag må investorer, der er interesserede i de...",2005-10-03,dr.dk,
2,https://www.dr.dk/nyheder/penge/fald-i-omxc20-...,"Det var en sur sidste dag i ugen, hvor markede...",2005-10-21,dr.dk,
3,https://www.dr.dk/nyheder/penge/novo-var-lyset...,I dag var igen en forholdsvis hektisk dag på d...,2005-10-27,dr.dk,
4,https://www.dr.dk/nyheder/penge/tyve-stoerste-...,Der hersker mandag morgen optimisme på de euro...,2005-10-31,dr.dk,
...,...,...,...,...,...
12941,https://www.berlingske.dk/aktier/en-tastefejl-...,Flere og flere handler på aktiemarkedet bliver...,2024-08-12,berlingske.dk,En tastefejl udløste drastisk kursfald. Aktiem...
12942,https://www.berlingske.dk/business/business-ov...,"Her er de vigtigste Business-nyheder, som du s...",2024-08-13,berlingske.dk,Business-overblik: Sidste uges aktiepanik er f...
12943,https://www.berlingske.dk/oekonomi/groenne-tal...,"Aktiemarkedet kom skidt fra start i august, hv...",2024-08-15,berlingske.dk,Grønne tal og optimisme dominerer aktierne. Ek...
12944,https://www.berlingske.dk/business/bavarian-no...,Data fra et forsøg viser ifølge Bavarian Nordi...,2024-08-16,berlingske.dk,Bavarian Nordic vil have mpox-vaccine godkendt...


## What is your document?

Our document is the article as a whole: all text in the article, excluding image descriptions and author tags.

## Preprocessing
- Clean text: ignore/remove any unwanted characters: casing, HTML markup, non-words, etc. (maybe also emoticons?)
- Tokenization and stop-words
- Stemming and lemmatization

***Removing NA values*** on the column "article_text"

In [62]:
# Keep only rows that have an article text, then renumber the rows from 0
data = data.dropna(subset=['article_text']).reset_index(drop=True)

***Cleaning the text***

In [63]:
def cleaner(document):
    """Normalise one article text for the later processing steps.

    The text is lower-cased, every HTML tag is replaced by a single space, and
    every character that is not a word character, whitespace or one of
    ``& $ € %`` is deleted (deleted, not replaced by a space).

    Args:
        document: The raw article text as a string.

    Returns:
        The cleaned string.
    """
    lowered = document.lower()
    without_tags = re.sub(r'<[^>]*>', ' ', lowered)
    return re.sub(r'[^\w\s&$€%]', '', without_tags)

# Run the cleaner on every article text
data['article_text'] = data['article_text'].apply(cleaner)

# Checking for duplicates: a bare expression in the middle of a cell is never
# displayed, so the number of repeated texts is reported explicitly instead.
duplicate_mask = data['article_text'].duplicated()
print(f"Duplicated articles removed: {int(duplicate_mask.sum())}")

# Removing the duplicate articles (the first occurrence of each text is kept):
data = data.loc[~duplicate_mask].reset_index(drop=True)

***Tokenization***
- Splitting the articles into meaningful elements to prepare for analysis. In our case, we need to split the articles into words, as these are what will be used for classifying sentiments.

In [64]:
# Tokenise on whitespace: each cleaned article text becomes a list of tokens,
# stored in the new column "words"
article_tokens = data['article_text'].str.split()
data['words'] = article_tokens

***Removing stop words***
- These are words that occur very often and probably bear no useful information about the text

In [65]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = stopwords.words('danish')
# A set makes each membership test constant-time
stop_set = set(stop)

# Each cell of data['words'] is a list of tokens, so the filter has to run inside
# every list. Comparing whole lists against the stop-word list (as a plain
# comprehension over the column does) never matches and removes nothing.
data['words'] = data['words'].apply(
    lambda tokens: [token for token in tokens if token not in stop_set]
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/astakettel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


***Saving as a CSV file***

In [66]:
# Write the cleaned articles to disk without the index column
cleaned_csv_name = 'DK_cleaned.csv'
data.to_csv(cleaned_csv_name, index=False)

# Using the VADER lexicon for sentiment scores

We need to adjust the VADER sentiment analysis technique for Danish. The following is a step-by-step guide that I can also use to describe the process in the final paper. I start by downloading the VADER lexicon:

In [68]:
import nltk
# Fetch the 'vader_lexicon' resource through NLTK's downloader; the call's
# return value is the last expression and is therefore shown as the cell output.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/astakettel/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

### Step 2: Prepare the Danish Sentiment Lexicon
Then I want to convert the Danish sentiment lexicon to a format that is compatible with VADER: the VADER lexicon is a dictionary where each word is associated with a sentiment score.

I have downloaded the Danish sentiment lexicon as a CSV file from the GitHub repository: https://github.com/dsldk/danish-sentiment-lexicon
We need to clean the Danish sentiment lexicon to ensure that both the base words and their various forms are included in the lexicon.

In [75]:
import pandas as pd

# Loading the headword/polarity file. It is read with header=None, so the
# columns are named by hand right afterwards.
lexicon_path = '/Users/astakettel/Desktop/ISDS/GitHub/ISDS-sentiment-analysis/Danish sentiments/2_headword_headword_polarity.csv'
danish_lexicon_df = pd.read_csv(lexicon_path, header=None)
danish_lexicon_df.columns = ['word', 'column_2', 'part_of_speech', 'code', 'polarity', 'forms']

# Convert the polarity column to a numeric type; values that are not numbers become NaN
danish_lexicon_df['polarity'] = pd.to_numeric(danish_lexicon_df['polarity'], errors='coerce')

# Keep only the rows that have both a word and a valid polarity
has_word_and_polarity = danish_lexicon_df[['word', 'polarity']].notna().all(axis=1)
lexicon_cleaned = danish_lexicon_df[has_word_and_polarity]

#### Rescaling the Danish lexicon to match the VADER scores

The original scale of the sentiments in the Danish lexicon is: "-5" (maximum degree negative) | "-4" (very high degree negative) | "-3" (high degree negative) | "-2" (degree negative) | "-1" (low degree negative) | "1" (low degree positive) | "2" (degree positive) | "3" (high degree positive) | "4" (very high degree positive) | "5" (maximum degree positive)

Therefore, we need to rescale the Danish polarity to the VADER scale:

In [85]:
# Function to rescale Danish polarity to VADER scale
def rescale_polarity(danish_polarity, source_range=(-5, 5), target_range=(-4, 4)):
    """Linearly map a polarity score from one scale onto another.

    With the default ranges this converts a Danish lexicon score on the
    -5..+5 scale to the -4..+4 scale, i.e. -5 -> -4, 0 -> 0 and +5 -> +4.

    Args:
        danish_polarity: The score to convert (a number, or anything that
            supports the same arithmetic).
        source_range: ``(min, max)`` of the scale the score is given on.
        target_range: ``(min, max)`` of the scale to convert to.

    Returns:
        The score expressed on the target scale.

    Raises:
        ValueError: If the source range has zero width, because the
            conversion would divide by zero.
    """
    source_min, source_max = source_range
    target_min, target_max = target_range
    if source_max == source_min:
        raise ValueError("source_range must have a non-zero width")

    # Position of the score within the source scale, as a fraction between 0 and 1
    relative_position = (danish_polarity - source_min) / (source_max - source_min)
    vader_polarity = relative_position * (target_max - target_min) + target_min
    return vader_polarity

Then we make a dictionary to store all words - both the base form and the other forms of each word - and their associated polarity scores. The scores are rescaled to match the VADER scores, using the function we just defined.

In [86]:
# Dictionary holding every word (headword and inflected forms) with its rescaled score
danish_lexicon = {}

# Include all word forms and base words in the lexicon
for index, row in lexicon_cleaned.iterrows():
    # Rescale the Danish polarity to match VADER's scale
    vader_polarity = rescale_polarity(row['polarity'])

    # The headword comes first, followed by its inflected forms when the row has
    # any (the forms are assumed to be separated by semicolons)
    candidates = [row['word']]
    if pd.notna(row['forms']):
        candidates.extend(str(row['forms']).split(';'))

    for candidate in candidates:
        # Keys are stripped and lower-cased: the article texts are lower-cased by
        # the cleaner, so an entry with capitals or padding could never match.
        # Empty entries (e.g. from a trailing semicolon) are skipped.
        key = str(candidate).strip().lower()
        if key:
            # Every form gets the same rescaled score as its headword
            danish_lexicon[key] = vader_polarity

# Now, danish_lexicon contains the headword and all its forms with rescaled polarity scores

# Now, danish_lexicon contains the headword and all its forms with rescaled polarity scores

### Step 3 Modify VADER with the Danish Lexicon
Then we integrate the Danish lexicon into VADER

In [89]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Create the VADER analyser, then add every Danish entry to the lexicon it was
# created with: existing keys are overwritten, all other entries are kept.
analyser = SentimentIntensityAnalyzer()
for danish_word, danish_score in danish_lexicon.items():
    analyser.lexicon[danish_word] = danish_score

### Step 4: Performing the sentiment analysis

I start by defining a function that calculates the compound sentiment score for each row of text in a DataFrame using the polarity_scores() method.
Then I apply this function to each article in the dataset.

In [90]:
# Define a function to get the compound sentiment score
def get_sentiment(text):
    """Return the compound sentiment score of ``text``.

    Args:
        text: The text to score.

    Returns:
        The ``'compound'`` entry of the dictionary produced by
        ``analyser.polarity_scores``.
    """
    scores = analyser.polarity_scores(text)
    return scores['compound']

# Score every article text and store the result in a new 'sentiment' column
data['sentiment'] = data['article_text'].map(get_sentiment)

# Print the text and sentiment columns side by side as a sanity check
columns_to_show = ['article_text', 'sentiment']
print(data[columns_to_show])

                                            article_text  sentiment
0      lykketræf den tyske industrigigant siemens er ...     0.8225
1      i dag må investorer der er interesserede i de ...     0.7783
2      det var en sur sidste dag i ugen hvor markedet...    -0.9274
3      i dag var igen en forholdsvis hektisk dag på d...     0.9906
4      der hersker mandag morgen optimisme på de euro...     0.9274
...                                                  ...        ...
12814  flere og flere handler på aktiemarkedet bliver...     0.9964
12815  her er de vigtigste businessnyheder som du ska...    -0.9042
12816  aktiemarkedet kom skidt fra start i august hvo...     0.9866
12817  data fra et forsøg viser ifølge bavarian nordi...     0.9501
12818  landets revisorhuse kæmper hårdt om at vinde d...     0.9988

[12819 rows x 2 columns]


In VADER sentiment analysis, the compound score is a normalized, weighted composite score which represents the overall sentiment of a given text. It takes into account the positive, negative and neutral scores calculated by VADER and combines them into a single value, which provides the overall sentiment rating.

- The compound score is calculated as a weighted sum of the valence scores of each word in the text, normalized to fall within the -1 to +1 range. VADER uses heuristics to balance the impact of negation, punctuation (like !) and intensifiers (like "very")
- The compound score ranges from -1 to +1 

## Computing sentiment scores from -1 to 1 and binary 

In [92]:
# Label every article from its compound score: strictly above 0 is 'positive',
# everything else (including exactly 0 and missing scores) is 'negative'.
# This works on the whole column at once, so it does not depend on the row
# index running from 0 to len(data) - 1 and avoids a slow row-by-row loop.
is_positive = data['sentiment'] > 0
data['pos/neg'] = is_positive.map({True: 'positive', False: 'negative'})

## Saving as csv

In [95]:
# Write the articles with their sentiment score and label to disk, without the index column
sentiment_csv_path = '/Users/astakettel/Desktop/ISDS/GitHub/ISDS-sentiment-analysis/Webscraping/DK_clean_sent.csv'
data.to_csv(sentiment_csv_path, index=False)