In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import spacy

import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

# Read the company list into a DataFrame. The bare name on the last line makes
# the notebook render the frame as this cell's output.
companies_csv = 'data/companies.csv'
df = pd.read_csv(companies_csv)

df

In [None]:
# Fetch the company's Trustpilot page and print the text of the first matching
# headline (<h2>) and body paragraph (<p>).
url = "https://dk.trustpilot.com/review/www.kmd.dk"

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# parse failure further down.
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# The class filter is passed as an explicit {"class": ...} dict; a bare
# {"..."} literal is a set, which only matched by accident.
# NOTE(review): these class names look build-generated (hash suffixes) and will
# presumably change when the site is redeployed — confirm before relying on them.
header = soup.find("h2", {"class": "typography_heading-s__f7029 typography_appearance-default__AAY17"})
body = soup.find("p", {"class": "typography_body-l__KUYFJ typography_appearance-default__AAY17 typography_color-black__5LYEn"})

# find() returns None when nothing matches; fail with a clear message rather
# than an AttributeError on `.text`.
if header is None or body is None:
    raise RuntimeError(
        f"Expected headline/body elements were not found on {url}; "
        "the page markup may have changed."
    )

text = header.text
bodyText = body.text

print(text)
print(bodyText)

In [None]:
# Scrape the review cards from the company's Trustpilot page and collect the
# text of the first paragraph of each card in `reviews`.
url = "https://dk.trustpilot.com/review/www.kmd.dk"

# Bound the request time and fail loudly on HTTP errors so an error page is
# never parsed as if it were the review list.
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

review_divs = soup.find_all("div", {"class": "styles_cardWrapper__LcCPA styles_show__HUXRb styles_reviewCard__9HxJJ"})

reviews = []

for div in review_divs:
    # The first <p> inside the card is taken as the review text.
    review_paragraph = div.find("p")
    # Cards without any paragraph are skipped instead of crashing on `.text`.
    if review_paragraph is None:
        continue
    reviews.append(review_paragraph.text)

# Show the second review as a quick sanity check, if there is one.
if len(reviews) > 1:
    print(reviews[1])
else:
    print(f"Only {len(reviews)} review(s) found at {url}")

In [None]:
def proprocess(text):
    """Placeholder for the text-preprocessing step.

    The `text` argument is currently ignored and an empty string is always
    returned.
    """
    return ""
    

## Preprocessing

### Tokenization

In [None]:
from nltk.corpus import stopwords


# Danish spaCy pipeline; the cells below call `nlp` on each review.
nlp = spacy.load("da_core_news_sm")

# NLTK's Danish stop-word list as a set for fast membership tests.
stop_words = set(stopwords.words('danish'))

# Tokenize every review and drop the stop words. The per-review results are
# collected in lists so that they are not overwritten on each iteration.
# The unused spaCy parse of each review was removed from this loop.
tokenized_reviews = []
filtered_reviews = []
for review_text in reviews:
    # language="danish" selects the Danish Punkt sentence model instead of the
    # English default.
    tokens = word_tokenize(review_text, language="danish")
    # Stop-word matching is case-insensitive; the kept tokens retain their case.
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    tokenized_reviews.append(tokens)
    filtered_reviews.append(filtered_tokens)

In [None]:
# For each review, print its verb lemmas, its noun chunks and its named
# entities (entity text followed by the entity label).
for review in reviews:
    doc = nlp(review)

    verb_lemmas = []
    for token in doc:
        if token.pos_ == "VERB":
            verb_lemmas.append(token.lemma_)
    print("Verbs:", verb_lemmas)

    noun_phrases = []
    for chunk in doc.noun_chunks:
        noun_phrases.append(chunk.text)
    print("Noun phrases:", noun_phrases)

    for ent in doc.ents:
        print(ent.text, ent.label_)

### Lemmatization

In [None]:
# For each review, print its lower-cased tokens followed by its lower-cased
# lemmas (stop words excluded from the lemmas) so the two can be compared.
for review in reviews:
    doc = nlp(review)

    # Tokenize the current review inside the loop. Reading a `tokens` variable
    # left over from an earlier cell would print the same (last) review's
    # tokens on every iteration and fail if that cell had not been run.
    tokens = word_tokenize(review, language="danish")

    # Lemmatization reduces each word to its dictionary form.
    lemmas = [token.lemma_ for token in doc if not token.is_stop]

    tokens_lower = [token.lower() for token in tokens]
    lemmas_lower = [lemma.lower() for lemma in lemmas]

    print(tokens_lower)
    print("\n")
    print(lemmas_lower)

### Model: lexicon-based sentiment classification

In [None]:
import spacy
from spacy.lang.da.stop_words import STOP_WORDS
import csv

# Danish spaCy pipeline used by the classifier defined below.
nlp = spacy.load("da_core_news_sm")

# Polarity lexicon: first CSV column -> fifth CSV column parsed as a float.
# A key that appears on several rows keeps the value from its last row.
dsl_lexicon = {}
with open('data/2_headword_headword_polarity.csv', encoding='utf-8') as lexicon_file:
    dsl_lexicon.update(
        (record[0], float(record[4])) for record in csv.reader(lexicon_file)
    )

def classify_review(review):
    """Label a review as "positive", "negative" or "neutral".

    The review is run through the module-level spaCy pipeline `nlp`. The
    lower-cased lemma of every token not flagged as a stop word is looked up
    in `dsl_lexicon` (0 when absent) and the polarities are summed. The lemma
    list and the summed score are printed before the label is returned.

    Returns "positive" for a sum above zero, "negative" for a sum below zero
    and "neutral" otherwise.
    """
    lemmas = []
    for token in nlp(review):
        if token.is_stop:
            continue
        lemmas.append(token.lemma_.lower())
    print(lemmas)

    score = sum(float(dsl_lexicon.get(lemma, 0)) for lemma in lemmas)
    print(score)

    if score > 0:
        return "positive"
    if score < 0:
        return "negative"
    return "neutral"

# Classify every scraped review and print the label followed by the review text.
for review in reviews:
    print(classify_review(review))
    print(f'{review} \n ')
    


In [None]:
def check_word_polarity(lexicon_path='data/2_headword_headword_polarity.csv', words=("løn", "stjerne")):
    """Print the lexicon polarity of a few probe words as a spot-check.

    Args:
        lexicon_path: CSV file to read. The first column of each row is used
            as the key and the fifth column is parsed as a float polarity.
        words: Iterable of words to look up, in the order they are printed.

    Prints one line per word: its polarity, or 0 when the word is not in the
    lexicon. Returns None.

    Raises:
        OSError: if the file cannot be opened.
        ValueError / IndexError: if a row has a non-numeric fifth column or
            fewer than five columns.
    """
    # Load the lexicon from the CSV file; a key that appears on several rows
    # keeps the value from its last row.
    dsl_lexicon = {}
    with open(lexicon_path, encoding='utf-8') as f:
        for row in csv.reader(f):
            dsl_lexicon[row[0]] = float(row[4])

    # Probe the lexicon; unknown words fall back to 0.
    for word in words:
        print(dsl_lexicon.get(word, 0))
    
# Run the lexicon spot-check; it prints the polarity found for each probe word.
check_word_polarity()