## Extract keywords and separate tokens

In [None]:
import os
from multi_rake import Rake
import pandas as pd
import time
import lemmy

# All relative paths below are resolved against this working folder.
os.chdir("/folder")

# Source CSV with the articles to process.
csv_input_file_path = "./input.csv"

# Intermediate CSV: written by the keyword-extraction step and read back by
# the lemmatisation step, so the "output" and "input" names share one path.
csv_output_file_path_pre_lemmy_ddmmyy_to_ddmmyy = (
    csv_input_file_path_pre_lemmy_ddmmyy_to_ddmmyy
) = "./output_temp.csv"

# Final CSV holding the lemmatised keywords and words.
csv_output_file_path_post_lemmy_ddmmyy_to_ddmmyy = "./output.csv"

# Number of keyword-phrase columns and single-word columns written per article.
MAX_KEYWORDS = 20
MAX_WORDS = 40

for file in [csv_input_file_path]:
    print(f"Processing file: {file}")
    start_time = time.time()

    # ---- Step 1: RAKE keyword extraction -------------------------------
    rake = Rake(
        min_chars=3,
        max_words=2,
        min_freq=1,
        language_code=None,
        stopwords=None,
        lang_detect_threshold=50,
        max_words_unknown_lang=2,
        generated_stopwords_percentile=80,
        generated_stopwords_max_len=3,
        generated_stopwords_min_freq=2,
    )

    # Read the file of the current iteration (the loop variable), not the
    # module-level path, so the loop keeps working with more than one file.
    df = pd.read_csv(file, encoding="utf-8", sep=";")
    output_data = []
    for _, row in df.iterrows():
        # The first column is used as the article ID, the seventh as its text.
        article_id = row.iloc[0]
        article_text = str(row.iloc[6])
        try:
            keywords = rake.apply(article_text)
        except Exception as e:
            # Only rows failing with an "invalid UTF-8" error are skipped;
            # anything else is a real failure and is re-raised.
            if "invalid UTF-8" in str(e):
                print(f"Skipping row {article_id} due to UTF-8 encoding error: {e}")
                continue
            raise

        # Keep the first MAX_KEYWORDS phrases and pad to a fixed width.
        top_keywords = [kw[0] for kw in keywords[:MAX_KEYWORDS]]
        top_keywords += [""] * (MAX_KEYWORDS - len(top_keywords))

        # Split multi-word phrases into single words. A dict (instead of a
        # set) de-duplicates while keeping first-seen order, so the Word_*
        # columns - and which words survive the MAX_WORDS cut - are the same
        # on every run.
        unique_words = {}
        for kw in top_keywords:
            if " " in kw:
                unique_words.update(dict.fromkeys(kw.split()))
        individual_words = list(unique_words)[:MAX_WORDS]
        individual_words += [""] * (MAX_WORDS - len(individual_words))

        output_data.append([article_id] + top_keywords + individual_words + [article_text])

    keyword_columns = [f"Keyword_{i+1}" for i in range(MAX_KEYWORDS)]
    word_columns = [f"Word_{i+1}" for i in range(MAX_WORDS)]
    output_df = pd.DataFrame(output_data, columns=["ID"] + keyword_columns + word_columns + ["Article_text"])
    output_df.to_csv(csv_output_file_path_pre_lemmy_ddmmyy_to_ddmmyy, mode='w', sep=";", index=False)
    # Report the rows actually written; skipped rows are not in output_data.
    print(f"Finalised extracting keywords in {len(output_data)} out of {df.shape[0]} articles")

    # ---- Step 2: lemmatise the extracted keywords and words -------------
    lemmatizer = lemmy.load("da")

    # dtype=str keeps every cell as text; empty cells are read as NaN.
    df = pd.read_csv(csv_input_file_path_pre_lemmy_ddmmyy_to_ddmmyy, delimiter=";", dtype=str)

    def lemmatize_phrase(phrase):
        """Return ``phrase`` with each whitespace-separated word lemmatised.

        Missing values (NaN) are returned unchanged. For every word the first
        candidate returned by the lemmatizer is used.
        """
        if pd.notna(phrase):
            words = phrase.strip('"').split()
            # NOTE(review): the empty first argument is presumably lemmy's
            # "word class unknown" marker - confirm against the lemmy API.
            lemmatized_words = [lemmatizer.lemmatize("", word)[0] for word in words]
            return " ".join(lemmatized_words)
        return phrase

    # Lemmatise every column except the ID (first column) and the raw text.
    for col in df.columns[1:]:
        if col != "Article_text":
            df[col] = df[col].apply(lemmatize_phrase)
    df.to_csv(csv_output_file_path_post_lemmy_ddmmyy_to_ddmmyy, sep=";", index=False)
    print(f"Finalised lemmatizing {df.shape[0]} out of {df.shape[0]} rows")

    end_time = time.time()
    runtime = end_time - start_time

print(f"Runtime: {runtime:.2f} seconds")


## Check for target words

In [None]:
import pandas as pd
import time
import re
import os

# Terms searched for in every article. They are kept as the keys of a dict
# (all values are None) so the iteration order - and with it the order of the
# generated check_* columns - follows the order written here.
targets = dict.fromkeys(
    [
        "rusland",
        "putin",
        "kina",
        "jinping",
        "klimaforandring",
        "klima",
        "indvandrer",
        "indvandring",
        "udlænding",
        "asylansøger",
        "muslim",
        "islam",
        "mette frederiksen",
        "lars løkke",
        "statsminister",
        "enhedslisten",
        "sf",
        "radikale venstre",
        "socialdemo",
        "venstre",
        "konservativ",
        "liberal alliance",
        "dansk folkeparti",
        "danmarksdemokrat",
        "moderaterne",
        "ukrain",
    ]
)

# Relative paths below are resolved against this working folder.
os.chdir("/folder")

# CSV that is read and CSV that receives the per-article target flags.
input_file_path, output_file_path = "./input.csv", "./output.csv"

print(f"Processing file: {input_file_path}")

# Reference point for the runtime reported at the end of the cell.
start_time = time.time()

def target_in_text(text, target_words=None):
    """Flag which target terms occur in ``text``.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is replaced by a space, and runs of whitespace (including
    line breaks) are collapsed to single spaces before matching. Collapsing
    makes multi-word targets such as "radikale venstre" match regardless of
    how the words are separated, in line with the token-based "venstre" rule.

    Matching rules:
      * "venstre" must be a whole token that does not directly follow the
        token "radikale".
      * "sf" must be a whole token.
      * every other target is matched as a plain substring, so e.g. "ukrain"
        also matches "ukraine".

    Args:
        text: The article text (a ``str``).
        target_words: Optional iterable of lower-case target terms. Defaults
            to the module-level ``targets``.

    Returns:
        A dict mapping each target term to 1 (found) or 0 (not found), in the
        iteration order of the targets.
    """
    if target_words is None:
        target_words = targets

    text_clean = re.sub(r"[^\w\s]", " ", text.lower())
    tokens = text_clean.split()
    # Single-space-joined tokens: the text the substring rules are run on.
    text_norm = " ".join(tokens)

    check_ = {}
    for target in target_words:
        if target == "venstre":
            # Whole-token match, excluding the second word of "radikale venstre".
            check_[target] = int(
                any(
                    token == "venstre" and (i == 0 or tokens[i - 1] != "radikale")
                    for i, token in enumerate(tokens)
                )
            )
        elif target == "sf":
            # Word boundaries keep "sf" from matching inside longer words.
            check_[target] = 1 if re.search(r'\bsf\b', text_norm) else 0
        else:
            check_[target] = 1 if target in text_norm else 0

    return check_

# Load the articles from the semicolon-separated, double-quoted CSV.
df = pd.read_csv(input_file_path, sep=";", quotechar='"')
total_rows = df.shape[0]
output_data = []

for idx, row in df.iterrows():
    # The first column is used as the article ID, the seventh as its text.
    article_id, article_text = row.iloc[0], str(row.iloc[6])
    check_results = target_in_text(article_text)
    # One output row per article: the ID followed by one 0/1 flag per target.
    output_data.append([article_id, *check_results.values()])

    # Progress message whenever the index label is a multiple of 100.
    if idx % 100 == 0:
        print(f"Processed {idx+1} / {total_rows} rows")

# Column names follow the iteration order of `targets`, which is also the
# order in which target_in_text fills its result dict.
header = ["ID"] + [f"check_{t}" for t in targets]
output_df = pd.DataFrame(output_data, columns=header)
output_df.to_csv(output_file_path, sep=";", index=False)

end_time = time.time()
runtime = end_time - start_time
print(f"Runtime: {runtime:.2f} seconds")
print(f"Output saved to {output_file_path}")


## Sentiment analysis

In [None]:
import pandas as pd
import time
import os
from sentida import Sentida

# Relative paths below are resolved against this working folder.
os.chdir("/folder")

# CSV that is read and CSV that receives one sentiment score per article.
input_file_path, output_file_path = "./input.csv", "./output.csv"

print(f"Processing file: {input_file_path}")

# Reference point for the progress and total-time messages.
start_time = time.time()

# Single shared Sentida instance used by sentiment_of_text.
SV = Sentida()

def sentiment_of_text(article_text):
    """Return the Sentida score of ``article_text``, or 0 when it cannot be scored.

    0 is returned for anything that is not a string, for empty or
    whitespace-only strings, and when the Sentida call raises (the error is
    printed together with the first 50 characters of the text). Otherwise the
    result of ``SV.sentida(..., output="mean", normal=True)`` is returned
    unchanged.
    """
    # Guard clauses: nothing to score.
    if not isinstance(article_text, str):
        return 0
    if article_text.strip() == "" or pd.isna(article_text):
        return 0

    try:
        return SV.sentida(text=article_text, output="mean", normal=True)
    except Exception as e:
        # Best effort: report the failing text and carry on with a score of 0.
        print(f"Error processing text: {article_text[:50]}... - {e}")
        return 0

# NOTE(review): this cell reads a comma-separated file, while the keyword and
# target-word cells read ';'-separated input - confirm the delimiter is right.
df = pd.read_csv(input_file_path, sep=",")

# Scores are streamed to disk row by row, so partial results survive a crash.
with open(output_file_path, 'w', encoding="utf-8") as output_file:
    output_file.write("ID;Score\n")

    for idx, row in df.iterrows():
        # The first column is used as the article ID, the seventh as its text.
        article_id = row.iloc[0]
        raw_text = row.iloc[6]
        # A missing cell is NaN; str(NaN) would be the literal text "nan" and
        # get scored like a real article. Pass an empty string instead so
        # sentiment_of_text returns its neutral 0.
        article_text = "" if pd.isna(raw_text) else str(raw_text)
        article_sentiment = sentiment_of_text(article_text)

        output_file.write(f"{article_id};{article_sentiment}\n")

        # Progress message whenever the index label is a positive multiple of 100.
        if idx % 100 == 0 and idx > 0:
            elapsed_time = time.time() - start_time
            time_per_row = elapsed_time / (idx + 1)
            print(f"Processed {idx + 1} rows in {elapsed_time:.2f} seconds. Time per row: {time_per_row:.4f} seconds")

# Report only after the file has been closed (and therefore flushed).
elapsed_time = time.time() - start_time
print(f"Total time: {elapsed_time:.2f} seconds")
print(f"Output saved to {output_file_path}")
