In [1]:
import spacy
import re
import pandas as pd
import html


# One small spaCy pipeline per comment language, loaded in order: English, then Dutch.
nlp_en, nlp_nl = (
    spacy.load(model_name)
    for model_name in ("en_core_web_sm", "nl_core_news_sm")
)

# Making a function for text preprocessing
def preprocess_text(text, nlp_model):
    """Clean a raw comment and return its lemmas as one space-separated string.

    Steps, in order:
      1. Replace HTML tags (``<a href=...>``, ``<br>``, ...) with a space.
      2. Decode HTML entities (``&#39;`` -> ``'``, ``&lt;`` -> ``<``).
      3. Lowercase the text and pass it to ``nlp_model``.
      4. Keep the lemma of every token that is neither punctuation nor
         whitespace.

    Args:
        text: Raw comment text. Any non-string value (for example a missing
            value) is treated as an empty comment.
        nlp_model: Callable that takes a string and returns an iterable of
            tokens exposing ``lemma_``, ``is_punct`` and ``is_space``
            (e.g. a loaded spaCy pipeline).

    Returns:
        The cleaned, lemmatized text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):  # Handle non-string values
        return ""

    # Strip markup BEFORE decoding entities. Decoding first turns escaped user
    # text such as "a &lt; b and c &gt; d" into "a < b and c > d", and the tag
    # pattern would then delete "< b and c >" as if it were markup.
    # Tags become a space (not "") so "hello<br>world" does not fuse into
    # "helloworld"; the surplus whitespace is dropped by tokenization below.
    text = re.sub(r'<[^>]*>', ' ', text)

    # Decode entities only once the real markup is gone.
    text = html.unescape(text)

    # Tokenize and lemmatize; the text is lowercased before processing.
    doc = nlp_model(text.lower())
    tokens = [
        token.lemma_ for token in doc
        if not token.is_punct and not token.is_space
    ]

    # Join the lemmas, collapse any remaining whitespace runs, trim the ends.
    return re.sub(r'\s+', ' ', " ".join(tokens)).strip()

# Dutch comments: read the raw export, add the cleaned column, save, preview.
df_dutch = pd.read_csv("dutch_comments_full.csv")
df_dutch['Cleaned Comment Text'] = df_dutch['Comment Text'].apply(
    preprocess_text, args=(nlp_nl,)
)
df_dutch.to_csv("dutch_comments_cleaned.csv", index=False)
print(df_dutch.head())

# English comments: the same steps, using the English pipeline.
df_english = pd.read_csv("english_comments_full.csv")
df_english['Cleaned Comment Text'] = df_english['Comment Text'].apply(
    preprocess_text, args=(nlp_en,)
)
df_english.to_csv("english_comments_cleaned.csv", index=False)
print("English comments cleaned and saved to 'english_comments_cleaned.csv'.")
print(df_english.head())



       Car Model Language     Video ID         Author          Published At  \
0  Tesla Model Y    Dutch  E5mfhe-Q6lE  @BartHuitsing  2023-06-26T11:38:09Z   
1  Tesla Model Y    Dutch  E5mfhe-Q6lE  @werner134897  2023-03-11T06:08:18Z   
2  Tesla Model Y    Dutch  E5mfhe-Q6lE    @ronnie9187  2023-01-17T15:07:13Z   
3  Tesla Model Y    Dutch  E5mfhe-Q6lE      @jote2275  2022-11-06T15:39:27Z   
4  Tesla Model Y    Dutch  E5mfhe-Q6lE      @Maszzmic  2022-01-16T20:44:26Z   

             Updated At  Like Count  \
0  2023-06-26T11:38:09Z           1   
1  2023-03-11T06:10:00Z           0   
2  2023-01-17T15:07:13Z           0   
3  2022-11-06T15:39:27Z           0   
4  2022-01-16T20:44:26Z           0   

                                        Comment Text  \
0  Eén van de belangrijkste voordelen van de Y in...   
1  Nadruk op hogere prijs en gewicht is mijns inz...   
2  Het valt me op dat er lakschade aan de voorkan...   
3  Electrische auto&#39;s zijn enorm groot en duu...   
4  <a href