In [3]:
import re

import numpy as np
import pandas as pd

# Define irrelevant keywords (Dutch + English)
irrelevant_keywords = [
    "great video", "enjoyed the video", "mat", "love this channel", 
    "amazing review", "brilliant review",  
    "prima review", "leuke video", "toffe video", "leuke review", 
    "presentator", "review", "reviews", "kanaal", "channel", "watson", "reviewed", "video"    
]

# Define product-related keywords (Dutch + English)
product_keywords = [
    "buy", "drive", "price", "design", "feature", "reliable", "fast", "comfortable", 
    "performance", "engine", "battery", "range", "mileage", "interior", "handling",  
    "prijs", "rijden", "kia", "mercedes", "benz", "chinees", "duits", 
    "smaak", "kopen", "gekocht"  
]

# Function to filter comments
def filter_comments_balanced(text):
    if not isinstance(text, str):  # Handle non-string values
        return True  # Keep rows with non-string values

    # Check for irrelevant and product-related keywords
    contains_irrelevant = any(keyword in text.lower() for keyword in irrelevant_keywords)
    contains_product = any(keyword in text.lower() for keyword in product_keywords)
    
    # Remove comment only if it's irrelevant and not product-related
    return not (contains_irrelevant and not contains_product)

# Load datasets
df_dutch = pd.read_csv("dutch_comments_cleaned.csv")
df_english = pd.read_csv("english_comments_cleaned.csv")

# Apply filtering to Dutch and English datasets
df_dutch = df_dutch[df_dutch['Cleaned Comment Text'].apply(filter_comments_balanced)]
df_english = df_english[df_english['Cleaned Comment Text'].apply(filter_comments_balanced)]

# Save the filtered datasets
df_dutch.to_csv("dutch_comments_filtered_combined.csv", index=False)
df_english.to_csv("english_comments_filtered_combined.csv", index=False)

# Display the first few rows of the filtered datasets
print("Filtered Dutch comments (balanced):")
print(df_dutch.head())

print("\nFiltered English comments (balanced):")
print(df_english.head())

Filtered Dutch comments (balanced):
       Car Model Language     Video ID         Author          Published At  \
0  Tesla Model Y    Dutch  E5mfhe-Q6lE  @BartHuitsing  2023-06-26T11:38:09Z   
1  Tesla Model Y    Dutch  E5mfhe-Q6lE  @werner134897  2023-03-11T06:08:18Z   
2  Tesla Model Y    Dutch  E5mfhe-Q6lE    @ronnie9187  2023-01-17T15:07:13Z   
3  Tesla Model Y    Dutch  E5mfhe-Q6lE      @jote2275  2022-11-06T15:39:27Z   
4  Tesla Model Y    Dutch  E5mfhe-Q6lE      @Maszzmic  2022-01-16T20:44:26Z   

             Updated At  Like Count  \
0  2023-06-26T11:38:09Z           1   
1  2023-03-11T06:10:00Z           0   
2  2023-01-17T15:07:13Z           0   
3  2022-11-06T15:39:27Z           0   
4  2022-01-16T20:44:26Z           0   

                                        Comment Text  \
0  Eén van de belangrijkste voordelen van de Y in...   
1  Nadruk op hogere prijs en gewicht is mijns inz...   
2  Het valt me op dat er lakschade aan de voorkan...   
3  Electrische auto&#39;s zijn

In [5]:
df_dutch.info()
df_english.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3344 entries, 0 to 3633
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Car Model             3344 non-null   object
 1   Language              3344 non-null   object
 2   Video ID              3344 non-null   object
 3   Author                3343 non-null   object
 4   Published At          3344 non-null   object
 5   Updated At            3344 non-null   object
 6   Like Count            3344 non-null   int64 
 7   Comment Text          3344 non-null   object
 8   Cleaned Comment Text  3344 non-null   object
dtypes: int64(1), object(8)
memory usage: 261.2+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 47233 entries, 0 to 55605
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Car Model             47233 non-null  object
 1   Language              47233 non-null  o