### Data Cleaning

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import numpy as np
import re

# Folder that holds the raw input data
folder = 'raw_data'

# Collect the .tsv.xz files in the folder. os.listdir returns entries in
# arbitrary order, so sort to make the choice of file deterministic.
file = sorted(f for f in os.listdir(folder) if f.endswith('.tsv.xz'))

# Fail with a clear message instead of an opaque IndexError when nothing matches
if not file:
    raise FileNotFoundError("No .tsv.xz file found in {!r}".format(folder))

# Only one file is expected; the first one in sorted order is used
file_path = os.path.join(folder, file[0])

# Read the tab-separated, xz-compressed file
df = pd.read_csv(file_path, sep='\t', compression='xz')

# Clean column names (remove surrounding whitespace)
df.columns = df.columns.str.strip()


# Function to clean text by removing HTML tags, URLs, and extra spaces
def clean_text(text):
    """Strip markup noise from a piece of text.

    HTML tags, URLs and HTML entities are each replaced by a single space,
    then every run of whitespace is collapsed to one space and the result is
    trimmed at both ends.

    Args:
        text: The raw text, or a missing value.

    Returns:
        The cleaned string; an empty string when ``pd.isna(text)`` is true.
    """
    if pd.isna(text):
        return ''

    # Applied in this order: tags first, then URLs, then entities
    noise_patterns = (
        r'<[^>]+>',
        r'https?://\S+|www\.\S+',
        r'&[a-zA-Z0-9#]+;',
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, ' ', text)

    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()


def remove_similar_rows(df, threshold=0.995):
    """Drop exact and near-duplicate articles from ``df``.

    The rows are filtered in three passes:

    1. rows with a repeated ``content_id`` are dropped,
    2. ``content`` is cleaned with ``clean_text`` and rows with a repeated
       ``head`` are dropped,
    3. rows whose TF-IDF cosine similarity to an *earlier* row exceeds
       ``threshold`` are dropped, so the first article of every group of
       near-duplicates is kept.

    Args:
        df: DataFrame with the columns 'content_id', 'content' and 'head'.
        threshold: Cosine similarity above which two articles are treated
            as the same article.

    Returns:
        A new DataFrame with the duplicates removed and a fresh 0..n-1
        index. The DataFrame passed in is not modified.
    """
    # Copy so the column assignment below does not write into a slice of the
    # caller's frame (which would raise SettingWithCopyWarning).
    df = df.drop_duplicates(subset=['content_id']).copy()

    # Clean the content column
    df['content'] = df['content'].apply(clean_text)

    # Drop rows where the head is exactly the same. Resetting the index makes
    # row labels equal to row positions, which is what the similarity matrix
    # below is indexed by.
    df = df.drop_duplicates(subset=["head"]).reset_index(drop=True)

    # With fewer than two rows there is nothing to compare
    if len(df) < 2:
        return df

    # Vectorize the content using TF-IDF; the matrix is kept sparse because
    # cosine_similarity accepts sparse input directly.
    tfidf_matrix = TfidfVectorizer().fit_transform(df['content'])

    # Compute the (n x n, dense) cosine similarity matrix
    cosine_sim_matrix = cosine_similarity(tfidf_matrix)

    # The matrix is symmetric, so only look at the strict upper triangle
    # (i < j). Row j is a duplicate when some earlier row i is too similar;
    # this keeps the first occurrence instead of dropping both sides of a pair.
    above_threshold = np.triu(cosine_sim_matrix > threshold, k=1)
    is_later_duplicate = above_threshold.any(axis=0)

    # Keep the survivors and renumber them
    return df.loc[~is_later_duplicate].reset_index(drop=True)

# Remove identical and near-identical articles (similarity threshold 0.98)
df = remove_similar_rows(df, 0.98)

# Convert the pubtime column to pandas datetime values
df['pubtime'] = pd.to_datetime(df['pubtime'])

# Destination: a Parquet file inside the 'cleaned_data' folder,
# which is created if it doesn't exist yet
output_folder = 'cleaned_data'
output_file = os.path.join(output_folder, "cleaned_data.parquet")
os.makedirs(output_folder, exist_ok=True)

# TODO: currently too slow; remove any unnecessary columns
# Write the cleaned data without the index
df.to_parquet(output_file, engine="pyarrow", index=False)


  df['pubtime'] = pd.to_datetime(df['pubtime'])
