In [None]:
import pandas as pd
import os
import re
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Load the annotation metadata; every row names one text file through `file_id`.
df = pd.read_csv('annotations_metadata.csv')

# Data collection - read text files
text_files_dir = 'all_files'


def _load_text(file_id):
    """Return the content of `<text_files_dir>/<file_id>.txt`, or None if reading fails."""
    file_path = os.path.join(text_files_dir, f"{file_id}.txt")
    try:
        # 'utf-8-sig' transparently drops a leading BOM if the file has one.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            return file.read()
    except Exception as e:
        # Best effort: report the failure and leave a gap (None) for this row.
        print(f"Error reading file {file_id}: {e}")
        return None


# One entry per metadata row, in row order; unreadable files yield None.
texts = [_load_text(file_id) for file_id in df['file_id']]

# Attach the texts to a copy of the metadata and persist the combined table.
df_with_text = df.assign(text=texts)

df_with_text.to_csv('supremacist_raw.csv', index=False, encoding='utf-8-sig')

In [None]:
def preprocess_text(text, noise_tokens=('http', 'https', 'nt', 'mkr', 've', 'don',
                                        'kat', 'kabir', 'singh', 'bhai')):
    """Normalise a raw post into lowercase, ASCII-letter-only, single-spaced text.

    Processing order:
      1. Drop URLs (anything starting with ``http`` up to the next whitespace).
      2. Drop punctuation, digits, underscores and non-ASCII characters.
      3. Lowercase.
      4. Drop noise tokens, matched as whole words only.
      5. Collapse whitespace runs to a single space and strip the ends.

    Args:
        text: The raw text. Any non-string value (e.g. ``None`` or a NaN coming
            from a missing CSV cell) is treated as empty.
        noise_tokens: Iterable of words to remove. They are compared in
            lowercase against whole words, so ordinary words that merely
            contain one of them (e.g. "want", "have") are left intact.

    Returns:
        The cleaned string; ``''`` for empty or non-string input.
    """
    # Missing values would make re.sub raise TypeError; treat them as empty text.
    if not isinstance(text, str):
        return ''

    text = re.sub(r'http\S+', '', text)
    # Character-wise removals. Removing all punctuation also removes ellipses,
    # and removing all digits also covers numbers at word starts.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[\d_]+', '', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Lowercase before token removal so "Don" and "don" are treated alike.
    text = text.lower()

    if noise_tokens:
        # Word boundaries keep the tokens from being cut out of longer words.
        alternatives = '|'.join(re.escape(str(tok).lower()) for tok in noise_tokens)
        text = re.sub(r'\b(?:' + alternatives + r')\b', '', text)

    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Reload the raw table written by the data-collection cell and keep only the
# columns needed for modelling.
df = pd.read_csv('supremacist_raw.csv')
df = df.drop(columns=['file_id', 'user_id', 'subforum_id', 'num_contexts'])

# Keep only the two target classes. The noHate-then-hate order is preserved
# because it determines the result of the seeded shuffle below.
df1 = df[df['label'] == 'noHate']
df2 = df[df['label'] == 'hate']
data = pd.concat([df1, df2])

# Texts that could not be read (or were empty) come back from the CSV as NaN.
# Replace them with '' BEFORE cleaning, otherwise the regex calls inside
# preprocess_text raise TypeError on the float NaN.
data['prepro'] = data['text'].fillna('').astype(str).progress_apply(preprocess_text)

# Discard posts with three or fewer words after cleaning; .copy() makes the
# filtered frame independent so the column assignment below is unambiguous.
data = data[data['prepro'].apply(lambda x: len(x.split()) > 3)].copy()

# Binary target: 1 for 'hate', 0 for 'noHate'.
data['encoding'] = (data['label'] == 'hate').astype(int)
data = data.drop(columns=['text', 'label'])

# Deterministic shuffle of all rows.
data = data.sample(frac=1, random_state=42)

data.to_csv('supremacist_input.csv', index=False, encoding='utf-8-sig')