In [50]:
import pandas as pd
import contractions
from torch.utils.data import Dataset, DataLoader
from google import genai
import os
from dotenv import load_dotenv
import time
import re
import csv

In [51]:
def clean_text(text):
    """Normalise a piece of free text into lowercase, space-separated tokens.

    The passes run in a fixed order:

    1. drop URLs (``http://``, ``https://`` and bare ``www.`` forms),
    2. drop every character that is neither a word character nor whitespace,
    3. insert a space at each digit/ASCII-letter boundary,
    4. lowercase the result,
    5. collapse whitespace runs to a single space and trim both ends.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r'(http[s]?://\S+|www\.\S+)', ''),                          # URLs
        (r'[^\w\s]', ''),                                            # special characters
        (r'(?<=\d)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=\d)', ' '),           # digit/letter boundaries
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Lowercase, then squeeze whitespace and trim.
    return re.sub(r'\s+', ' ', cleaned.lower()).strip()

In [52]:
# Parsed abbreviation tables, keyed by file path.  Each entry stores the file's
# modification time alongside the mapping so an edited CSV is picked up again.
_ABBREVIATION_CACHE = {}


def _load_abbreviations(path):
    """Return the abbreviation -> expansion dict stored in the CSV at ``path``.

    The file is parsed once and reused until its modification time changes.
    Rows with fewer than two columns (e.g. blank lines) are ignored.
    """
    mtime = os.stat(path).st_mtime_ns
    cached = _ABBREVIATION_CACHE.get(path)
    if cached is None or cached[0] != mtime:
        # newline='' is what the csv module asks for when reading files.
        with open(path, mode='r', encoding='utf-8', newline='') as file:
            mapping = {row[0]: row[1] for row in csv.reader(file) if len(row) >= 2}
        cached = (mtime, mapping)
        _ABBREVIATION_CACHE[path] = cached
    return cached[1]


def expand(text, abbreviations_path='abbreviations.csv'):
    """Replace known abbreviations in ``text`` with their expansions.

    The text is split on whitespace; each token is lowercased and looked up in
    the abbreviation table (first CSV column -> second CSV column).  Matching
    tokens are replaced, everything else is kept as written.

    The token ``pm`` is special-cased: it is only expanded when it is not the
    first token and the token before it is purely alphabetic, so that a
    ``pm`` following a number (as in ``5 pm``) is left alone.

    Args:
        text: The text to expand.
        abbreviations_path: Path of the two-column abbreviations CSV.
            Defaults to ``'abbreviations.csv'``.

    Returns:
        The tokens re-joined with single spaces, or ``""`` when ``text`` is
        not a ``str`` (same convention as ``clean_text``).

    Raises:
        OSError: If the abbreviations file cannot be read.
    """
    if not isinstance(text, str):
        return ""

    abbreviations = _load_abbreviations(abbreviations_path)

    words = text.split()
    for i, token in enumerate(words):
        word = token.lower()
        if word not in abbreviations:
            continue
        # words[i-1] is read from the (possibly already expanded) list on purpose,
        # which is how the check has always behaved.
        if word == 'pm' and not (i > 0 and words[i - 1].isalpha()):
            continue
        words[i] = abbreviations[word]
    return ' '.join(words)

In [53]:
def preprocessing(path, output_file='CLAN_data_preprocessed.csv'):
    """Normalise the 'Social Media Post' column of a CSV and save the result.

    Each post is passed through ``contractions.fix``, then ``expand``, then
    ``clean_text``.  All other columns are written through unchanged.  The
    output is produced with ``csv.writer`` so that every field (header
    included) is escaped correctly; text fields are always quoted and numeric
    fields are written bare.

    Args:
        path: Path of the input CSV.  It must contain a
            'Social Media Post' column.
        output_file: Where to write the processed CSV.  Defaults to
            ``'CLAN_data_preprocessed.csv'``.

    Returns:
        None.  Any exception is caught and reported on stdout as
        ``Error: <message>`` rather than raised.
    """
    try:
        df = pd.read_csv(path)

        def _normalise(post):
            # Cells that are not strings (pandas yields float NaN for missing
            # values) skip the text pipeline and become "", which is what
            # clean_text returns for non-string input.
            if not isinstance(post, str):
                return ""
            return clean_text(expand(contractions.fix(post)))

        # Transform everything in memory before touching the output path, so a
        # failure part-way through cannot leave a truncated file behind.
        processed = df.copy()
        processed['Social Media Post'] = [
            _normalise(post) for post in df['Social Media Post']
        ]

        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC, lineterminator='\n')
            writer.writerow(processed.columns)
            writer.writerows(processed.itertuples(index=False, name=None))

    except Exception as e:
        # Best-effort reporting kept from the original notebook: nothing is raised.
        print(f"Error: {e}")


In [54]:
# Run the preprocessing pipeline on the raw dataset; the result is written to
# CLAN_data_preprocessed.csv. Failures are printed as "Error: ...", not raised.
preprocessing('CLAN_data.csv')

In [55]:
# Reload the preprocessed file and print the row with index label 1 as a quick
# sanity check of the cleaned text.
df = pd.read_csv('CLAN_data_preprocessed.csv')
print(df.loc[1])

PID                                                                  2
Social Media Post    important announcement coronavirus last evenin...
Normalized Claim     If someone with the new coronavirus sneezes, i...
Name: 1, dtype: object
