In [2]:
import re
from langdetect import detect
import uuid
from datetime import datetime, UTC
import pandas as pd

In [3]:
# read data 
file_path = "../data/raw/sentiment140.csv"
df = pd.read_csv(file_path)

In [4]:
PII_RE = [
    re.compile(r"\b[\w.-]+?@\w+?\.\w{2,4}\b"), # email
    re.compile(r"\b(?:\+?\d{1,3})?[-.\s]?(?:\d{2,4}[-.\s]?){2,}\d{2,4}\b"), # phone
    re.compile(r'https?://\S+|www\.\S+')
]

def redact_pii(text: str) -> str:
    for r in PII_RE:
        text = r.sub("[REDACTED]", text) # replace email, phone number, link with [REDACTED] with
    return text

def simple_clean(text: str) -> str:
    text = text.replace("\n", " ").strip() # replace new line with " "
    text = re.sub(r"\s+", " ", text) # whitespace to " "
    return text

def preprocess_record(raw_text: str, source="unknown", label=None):
    text = redact_pii(raw_text)
    text = simple_clean(text)

    try:
        lang = detect(text)
    except Exception:
        lang = "unknow"
    rec = {
        "id": str(uuid.uuid4()),
        "source": source,
        "text": raw_text,
        "clean_text": text.lower(),
        "lang": lang,
        "sentiment": None if label is None else ("negative" if label==0 else "positive"),
        "label": label,
        "timestamp": datetime.now(UTC).strftime('%Y-%m-%dT%H:%M:%SZ'),
        "metadata": {}
    }
    return rec



In [5]:
test_raw_pos = df.iloc[1597998:-1, :]


In [6]:
test_raw_neg = df.iloc[0:2000, :]


In [7]:
test_df = pd.concat([test_raw_neg, test_raw_pos])
test_df = test_df.rename(columns={
    test_df.columns[0]: "Target",
    test_df.columns[1]: "ID",
    test_df.columns[2]: "Time",
    test_df.columns[3]: "Query",
    test_df.columns[4]: "User",
    test_df.columns[5]: "Tweet"

})
print(test_df.shape)

(4000, 6)


In [8]:
test_df = test_df.rename(columns={
    test_df.columns[0]: "Target",
    test_df.columns[1]: "ID",
    test_df.columns[2]: "Time",
    test_df.columns[3]: "Query",
    test_df.columns[4]: "User",
    test_df.columns[5]: "Tweet"

})

In [9]:
test_df['Target'] = test_df['Target'].replace(4, 1)

In [10]:
test_df.tail()

Unnamed: 0,Target,ID,Time,Query,User,Tweet
1599993,1,2193579489,Tue Jun 16 08:39:00 PDT 2009,NO_QUERY,EvolveTom,"@Cliff_Forster Yeah, that does work better tha..."
1599994,1,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599995,1,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599996,1,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599997,1,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [11]:
file_to = "../data/test_raw_data/test_raw_data.csv"
test_df.to_csv(file_to, index=False)

In [12]:
print(preprocess_record(test_df['Tweet'][0]))

{'id': '1386aabf-781a-463f-b28c-5550834732c8', 'source': 'unknown', 'text': "is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!", 'clean_text': "is upset that he can't update his facebook by texting it... and might cry as a result school today also. blah!", 'lang': 'en', 'sentiment': None, 'label': None, 'timestamp': '2025-09-19T13:00:36Z', 'metadata': {}}
