In [None]:

import re
from pathlib import Path

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import spacy
from nltk.corpus import stopwords
from tqdm import tqdm


In [None]:
nltk.download('stopwords')
nltk.download('punkt')

!python -m spacy download en_core_web_sm


In [None]:
# Small in-notebook sample of claims that the later cells preprocess.
texts = [
    "Vaccines cause serious side effects according to reports!",
    "5G towers are spreading harmful radiation.",
    "Government confirms vaccine safety after trials.",
    "Social media claims about microchips in vaccines spread rapidly.",
]

# One row per claim, stored in a single "text" column.
df = pd.DataFrame(texts, columns=["text"])
df


In [None]:
# spaCy's small English pipeline; preprocess_text calls it to get tokens and lemmas.
nlp = spacy.load("en_core_web_sm")

# NLTK's English stop-word list, held as a set for the membership test in preprocess_text.
stop_words = {*stopwords.words("english")}


In [None]:
def preprocess_text(text):
    """Normalize a raw text string into a space-separated string of lemmas.

    Steps, in order: lowercase the text, strip ``http...`` URLs, delete every
    character that is not a lowercase ASCII letter or whitespace, run the
    result through the notebook-level spaCy pipeline ``nlp``, and keep the
    lemma of each token that is alphabetic and whose text is not in the
    notebook-level ``stop_words`` set.

    Args:
        text: Raw text to clean. Any non-string value (for example the NaN
            pandas uses for a missing cell) is treated as empty text.

    Returns:
        The kept lemmas joined by single spaces, or an empty string when no
        token survives the filtering.
    """
    # Missing or non-text cells would otherwise fail on `.lower()`.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Relies on `import re` in the notebook's import cell.
    text = re.sub(r"http\S+", "", text)
    # Digits and punctuation are deleted outright, so e.g. "5g" becomes "g".
    text = re.sub(r"[^a-z\s]", "", text)

    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        if token.is_alpha and token.text not in stop_words
    ]

    return " ".join(tokens)


In [None]:
# Register tqdm with pandas so that `progress_apply` is available on Series.
tqdm.pandas()

# Clean every raw text and store the result next to the original.
df["clean_text"] = df["text"].progress_apply(preprocess_text)

# Show raw and cleaned text side by side.
df.loc[:, ["text", "clean_text"]]


In [None]:
# Count the whitespace-separated tokens left in each cleaned text.
df["word_count"] = df["clean_text"].str.split().apply(len)

# Keep only rows with more than three tokens, then renumber the index from 0.
df = df.loc[df["word_count"] > 3].reset_index(drop=True)

df.head()


In [None]:
# Summary statistics for the per-row token counts.
df["word_count"].describe()


In [None]:
# Histogram of the per-row token counts. `plt` comes from the notebook's
# import cell. An explicit Axes is created so the plot is always drawn on a
# fresh figure rather than on whatever figure happens to be current.
fig, ax = plt.subplots()
df["word_count"].hist(bins=20, ax=ax)
ax.set_title("Word Count After Preprocessing")
ax.set_xlabel("Words")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
# Destination of the cleaned dataset, relative to the notebook's working directory.
output_path = Path("../data/processed/clean_text_data.csv")

# `to_csv` does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)
print("Processed data saved successfully.")
