# 1- Libraries

In [16]:
import spacy
import pandas as pd
from bs4 import BeautifulSoup

# 2- Load Dataset

In [17]:
# Read the training split from its Parquet file into a DataFrame.
dataset = pd.read_parquet("../data/imdb/plain_text/train-00000-of-00001.parquet")
# Pull out the two columns used below: the review text ('text') and its
# class label ('label'). Both column names are assumed to exist in the file.
text_data, class_labels = dataset["text"], dataset["label"]

# 3- Preprocessing function

In [36]:
_NLP = None  # spaCy pipeline, loaded lazily and shared by every call


def _get_nlp():
    """Return the shared ``en_core_web_sm`` pipeline, loading it on first use."""
    global _NLP
    if _NLP is None:
        # Loading a model is expensive; do it once instead of once per review.
        _NLP = spacy.load("en_core_web_sm")
    return _NLP


def preprocess_review(text, batch_size=512, n_process=1):
    """
    Preprocesses one or more text reviews using spaCy.

    Each review is lowercased, stripped of HTML tags, tokenized, and reduced
    to its alphabetic, non-stop-word tokens joined by single spaces.

    Args:
        text (str | Iterable[str]): A single movie review, or an iterable of
            reviews to process as one batch.
        batch_size (int): Number of texts spaCy buffers per batch.
        n_process (int): Number of worker processes passed to ``nlp.pipe``.
            Defaults to 1 so that per-review calls do not spawn a process
            pool every time.

    Returns:
        List[str]: One preprocessed string per input review (a one-element
        list when a single string is passed).
    """
    # ``nlp.pipe`` expects an iterable of texts. Handing it a bare string
    # makes it iterate character by character, so a lone review is wrapped.
    reviews = [text] if isinstance(text, str) else list(text)

    # Lowercase first, then strip HTML (same order as before).
    cleaned = [remove_html_tags(review.lower()) for review in reviews]

    preprocessed_text = []
    docs = _get_nlp().pipe(cleaned, batch_size=batch_size, n_process=n_process)
    for doc in docs:
        # ``token.is_stop`` consults the pipeline's own stop list, avoiding a
        # reach into ``spacy.lang.en`` that was never imported explicitly.
        tokens = [
            token.text for token in doc if token.is_alpha and not token.is_stop
        ]
        preprocessed_text.append(" ".join(tokens))

    return preprocessed_text


def remove_html_tags(text):
    """
    Strip HTML markup from a string, keeping only its text content.

    Args:
        text (str): Input that may contain HTML tags.

    Returns:
        str: The text content with all tags dropped.
    """
    # NOTE: get_text() is called without a separator, so text on either side
    # of a tag is joined directly (e.g. "a<br />b" -> "ab").
    return BeautifulSoup(text, "html.parser").get_text()

# 4- Run and save to a pandas DataFrame

In [None]:
# Run the preprocessing function over every review, in order.
preprocessed_text = [preprocess_review(review) for review in text_data]

# Assemble the original text, its preprocessed form, and the label side by
# side, then write the table to disk without the index column.
columns = {
    "original_text": text_data,
    "preprocessed_text": preprocessed_text,
    "label": class_labels,
}
df = pd.DataFrame(columns)
df.to_csv("preprocessed_data.csv", index=False)