In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
import string

# Fetch the NLTK resources this script relies on (stop-word corpus and
# the 'punkt' package requested alongside it for tokenization).
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# English Snowball stemmer, shared by every preprocessing call
stemmer = SnowballStemmer('english')

# English stop words as a set so membership tests are cheap
stop_words = set(stopwords.words('english'))

# Read the collected data (ISO-8859-1 encoded CSV) and discard every row
# that contains at least one missing value
df = pd.read_csv(
    '../Data_Collection/final_data.csv',
    encoding="ISO-8859-1",
).dropna()

# Define a function to preprocess the text
def preprocess_text(text):
    """Turn a raw text value into a list of stemmed, stop-word-free tokens.

    Processing steps for string input:
      1. lower-case the text,
      2. delete every character listed in ``string.punctuation``,
      3. split into tokens with ``word_tokenize``,
      4. discard tokens found in the module-level ``stop_words`` set,
      5. stem the remaining tokens with the module-level ``stemmer``.

    Args:
        text: The cell value to process. Anything that is not a ``str``
            is treated as having no text.

    Returns:
        A list of stemmed tokens, or an empty list when ``text`` is not
        a string.
    """
    # Guard clause: non-string values yield no tokens
    if not isinstance(text, str):
        return []

    # Normalise case, then strip punctuation in a single translate pass
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    # Tokenize, filter stop words and stem in one pass over the tokens
    return [
        stemmer.stem(word)
        for word in word_tokenize(cleaned)
        if word not in stop_words
    ]

# Replace every cell, column by column, with its token list
for column_name in df.columns:
    df[column_name] = df[column_name].map(preprocess_text)

# Write the tokenized DataFrame out as an Excel workbook
output_path = "tokenized_data.xlsx"
df.to_excel(output_path)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aikar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aikar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
