In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from google.colab import files
import pandas as pd

# Step 2: Read CSV
df = pd.read_csv('mail_data.csv')

# Step 3: Preprocess Data
# Function to convert text to lowercase, remove punctuation, and tokenize
def preprocess_and_tokenize(text):
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation using translate with string.punctuation
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)

    # Tokenize by splitting by whitespace
    tokens = nltk.word_tokenize(text)

    return tokens

# Apply the preprocessing and tokenization function to the 'Message' column
df['Processed_Message'] = df['Message'].apply(preprocess_and_tokenize)

# Replace 'ham' with 0 and 'spam' with 1 in the 'Category' column
df['Category'] = df['Category'].map({'ham': 0, 'spam': 1})

# Lemmatize the tokenized text
lemmatizer = WordNetLemmatizer()
df['Processed_Message'] = df['Processed_Message'].apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])

# Remove stopwords
stop_words = set(stopwords.words('english'))
df['Processed_Message'] = df['Processed_Message'].apply(lambda tokens: [word for word in tokens if word not in stop_words])

# Display the processed DataFrame
print(df[['Category', 'Processed_Message']])

# Step 4: Save the processed DataFrame to a new CSV file
output_filename = 'lemmatized_without_stopwords_puntuation.csv'
df[['Category', 'Processed_Message']].to_csv(output_filename, index=False)

# Download the file to your local machine
files.download(output_filename)

      Category                                  Processed_Message
0            0  [go, jurong, point, crazy, available, bugis, n...
1            0                     [ok, lar, joking, wif, u, oni]
2            1  [free, entry, 2, wkly, comp, win, fa, cup, fin...
3            0      [u, dun, say, early, hor, u, c, already, say]
4            0  [nah, dont, think, go, usf, life, around, though]
...        ...                                                ...
5567         1  [2nd, time, tried, 2, contact, u, u, £750, pou...
5568         0                 [ü, b, going, esplanade, fr, home]
5569         0                [pity, wa, mood, soany, suggestion]
5570         0  [guy, bitching, acted, like, id, interested, b...
5571         0                                 [rofl, true, name]

[5572 rows x 2 columns]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>