In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("tmehul/spamcsv")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/spamcsv


In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv(path + "/spam.csv", encoding="latin1")  # Adjust encoding if needed
df = df[['v1', 'v2']]  # Selecting the relevant columns if necessary
df.columns = ['label', 'message']  # Renaming for clarity

print(df.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [3]:
# Number of SMS messages
num_messages = len(df)

# Number of spam messages
num_spam = len(df[df['label'] == 'spam'])

# Word count of all messages
df['word_count'] = df['message'].apply(lambda x: len(x.split()))

# Average number of words per message
avg_words = df['word_count'].mean()

# Finding the most frequent words
from collections import Counter

word_list = " ".join(df['message']).split()
word_freq = Counter(word_list)

most_common_words = word_freq.most_common(5)
rare_words = sum(1 for word, count in word_freq.items() if count == 1)

# Print results
print(f"Total messages: {num_messages}")
print(f"Spam messages: {num_spam}")
print(f"Average words per message: {avg_words:.2f}")
print(f"5 Most frequent words: {most_common_words}")
print(f"Number of rare words (appear once): {rare_words}")

Total messages: 5572
Spam messages: 747
Average words per message: 15.49
5 Most frequent words: [('to', 2134), ('you', 1622), ('I', 1466), ('a', 1327), ('the', 1197)]
Number of rare words (appear once): 9268


In [4]:
import nltk

# Download the necessary resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')  # Sometimes useful for lemmatization

# Then proceed with your code

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
import nltk

# Delete the cache and reinstall punkt
nltk.data.path.append('/usr/local/share/nltk_data')  # Ensure correct path
nltk.download('punkt', force=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
import os
import nltk

# Set environment variable
os.environ["NLTK_DATA"] = "/usr/local/share/nltk_data"

# Then try downloading again
nltk.download('punkt', force=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
import nltk
import shutil

# Remove previous NLTK data folder (Colab stores it under /root/nltk_data)
shutil.rmtree('/root/nltk_data', ignore_errors=True)

# Force fresh installation of necessary packages
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [9]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Ensure necessary downloads
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Example processing
text = str(df['message'][0])  # Ensure it's a string

# Use RegexpTokenizer instead of word_tokenize()
tokenizer = RegexpTokenizer(r'\w+')  # This extracts only words, avoiding the punkt issue
tokens_nltk = tokenizer.tokenize(text)

lemmas_nltk = [lemmatizer.lemmatize(token) for token in tokens_nltk]
stems_nltk = [stemmer.stem(token) for token in tokens_nltk]

print("NLTK Tokenization (using RegexpTokenizer):", tokens_nltk)
print("NLTK Lemmatization:", lemmas_nltk)
print("NLTK Stemming:", stems_nltk)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


NLTK Tokenization (using RegexpTokenizer): ['Go', 'until', 'jurong', 'point', 'crazy', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'Cine', 'there', 'got', 'amore', 'wat']
NLTK Lemmatization: ['Go', 'until', 'jurong', 'point', 'crazy', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'Cine', 'there', 'got', 'amore', 'wat']
NLTK Stemming: ['go', 'until', 'jurong', 'point', 'crazi', 'avail', 'onli', 'in', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amor', 'wat']


In [11]:
import spacy

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Example processing
text = str(df['message'][0])  # Ensure it's a string
doc = nlp(text)

# Extract tokenization, lemmatization, and stemming (in SpaCy, lemmatization is used instead of stemming)
tokens_spacy = [token.text for token in doc]
lemmas_spacy = [token.lemma_ for token in doc]
stems_spacy = [token.lemma_ for token in doc]  # SpaCy doesn't have separate stemming, so we use lemma_

print("SpaCy Tokenization:", tokens_spacy)
print("SpaCy Lemmatization:", lemmas_spacy)
print("SpaCy 'Stemming' (same as Lemmatization in SpaCy):", stems_spacy)

SpaCy Tokenization: ['Go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'got', 'amore', 'wat', '...']
SpaCy Lemmatization: ['go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'get', 'amore', 'wat', '...']
SpaCy 'Stemming' (same as Lemmatization in SpaCy): ['go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'get', 'amore', 'wat', '...']


In [12]:
from collections import Counter

# Create new columns in dataframe to store processed versions
df['tokens_nltk'] = df['message'].apply(lambda x: RegexpTokenizer(r'\w+').tokenize(str(x)))
df['lemmas_nltk'] = df['tokens_nltk'].apply(lambda tokens: [WordNetLemmatizer().lemmatize(token) for token in tokens])
df['stems_nltk'] = df['tokens_nltk'].apply(lambda tokens: [PorterStemmer().stem(token) for token in tokens])

# Repeat for SpaCy
import spacy

nlp = spacy.load("en_core_web_sm")

def spacy_processing(text):
    doc = nlp(str(text))
    tokens = [token.text for token in doc]
    lemmas = [token.lemma_ for token in doc]
    stems = lemmas  # Stemming in SpaCy is handled by lemmas
    return tokens, lemmas, stems

df[['tokens_spacy', 'lemmas_spacy', 'stems_spacy']] = df['message'].apply(lambda x: pd.Series(spacy_processing(x)))

# Compute Statistics for Each Version
def compute_stats(column):
    word_list = [word for message in df[column] for word in message]  # Flatten list
    word_freq = Counter(word_list)

    num_messages = len(df)
    num_spam = len(df[df['label'] == 'spam'])
    avg_words = sum(len(message) for message in df[column]) / num_messages
    most_common_words = word_freq.most_common(5)
    rare_words = sum(1 for word, count in word_freq.items() if count == 1)

    return num_messages, num_spam, avg_words, most_common_words, rare_words

# Print stats for tokenized, lemmatized, and stemmed versions
for version in ['tokens_nltk', 'lemmas_nltk', 'stems_nltk', 'tokens_spacy', 'lemmas_spacy', 'stems_spacy']:
    stats = compute_stats(version)
    print(f"\nStatistics for {version}:")
    print(f"Total messages: {stats[0]}")
    print(f"Spam messages: {stats[1]}")
    print(f"Average words per message: {stats[2]:.2f}")
    print(f"5 Most frequent words: {stats[3]}")
    print(f"Number of rare words (appear once): {stats[4]}")


Statistics for tokens_nltk:
Total messages: 5572
Spam messages: 747
Average words per message: 16.17
5 Most frequent words: [('to', 2148), ('I', 2013), ('you', 1896), ('a', 1332), ('the', 1202)]
Number of rare words (appear once): 5606

Statistics for lemmas_nltk:
Total messages: 5572
Spam messages: 747
Average words per message: 16.17
5 Most frequent words: [('to', 2148), ('I', 2013), ('you', 1896), ('a', 1470), ('the', 1202)]
Number of rare words (appear once): 5322

Statistics for stems_nltk:
Total messages: 5572
Spam messages: 747
Average words per message: 16.17
5 Most frequent words: [('i', 3001), ('to', 2242), ('you', 2240), ('a', 1433), ('the', 1328)]
Number of rare words (appear once): 3555

Statistics for tokens_spacy:
Total messages: 5572
Spam messages: 747
Average words per message: 18.58
5 Most frequent words: [('.', 4945), ('to', 2148), ('I', 1988), ('you', 1878), (',', 1857)]
Number of rare words (appear once): 6272

Statistics for lemmas_spacy:
Total messages: 5572
Spa

In [13]:
import requests
from bs4 import BeautifulSoup

# Wikipedia page URL
url = "https://en.wikipedia.org/wiki/Duck"
response = requests.get(url)

# Parse HTML content
soup = BeautifulSoup(response.text, "html.parser")

# Extract page title
title = soup.find("h1").text
print(f"Page Title: {title}")

# Extract all headings (subsections)
headings = [h.text for h in soup.find_all(["h2", "h3"])]
print("\nHeadings Found:")
print("\n".join(headings))

# Extract all paragraph text
paragraphs = [p.text for p in soup.find_all("p")]

# Print first 5 paragraphs
print("\nSample Paragraphs:")
for i in range(5):
    print(f"\nParagraph {i+1}:\n{paragraphs[i]}")

Page Title: Duck

Headings Found:
Contents
Etymology
Taxonomy
Morphology
Distribution and habitat
Behaviour
Feeding
Breeding
Communication
Predators
Relationship with humans
Hunting
Domestication
Heraldry
Cultural references
See also
Notes
Citations
Sources
External links

Sample Paragraphs:

Paragraph 1:



Paragraph 2:
See text


Paragraph 3:
Duck is the common name for numerous species of waterfowl in the family Anatidae. Ducks are generally smaller and shorter-necked than swans and geese, which are members of the same family. Divided among several subfamilies, they are a form taxon; they do not represent a monophyletic group (the group of all descendants of a single common ancestral species), since swans and geese are not considered ducks. Ducks are mostly aquatic birds, and may be found in both fresh water and sea water.


Paragraph 4:
Ducks are sometimes confused with several types of unrelated water birds with similar forms, such as loons or divers, grebes, gallinules and coots.

In [14]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Ensure necessary downloads
nltk.download('wordnet')

# Initialize NLTK tools
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Example processing for scraped text
def process_text_nltk(text):
    tokens = tokenizer.tokenize(text)
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    stems = [stemmer.stem(token) for token in tokens]
    return tokens, lemmas, stems

# Assuming 'paragraphs' contains the scraped Wikipedia text
processed_nltk = [process_text_nltk(paragraph) for paragraph in paragraphs]

# Display sample results
for i in range(3):  # Print first 3 processed paragraphs
    print(f"\nParagraph {i+1} (NLTK):")
    print(f"Tokens: {processed_nltk[i][0]}")
    print(f"Lemmas: {processed_nltk[i][1]}")
    print(f"Stems: {processed_nltk[i][2]}")


Paragraph 1 (NLTK):
Tokens: []
Lemmas: []
Stems: []

Paragraph 2 (NLTK):
Tokens: ['See', 'text']
Lemmas: ['See', 'text']
Stems: ['see', 'text']

Paragraph 3 (NLTK):
Tokens: ['Duck', 'is', 'the', 'common', 'name', 'for', 'numerous', 'species', 'of', 'waterfowl', 'in', 'the', 'family', 'Anatidae', 'Ducks', 'are', 'generally', 'smaller', 'and', 'shorter', 'necked', 'than', 'swans', 'and', 'geese', 'which', 'are', 'members', 'of', 'the', 'same', 'family', 'Divided', 'among', 'several', 'subfamilies', 'they', 'are', 'a', 'form', 'taxon', 'they', 'do', 'not', 'represent', 'a', 'monophyletic', 'group', 'the', 'group', 'of', 'all', 'descendants', 'of', 'a', 'single', 'common', 'ancestral', 'species', 'since', 'swans', 'and', 'geese', 'are', 'not', 'considered', 'ducks', 'Ducks', 'are', 'mostly', 'aquatic', 'birds', 'and', 'may', 'be', 'found', 'in', 'both', 'fresh', 'water', 'and', 'sea', 'water']
Lemmas: ['Duck', 'is', 'the', 'common', 'name', 'for', 'numerous', 'specie', 'of', 'waterfowl', 

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
import spacy

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Example processing for scraped text
def process_text_spacy(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    lemmas = [token.lemma_ for token in doc]
    stems = lemmas  # Since SpaCy uses lemmatization instead of traditional stemming
    return tokens, lemmas, stems

# Assuming 'paragraphs' contains the scraped Wikipedia text
processed_spacy = [process_text_spacy(paragraph) for paragraph in paragraphs]

# Display sample results
for i in range(3):  # Print first 3 processed paragraphs
    print(f"\nParagraph {i+1} (SpaCy):")
    print(f"Tokens: {processed_spacy[i][0]}")
    print(f"Lemmas: {processed_spacy[i][1]}")
    print(f"Stems (same as Lemmas in SpaCy): {processed_spacy[i][2]}")


Paragraph 1 (SpaCy):
Tokens: ['\n']
Lemmas: ['\n']
Stems (same as Lemmas in SpaCy): ['\n']

Paragraph 2 (SpaCy):
Tokens: ['See', 'text', '\n']
Lemmas: ['see', 'text', '\n']
Stems (same as Lemmas in SpaCy): ['see', 'text', '\n']

Paragraph 3 (SpaCy):
Tokens: ['Duck', 'is', 'the', 'common', 'name', 'for', 'numerous', 'species', 'of', 'waterfowl', 'in', 'the', 'family', 'Anatidae', '.', 'Ducks', 'are', 'generally', 'smaller', 'and', 'shorter', '-', 'necked', 'than', 'swans', 'and', 'geese', ',', 'which', 'are', 'members', 'of', 'the', 'same', 'family', '.', 'Divided', 'among', 'several', 'subfamilies', ',', 'they', 'are', 'a', 'form', 'taxon', ';', 'they', 'do', 'not', 'represent', 'a', 'monophyletic', 'group', '(', 'the', 'group', 'of', 'all', 'descendants', 'of', 'a', 'single', 'common', 'ancestral', 'species', ')', ',', 'since', 'swans', 'and', 'geese', 'are', 'not', 'considered', 'ducks', '.', 'Ducks', 'are', 'mostly', 'aquatic', 'birds', ',', 'and', 'may', 'be', 'found', 'in', 'both

In [16]:
from google.colab import files

# Upload the file manually from local machine
uploaded = files.upload()

# Load text from file (assuming filename is 'WhatsApp Chat.txt')
file_path = "WhatsApp Chat.txt"
with open(file_path, "r", encoding="utf-8") as file:
    chat_text = file.read()

print("Sample WhatsApp chat text:\n", chat_text[:500])  # Display first 500 characters

Saving WhatsApp Chat.txt to WhatsApp Chat.txt
Sample WhatsApp chat text:
 13/08/2020, 08:19 - Messages and calls are end-to-end encrypted. Only people in this chat can read, listen to, or share them. Learn more.
13/08/2020, 08:19 - ‎‫אמא‬‎ created group "‎‫כל מה שצריך לקנות‬‎"
13/08/2020, 08:19 - ‎‫אמא‬‎ added you
13/08/2020, 08:19 - אמא: 2 סוכר לבן
1 סוכר דמררה
1 מפיות
13/08/2020, 08:19 - אמא: קפה אדום
13/08/2020, 08:20 - אמא: ביצים
13/08/2020, 08:26 - אמא: סירופ מייפל
13/08/2020, 13:44 - אם יורשה לי: שמנת מתוקה
13/08/2020, 13:45 - משה: חציל
13/08/2020, 13:47 - Danie


In [18]:
import nltk
from nltk.tokenize import RegexpTokenizer

# Ensure necessary downloads
nltk.download('punkt')

# Initialize tokenization
tokenizer = RegexpTokenizer(r'\w+')

# Hebrew stemming function (basic heuristic approach)
def simple_hebrew_stemmer(word):
    suffixes = ["ים", "ות", "ה", "ים", "ות", "י", "ת"]
    for suffix in suffixes:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word

# Processing function
def process_text_nltk(text):
    tokens = tokenizer.tokenize(text)
    stems = [simple_hebrew_stemmer(token) for token in tokens]  # Apply heuristic stemming
    return tokens, stems  # No official NLTK lemmatization for Hebrew

tokens_nltk, stems_nltk = process_text_nltk(chat_text)

print("\nNLTK Tokenization (Hebrew):", tokens_nltk[:50])
print("NLTK Stemming (Hebrew - heuristic):", stems_nltk[:50])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



NLTK Tokenization (Hebrew): ['13', '08', '2020', '08', '19', 'Messages', 'and', 'calls', 'are', 'end', 'to', 'end', 'encrypted', 'Only', 'people', 'in', 'this', 'chat', 'can', 'read', 'listen', 'to', 'or', 'share', 'them', 'Learn', 'more', '13', '08', '2020', '08', '19', 'אמא', 'created', 'group', 'כל', 'מה', 'שצריך', 'לקנות', '13', '08', '2020', '08', '19', 'אמא', 'added', 'you', '13', '08', '2020']
NLTK Stemming (Hebrew - heuristic): ['13', '08', '2020', '08', '19', 'Messages', 'and', 'calls', 'are', 'end', 'to', 'end', 'encrypted', 'Only', 'people', 'in', 'this', 'chat', 'can', 'read', 'listen', 'to', 'or', 'share', 'them', 'Learn', 'more', '13', '08', '2020', '08', '19', 'אמא', 'created', 'group', 'כל', 'מ', 'שצריך', 'לקנ', '13', '08', '2020', '08', '19', 'אמא', 'added', 'you', '13', '08', '2020']


In [21]:
import spacy

# Load a blank Hebrew tokenizer (since we can't install the full model)
nlp = spacy.blank("he")  # Creates a SpaCy pipeline for Hebrew without pre-trained components

# Processing function for tokenization only (lemmatization won't work without a full model)
def process_text_spacy(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    lemmas = tokens  # Since no Hebrew lemmatizer is available, we'll keep tokens as lemmas
    stems = tokens   # Stemming also isn't supported in SpaCy for Hebrew, so we use tokens

    return tokens, lemmas, stems

# Process the Hebrew WhatsApp chat
tokens_spacy, lemmas_spacy, stems_spacy = process_text_spacy(chat_text)

# Display results
print("\nSpaCy Tokenization (Hebrew):", tokens_spacy[:50])
print("SpaCy Lemmatization (Fallback to Tokens):", lemmas_spacy[:50])
print("SpaCy 'Stemming' (Fallback to Tokens):", stems_spacy[:50])


SpaCy Tokenization (Hebrew): ['13/08/2020', ',', '08:19', '-', 'Messages', 'and', 'calls', 'are', 'end', '-', 'to', '-', 'end', 'encrypted', '.', 'Only', 'people', 'in', 'this', 'chat', 'can', 'read', ',', 'listen', 'to', ',', 'or', 'share', 'them', '.', 'Learn', 'more', '.', '\n', '13/08/2020', ',', '08:19', '-', '\u200e\u202bאמא\u202c\u200e', 'created', 'group', '"', '\u200e\u202bכל', 'מה', 'שצריך', 'לקנות\u202c\u200e', '"', '\n', '13/08/2020', ',']
SpaCy Lemmatization (Fallback to Tokens): ['13/08/2020', ',', '08:19', '-', 'Messages', 'and', 'calls', 'are', 'end', '-', 'to', '-', 'end', 'encrypted', '.', 'Only', 'people', 'in', 'this', 'chat', 'can', 'read', ',', 'listen', 'to', ',', 'or', 'share', 'them', '.', 'Learn', 'more', '.', '\n', '13/08/2020', ',', '08:19', '-', '\u200e\u202bאמא\u202c\u200e', 'created', 'group', '"', '\u200e\u202bכל', 'מה', 'שצריך', 'לקנות\u202c\u200e', '"', '\n', '13/08/2020', ',']
SpaCy 'Stemming' (Fallback to Tokens): ['13/08/2020', ',', '08:19', '-', '