# *Setup Enviorment*

In [13]:
# prompt: Install these Python libraries: nltk, spaCy, BeautifulSoup

!pip install nltk spacy beautifulsoup4




In [14]:
import os
from google.colab import drive
import pandas as pd
from collections import Counter
import nltk
import spacy
from time import time

 **Set Google Drive**


In [15]:

drive.mount('/content/drive')

Mounted at /content/drive


In [16]:

GOOGLE_DRIVE_PATH = '/content/drive/MyDrive'
print(os.listdir('/content/drive'))

['Othercomputers', '.file-revisions-by-id', '.shortcut-targets-by-id', 'MyDrive', '.Trash-0']


# installations

# Load **CSV** and display details

In [17]:




df = pd.read_csv(os.path.join(GOOGLE_DRIVE_PATH, 'spam.csv'), encoding='latin-1')
df.head()

# Drop unnecessary columns and rename the necessary ones
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
df

print(df.head())


     v1                                                 v2
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [18]:
# prompt: Print basic statistics on the data:
# - Total number of SMS messages
# - Number of spam/ham messages
# - Average number of words per message
# - 5 most frequent words
# - Number of words that only appear once

def print_statistics(df):
    print('Total number of SMS messages:', len(df))
    print('Number of spam messages:', len(df[df['v1'] == 'spam']))
    print('Number of ham messages:', len(df[df['v1'] == 'ham']))

    # Average number of words per message
    total_words = 0
    for message in df['v2']:
      total_words += len(message.split())
    average_words_per_message = total_words / len(df)
    print('Average number of words per message:', average_words_per_message)

    # 5 most frequent words

    all_words = ' '.join(df['v2']).split()
    word_counts = Counter(all_words)
    most_frequent_words = word_counts.most_common(5)
    print('5 most frequent words:')
    for word, count in most_frequent_words:
      print(f'- {word}: {count}')

    # Number of words that only appear once
    unique_words = set(all_words)
    once_words = [word for word in unique_words if word_counts[word] == 1]
    print('Number of words that only appear once:', len(once_words))

print_statistics(df)


Total number of SMS messages: 5572
Number of spam messages: 747
Number of ham messages: 4825
Average number of words per message: 15.494436468054559
5 most frequent words:
- to: 2134
- you: 1622
- I: 1466
- a: 1327
- the: 1197
Number of words that only appear once: 9268


# Tokenize Dataset

In [19]:



# Download required resources
nltk.download('punkt')
spacy.load('en_core_web_sm')

# Define the text to be tokenized
text = df['v2'].iloc[0]

# Tokenize using NLTK
start_time = time()
nltk_tokens = nltk.word_tokenize(text)
nltk_time = time() - start_time

# Tokenize using spaCy
start_time = time()
spacy_tokens = spacy.load('en_core_web_sm')(text)
spacy_time = time() - start_time

# Print the results
print("NLTK Tokenization:")
print("- Tokens:", nltk_tokens)
print("- Time:", nltk_time)

print("\nspaCy Tokenization:")
print("- Tokens:", [token.text for token in spacy_tokens])
print("- Time:", spacy_time)

#Output format: NLTK returns a list of strings, while spaCy returns a list of spaCy Token objects.
#Processing speed: spaCy is generally faster than NLTK for larger texts.
#Language support:spaCy supports more languages than NLTK
print_statistics(df)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


NLTK Tokenization:
- Tokens: ['Go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'got', 'amore', 'wat', '...']
- Time: 0.033014774322509766

spaCy Tokenization:
- Tokens: ['Go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'got', 'amore', 'wat', '...']
- Time: 3.3392884731292725
Total number of SMS messages: 5572
Number of spam messages: 747
Number of ham messages: 4825
Average number of words per message: 15.494436468054559
5 most frequent words:
- to: 2134
- you: 1622
- I: 1466
- a: 1327
- the: 1197
Number of words that only appear once: 9268


# **Lemmatize the SMS text**

In [20]:
# prompt:  Lemmatize the SMS text using nltk and spaCy. Analyze the time complexity of the
# lemmatization algorithm

# Download required resources
nltk.download('wordnet')
spacy.load('en_core_web_sm')
nltk.download('punkt')

# Define the text to be lemmatized
text = df['v2'].iloc[0]

# Lemmatize using NLTK
start_time = time()
nltk_lemmas = [nltk.wordnet.WordNetLemmatizer().lemmatize(word) for word in nltk.word_tokenize(text)]
nltk_time = time() - start_time

# Lemmatize using spaCy
start_time = time()
spacy_lemmas = [token.lemma_ for token in spacy.load('en_core_web_sm')(text)]
spacy_time = time() - start_time

# Print the results
print("NLTK Lemmatization:")
print("- Lemmas:", nltk_lemmas)
print("- Time:", nltk_time)

print("\nspaCy Lemmatization:")
print("- Lemmas:", spacy_lemmas)
print("- Time:", spacy_time)

#Output format:NLTK returns a list of strings, while spaCy returns a list of strings or spaCy Lemma objects.

#Processing speed: spaCy is generally faster than NLTK for larger texts.

#Language support: spaCy supports more languages than NLTK.

print_statistics(df)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


NLTK Lemmatization:
- Lemmas: ['Go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'got', 'amore', 'wat', '...']
- Time: 4.1024298667907715

spaCy Lemmatization:
- Lemmas: ['go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'get', 'amore', 'wat', '...']
- Time: 1.8969440460205078
Total number of SMS messages: 5572
Number of spam messages: 747
Number of ham messages: 4825
Average number of words per message: 15.494436468054559
5 most frequent words:
- to: 2134
- you: 1622
- I: 1466
- a: 1327
- the: 1197
Number of words that only appear once: 9268


# **Stem the SMS text**

In [21]:
# prompt: Stem the SMS text using nltk and spaCy. Analyze the time complexity of the stemming
# algorithm.

# Download required resources
nltk.download('punkt')
spacy.load('en_core_web_sm')
nltk.download('snowball_data')

# Define the text to be stemmed
text = df['v2'].iloc[0]

# Stem using NLTK
start_time = time()
nltk_stems = [nltk.stem.PorterStemmer().stem(word) for word in nltk.word_tokenize(text)]
nltk_time = time() - start_time

# Stem using spaCy
start_time = time()
spacy_stems = [token.lemma_ for token in spacy.load('en_core_web_sm')(text)]
spacy_time = time() - start_time

# Print the results
print("NLTK Stemming:")
print("- Stems:", nltk_stems)
print("- Time:", nltk_time)

print("\nspaCy Stemming:")
print("- Stems:", spacy_stems)
print("- Time:", spacy_time)



print_statistics(df)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package snowball_data to /root/nltk_data...


NLTK Stemming:
- Stems: ['go', 'until', 'jurong', 'point', ',', 'crazi', '..', 'avail', 'onli', 'in', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'cine', 'there', 'got', 'amor', 'wat', '...']
- Time: 0.002018451690673828

spaCy Stemming:
- Stems: ['go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'get', 'amore', 'wat', '...']
- Time: 0.7560272216796875
Total number of SMS messages: 5572
Number of spam messages: 747
Number of ham messages: 4825
Average number of words per message: 15.494436468054559
5 most frequent words:
- to: 2134
- you: 1622
- I: 1466
- a: 1327
- the: 1197
Number of words that only appear once: 9268


# Web Scraping Yahoo finance(found easy import)

In [22]:
from bs4 import BeautifulSoup
import requests

url = "https://finance.yahoo.com/?guccounter=1&guce_referrer=aHR0cHM6Ly93d3cuZ29vZ2xlLmNvbS8&guce_referrer_sig=AQAAACHRzA667AARXS__2v5de8mNTKOyFF7jW-cvNi0GO_xT-fg69PydO0Xa5iRXetlJA6ZBGR8ToumsedNBLhkHE7kaWLicN4ilmYfg8EShk1l-Suxct2famqmVFYAZhpPEHLJQLWiwAb9eATIGeIh_NniR12bb44fwIwKtuw-o1c1i"  # Replace with the actual URL
response = requests.get(url)

if response.status_code == 200:
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extracting post texts from a hypothetical social media profile
    posts = soup.find_all('p')  # This will vary based on the actual HTML structure
    for post in posts:
        post.get_text()
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

#creating dataset
len(posts)
df = pd.DataFrame(posts, columns=['posts'])
print(df)

                                                posts
0   We are experiencing some temporary issues. The...
1   Retail sales barely increased in May, missing ...
2   Family caregivers spend over a quarter of thei...
3   The answer might surprise you. Here’s how much...
4   Retail sales increased less than expected in M...
5   The retail sales report comes as economists co...
6   Just as inflation is starting to trend better,...
7   Consumers sentiment hit a seven-month low as p...
8   Some blue-collar jobs can earn you well over s...
9   Every bank customer likely wonders: “How much ...
10  My monthly Social Security is $3,178, my pensi...
11  The Dow Jones was firm on the stock market tod...
12  The widely followed growth investor made big m...
13  CNH Industrial is a global manufacturer of hea...
14  This technology ETF is crushing the return of ...
15  (Bloomberg) -- Fisker Inc. filed for bankruptc...
16  From 2019 to 2022, the number of Americans rec...
17  During the past decade, 

**# Basic Analysis - Before Proccessing**

In [23]:
# prompt: print basic word statistics  on the scraped text

# Number of words
total_words = 0
for post in posts:
  total_words += len(post.get_text().split())
print('Total number of words:', total_words)

# Average number of words per post
average_words_per_post = total_words / len(posts)
print('Average number of words per post:', average_words_per_post)

# 5 most frequent words
all_words = ' '.join([post.get_text() for post in posts]).split()
word_counts = Counter(all_words)
most_frequent_words = word_counts.most_common(5)
print('5 most frequent words:')
for word, count in most_frequent_words:
  print(f'- {word}: {count}')

# Number of words that only appear once
unique_words = set(all_words)
once_words = [word for word in unique_words if word_counts[word] == 1]
print('Number of words that only appear once:', len(once_words))


Total number of words: 770
Average number of words per post: 40.526315789473685
5 most frequent words:
- to: 24
- the: 23
- a: 19
- of: 17
- in: 16
Number of words that only appear once: 402


# tokenization, lemmatization, and stemming on the scraped text.

In [24]:
# prompt: Perform tokenization, lemmatization, and stemming on the scraped text.

# Import required libraries
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('snowball_data')
import spacy

# Define the text to be processed
text = " ".join(post.get_text() for post in posts)

# Tokenization
nltk_tokens = nltk.word_tokenize(text)
spacy_tokens = spacy.load('en_core_web_sm')(text)

# Lemmatization
nltk_lemmas = [nltk.wordnet.WordNetLemmatizer().lemmatize(word) for word in nltk_tokens]
spacy_lemmas = [token.lemma_ for token in spacy_tokens]

# Stemming
nltk_stems = [nltk.stem.PorterStemmer().stem(word) for word in nltk_tokens]
spacy_stems = [token.lemma_ for token in spacy_tokens]

# Print the results
print("NLTK Tokenization:")
print("- Tokens:", nltk_tokens)

print("\nspaCy Tokenization:")
print("- Tokens:", [token.text for token in spacy_tokens])

print("\nNLTK Lemmatization:")
print("- Lemmas:", nltk_lemmas)

print("\nspaCy Lemmatization:")
print("- Lemmas:", spacy_lemmas)

print("\nNLTK Stemming:")
print("- Stems:", nltk_stems)

print("\nspaCy Stemming:")
print("- Stems:", spacy_stems)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package snowball_data to /root/nltk_data...
[nltk_data]   Package snowball_data is already up-to-date!


NLTK Tokenization:
- Tokens: ['We', 'are', 'experiencing', 'some', 'temporary', 'issues', '.', 'The', 'market', 'data', 'on', 'this', 'page', 'is', 'currently', 'delayed', '.', 'Please', 'bear', 'with', 'us', 'as', 'we', 'address', 'this', 'and', 'restore', 'your', 'personalized', 'lists', '.', 'Retail', 'sales', 'barely', 'increased', 'in', 'May', ',', 'missing', 'expectations', 'for', 'a', 'more', 'sizable', 'gain', '.', 'Family', 'caregivers', 'spend', 'over', 'a', 'quarter', 'of', 'their', 'income', 'on', 'caregiving', 'activities', ',', 'according', 'to', 'a', '2021', 'AARP', 'survey', '.', 'Lazetta', '&', 'Associates', 'Founder', 'and', 'CEO', 'Lazetta', 'Rainey', 'Braxton', 'joins', 'Wealth', '!', 'to', 'break', 'down', 'some', 'of', 'the', 'challenges', 'facing', 'caregivers', '.', 'Braxton', 'notes', 'that', 'caregivers', 'are', 'concerned', 'about', 'in-home', 'care', ',', 'home', 'maintenance', 'care', ',', 'and', 'the', 'time', 'necessary', 'to', 'be', 'a', 'caregiver', '—'

**Statistics after data proccesing**

In [25]:
# Number of words
total_words = 0
for post in posts:
  total_words += len(post.get_text().split())
print('Total number of words:', total_words)

# Average number of words per post
average_words_per_post = total_words / len(posts)
print('Average number of words per post:', average_words_per_post)

# 5 most frequent words
all_words = ' '.join([post.get_text() for post in posts]).split()
word_counts = Counter(all_words)
most_frequent_words = word_counts.most_common(5)
print('5 most frequent words:')
for word, count in most_frequent_words:
  print(f'- {word}: {count}')

# Number of words that only appear once
unique_words = set(all_words)
once_words = [word for word in unique_words if word_counts[word] == 1]
print('Number of words that only appear once:', len(once_words))


Total number of words: 770
Average number of words per post: 40.526315789473685
5 most frequent words:
- to: 24
- the: 23
- a: 19
- of: 17
- in: 16
Number of words that only appear once: 402


# WhatApp massages Analysis

**Load txt with massages**

---



In [46]:


import os
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Set the path to the .txt file
file_path = '/content/drive/MyDrive/WhatsApp Chat with Friends.txt'

# Read the file contents
with open(file_path, 'r', encoding='utf-8') as f:
  messages = f.readlines()

# Print the first 5 messages
for message in messages[:5]:
  print(message)

# Create DataFrame
df = pd.DataFrame(messages, columns=['message'])
# Display the DataFrame
print(df)
df.to_csv("/content/whatsapp_data.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[17:57, 28/05/2024] +972 53-235-5578: מליאת מועצת העיר מן המניין תחל הערב, שלישי ה - 28.5, בשעה 18:00 

ותשודר בשידור ישיר בעמוד הפייסבוק ובערוץ היוטיוב העירוני.



הצטרפו אלינו:



                                                message
0     [17:57, 28/05/2024] +972 53-235-5578: מליאת מו...
1     ותשודר בשידור ישיר בעמוד הפייסבוק ובערוץ היוטי...
2                                                    \n
3                                       הצטרפו אלינו:\n
4                                                    \n
...                                                 ...
1585  [23:04, 16/04/2024] +972 50-423-9908: הצלחתי ת...
1586  [23:12, 16/04/2024] Daniel: זה 32 לא 28 דרך אגב\n
1587  [23:12, 16/04/2024] +972 50-423-9908: חחחחח כן...
1588        [23:13, 16/04/2024] Daniel: יאללה בהצלחה!\n
1589  [23:13, 16/04/2024] +972 50-423-9908: תודהה גם לך

[1590 rows x 1 c

**Before Proccessing**

In [37]:
# Combine all text into a single string
all_words = ' '.join(df['message']).lower()

print("---------Before Text Processing-------")
# Number of words
total_words_before = 0
for post in posts:
  total_words_before += len(post.get_text().split())

# Average number of words per post
average_words_per_post_before = total_words_before / len(posts)

# 5 most frequent words
all_words_before = ' '.join([post.get_text() for post in posts]).split()
word_counts_before = Counter(all_words_before)
most_frequent_words_before= word_counts_before.most_common(5)

# Number of words that only appear once
unique_words_before = set(all_words_before)
once_words_before = [word for word in unique_words_before if word_counts_before[word] == 1]
print("- Total Words:")

print("  - Before Processing:", total_words_before)

print("\n- Average Words per Post:")

print("  - Before Processing:", average_words_per_post_before)

print("\n- Most Frequent Words:")
print("  - Before Processing:", most_frequent_words_before)

print("\n- Words Appearing Once:")
print("  - Before Processing:", len(once_words_before))


---------Before Text Processing-------
- Total Words:
  - Before Processing: 770

- Average Words per Post:
  - Before Processing: 40.526315789473685

- Most Frequent Words:
  - Before Processing: [('to', 24), ('the', 23), ('a', 19), ('of', 17), ('in', 16)]

- Words Appearing Once:
  - Before Processing: 402


In [48]:
# prompt:  Tokenize, lemmatize, and stem the WhatsApp data. in Hebrew using NLTK only

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Download required resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Define the text to be processed
text = " ".join(df['message']).lower()

# Tokenization
nltk_tokens = word_tokenize(text)

# Lemmatization
lemmatizer = WordNetLemmatizer()
nltk_lemmas = [lemmatizer.lemmatize(word) for word in nltk_tokens]

# Stemming
stemmer = PorterStemmer()
nltk_stems = [stemmer.stem(word) for word in nltk_tokens]

# Print the results
print("NLTK Tokenization:")
print("- Tokens:", nltk_tokens)

print("\nNLTK Lemmatization:")
print("- Lemmas:", nltk_lemmas)

print("\nNLTK Stemming:")
print("- Stems:", nltk_stems)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


NLTK Tokenization:
- Tokens: ['[', '17:57', ',', '28/05/2024', ']', '+972', '53-235-5578', ':', 'מליאת', 'מועצת', 'העיר', 'מן', 'המניין', 'תחל', 'הערב', ',', 'שלישי', 'ה', '-', '28.5', ',', 'בשעה', '18:00', 'ותשודר', 'בשידור', 'ישיר', 'בעמוד', 'הפייסבוק', 'ובערוץ', 'היוטיוב', 'העירוני', '.', 'הצטרפו', 'אלינו', ':', 'לצפייה', 'בפייסבוק', '>', '>', 'https', ':', '//bit.ly/3r5hs6x', 'לצפייה', 'ביוטיוב', '>', '>', 'https', ':', '//bit.ly/4bwyq16', 'כבר', 'מתחילים', '!', 'הישארו', 'מעודכנים', ',', 'הצטרפו', 'לווטסאפ', 'של', 'עיריית', 'נס', 'ציונה', '-', 'https', ':', '//nzc.toshavil.co.il/enter_free_group', '[', '18:27', ',', '29/05/2024', ']', '+972', '53-235-5578', ':', '``', 'עושים', 'כסף', "''", '-', 'יריד', 'התעסוקה', 'לנוער', 'חוזר', 'ובענק', '🥳', 'לקראת', 'חופשת', 'הקיץ', ',', 'אנחנו', 'מביאים', 'אליכם', 'מאות', 'משרות', 'שוות', 'לבני', 'ובנות', 'הנוער', 'בנס', 'ציונה', '!', '👈', 'אז', 'אם', 'אתם', 'מעל', 'גיל', '16', '👈', 'מסיימים', 'את', 'שנה', "''", 'ל', 'בקרוב', ',', 'ומחפשים', '

**After Proccessing**

In [49]:




# Number of words
total_words_after = 0
for post in posts:
  total_words_after += len(post.get_text().split())

# Average number of words per post
average_words_per_post_after = total_words_after / len(posts)

# 5 most frequent words
all_words_after = ' '.join([post.get_text() for post in posts]).split()
word_counts_after = Counter(all_words_after)
most_frequent_words_after = word_counts_after.most_common(5)

# Number of words that only appear once
unique_words_after = set(all_words_after)
once_words_after = [word for word in unique_words_after if word_counts_after[word] == 1]


# **Comparisons**

print("Comparisons of Word Statistics Before and After Processing:")

print("- Total Words:")

print("  - After Processing:", total_words_after)

print("\n- Average Words per Post:")

print("  - After Processing:", average_words_per_post_after)

print("\n- Most Frequent Words:")

print("  - After Processing:", most_frequent_words_after)

print("\n- Words Appearing Once:")

print("  - After Processing:", len(once_words_after))


Comparisons of Word Statistics Before and After Processing:
- Total Words:
  - After Processing: 770

- Average Words per Post:
  - After Processing: 40.526315789473685

- Most Frequent Words:
  - After Processing: [('to', 24), ('the', 23), ('a', 19), ('of', 17), ('in', 16)]

- Words Appearing Once:
  - After Processing: 402
