<a href="https://colab.research.google.com/github/DeanAvram/Text-Processing/blob/main/Text_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Loading & Basic Analysis

In [28]:
import pandas as pd
import numpy as np
import string
import re

import nltk
from nltk.corpus import stopwords
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

import time



sms = pd.read_csv("/content/spam.csv", encoding='latin-1')
sms.dropna(how="any", inplace=True, axis=1)
sms.columns = ['label', 'message']
sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [29]:
!python -m nltk.downloader stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
def clean(text):
  sms = re.sub('[^a-zA-Z]', ' ', text) #Replacing all non-alphabetic characters with a space
  sms = sms.lower() #converting to lowecase
  #sms = sms.split()
  #sms = ' '.join(sms)
  return sms
sms['message'] = sms['message'].apply(clean)

In [31]:
def remove_stopwords(text):
  stop_words = nltk.corpus.stopwords.words('english')
  #str_text = [str(word) for word in text]
  #filtered_text = list(filter(lambda word: word not in stop_words, str_text))
  filtered_text = list(filter(lambda x: (not isinstance(x, spacy.tokens.token.Token) and x not in stop_words) or (isinstance(x, spacy.tokens.token.Token) and x.text not in stop_words), text))
  #print('\n')
  return filtered_text

In [32]:
def most_frequent_words(df, col_name):
  all_text = ' '.join(df[col_name].astype(str).tolist())
  words = re.findall(r'\b\w+\b', all_text.lower())
  words_series = pd.Series(words)
  word_counts = words_series.value_counts()
  return word_counts

In [33]:
def print_statistics_on_df(df, col_name):
  total_sms = df.shape[0]
  ham_count = df['label'].value_counts()['ham']
  spam_count = df['label'].value_counts()['spam']
  num_words = df[col_name].apply(lambda x: len(x) if isinstance(x, (list, spacy.tokens.doc.Doc)) else len(x.split()))
  frequent_words = most_frequent_words(df, col_name)
  unique_words = frequent_words[frequent_words == 1].count()

  print(f"Total number of messages: {total_sms}")
  print(f"Total number of HAM: {ham_count}")
  print(f"Total number of SPAM: {spam_count}")
  print(f"Average number of words per message: {np.mean(num_words)}")
  print(f"Most frequent words:\n{frequent_words.head(5)}")
  print(f"Number of words that only appear once: {unique_words}")

In [34]:
from collections import Counter


def print_statistics_on_text(text: list):
  total_words = len(text)
  word_counts = Counter(text)
  most_frequent_words = word_counts.most_common(5)


  print(f"Total number of words: {total_words}")
  print("Most frequent words:")
  for word, count in most_frequent_words:
    print(f"{word}: {count}")

In [35]:
print_statistics_on_df(sms, 'message')

Total number of messages: 5572
Total number of HAM: 4825
Total number of SPAM: 747
Average number of words per message: 15.588478104809763
Most frequent words:
i      3018
you    2243
to     2242
a      1451
the    1332
Name: count, dtype: int64
Number of words that only appear once: 3801


# Text Processing

## Tokenize

### NLTK

In [36]:
nltk.download('punkt')

start_time = time.time()
sms['nltk_tokenize_message'] = sms['message'].apply(lambda x: remove_stopwords(nltk.word_tokenize(x)))
nltk_tokenize_time = time.time() - start_time
sms

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,label,message,nltk_tokenize_message
0,ham,go until jurong point crazy available only ...,"[go, jurong, point, crazy, available, bugis, n..."
1,ham,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,free entry in a wkly comp to win fa cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,u dun say so early hor u c already then say,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,nah i don t think he goes to usf he lives aro...,"[nah, think, goes, usf, lives, around, though]"
...,...,...,...
5567,spam,this is the nd time we have tried contact u...,"[nd, time, tried, contact, u, u, pound, prize,..."
5568,ham,will b going to esplanade fr home,"[b, going, esplanade, fr, home]"
5569,ham,pity was in mood for that so any other s...,"[pity, mood, suggestions]"
5570,ham,the guy did some bitching but i acted like i d...,"[guy, bitching, acted, like, interested, buyin..."


### SpaCy

In [37]:
nlp = English()

tokenizer = Tokenizer(nlp.vocab)
start_time = time.time()
sms['sapcy_tokenize_message'] = sms['message'].apply(lambda x: remove_stopwords(tokenizer(x)))
spacy_tokenize_time = time.time() - start_time
sms

Unnamed: 0,label,message,nltk_tokenize_message,sapcy_tokenize_message
0,ham,go until jurong point crazy available only ...,"[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, , crazy, , available, b..."
1,ham,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, , joking, wif, u, oni, ]"
2,spam,free entry in a wkly comp to win fa cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,...","[free, entry, , wkly, comp, win, fa, cup, fi..."
3,ham,u dun say so early hor u c already then say,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, early, hor, , u, c, already, ..."
4,ham,nah i don t think he goes to usf he lives aro...,"[nah, think, goes, usf, lives, around, though]","[nah, think, goes, usf, , lives, around, though]"
...,...,...,...,...
5567,spam,this is the nd time we have tried contact u...,"[nd, time, tried, contact, u, u, pound, prize,...","[ , nd, time, tried, , contact, u, , u, ..."
5568,ham,will b going to esplanade fr home,"[b, going, esplanade, fr, home]","[ , b, going, esplanade, fr, home]"
5569,ham,pity was in mood for that so any other s...,"[pity, mood, suggestions]","[pity, , mood, , , suggestions]"
5570,ham,the guy did some bitching but i acted like i d...,"[guy, bitching, acted, like, interested, buyin...","[guy, bitching, acted, like, interested, buyin..."


### Tokenization Statistics

In [45]:
print("nltk statistics")
print_statistics_on_df(sms, 'nltk_tokenize_message')
print(f"NLTK Processing Time: {nltk_tokenize_time}")
print("\n\nspacy statistics")
print_statistics_on_df(sms, 'sapcy_tokenize_message')
print(f"SpaCy Processing Time: {spacy_tokenize_time}")

nltk statistics
Total number of messages: 5572
Total number of HAM: 4825
Total number of SPAM: 747
Average number of words per message: 8.979720028715004
Most frequent words:
u       1212
call     606
get      397
ur       385
gt       318
Name: count, dtype: int64
Number of words that only appear once: 3792
NLTK Processing Time: 1.759016513824463


spacy statistics
Total number of messages: 5572
Total number of HAM: 4825
Total number of SPAM: 747
Average number of words per message: 11.60480976310122
Most frequent words:
u       1212
call     606
get      397
ur       385
gt       318
Name: count, dtype: int64
Number of words that only appear once: 3793
SpaCy Processing Time: 2.0703468322753906


### Tokenization Comparison

We can see that the words statistics are almost the same. It says that the tokenization process tokenizes the text almost to the same tokens in both techniques.
The NLTK tokenization splits the text into a list of tokens, in contrast to the SpaCy process that produces elements of Token objects.
The processing time is almost the same.

## Lemmatize

### NLTK

In [39]:
nltk.download('wordnet')
lemmatizer = nltk.stem.WordNetLemmatizer()
start_time = time.time()
sms['nltk_lemmatize_message'] = sms['nltk_tokenize_message'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
nltk_lemmatize_time = time.time() - start_time
sms

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,label,message,nltk_tokenize_message,sapcy_tokenize_message,nltk_lemmatize_message
0,ham,go until jurong point crazy available only ...,"[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, , crazy, , available, b...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, , joking, wif, u, oni, ]","[ok, lar, joking, wif, u, oni]"
2,spam,free entry in a wkly comp to win fa cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,...","[free, entry, , wkly, comp, win, fa, cup, fi...","[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,u dun say so early hor u c already then say,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, early, hor, , u, c, already, ...","[u, dun, say, early, hor, u, c, already, say]"
4,ham,nah i don t think he goes to usf he lives aro...,"[nah, think, goes, usf, lives, around, though]","[nah, think, goes, usf, , lives, around, though]","[nah, think, go, usf, life, around, though]"
...,...,...,...,...,...
5567,spam,this is the nd time we have tried contact u...,"[nd, time, tried, contact, u, u, pound, prize,...","[ , nd, time, tried, , contact, u, , u, ...","[nd, time, tried, contact, u, u, pound, prize,..."
5568,ham,will b going to esplanade fr home,"[b, going, esplanade, fr, home]","[ , b, going, esplanade, fr, home]","[b, going, esplanade, fr, home]"
5569,ham,pity was in mood for that so any other s...,"[pity, mood, suggestions]","[pity, , mood, , , suggestions]","[pity, mood, suggestion]"
5570,ham,the guy did some bitching but i acted like i d...,"[guy, bitching, acted, like, interested, buyin...","[guy, bitching, acted, like, interested, buyin...","[guy, bitching, acted, like, interested, buyin..."


### SpaCy

In [40]:
nlp = spacy.load('en_core_web_sm')
start_time = time.time()
sms['spacy_lemmatize_message'] = sms['sapcy_tokenize_message'].apply(lambda x: ' '.join(s.text for s in x))
sms['spacy_lemmatize_message'] = sms['spacy_lemmatize_message'].apply(lambda x: ' '.join([token.lemma_ for token in nlp((x))]).split())
spacy_lemmatize_time = time.time() - start_time
sms

Unnamed: 0,label,message,nltk_tokenize_message,sapcy_tokenize_message,nltk_lemmatize_message,spacy_lemmatize_message
0,ham,go until jurong point crazy available only ...,"[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, , crazy, , available, b...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, , joking, wif, u, oni, ]","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,free entry in a wkly comp to win fa cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,...","[free, entry, , wkly, comp, win, fa, cup, fi...","[free, entry, wkly, comp, win, fa, cup, final,...","[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,u dun say so early hor u c already then say,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, early, hor, , u, c, already, ...","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,nah i don t think he goes to usf he lives aro...,"[nah, think, goes, usf, lives, around, though]","[nah, think, goes, usf, , lives, around, though]","[nah, think, go, usf, life, around, though]","[nah, think, go, usf, live, around, though]"
...,...,...,...,...,...,...
5567,spam,this is the nd time we have tried contact u...,"[nd, time, tried, contact, u, u, pound, prize,...","[ , nd, time, tried, , contact, u, , u, ...","[nd, time, tried, contact, u, u, pound, prize,...","[nd, time, try, contact, u, u, pound, prize, c..."
5568,ham,will b going to esplanade fr home,"[b, going, esplanade, fr, home]","[ , b, going, esplanade, fr, home]","[b, going, esplanade, fr, home]","[b, go, esplanade, fr, home]"
5569,ham,pity was in mood for that so any other s...,"[pity, mood, suggestions]","[pity, , mood, , , suggestions]","[pity, mood, suggestion]","[pity, mood, suggestion]"
5570,ham,the guy did some bitching but i acted like i d...,"[guy, bitching, acted, like, interested, buyin...","[guy, bitching, acted, like, interested, buyin...","[guy, bitching, acted, like, interested, buyin...","[guy, bitching, act, like, interested, buy, so..."


### Lemmatization Statistics

In [42]:
print("nltk statistics")
print_statistics_on_df(sms, 'nltk_lemmatize_message')
print(f"NLTK Processing Time: {nltk_lemmatize_time}")
print("\n\nspacy statistics")
print_statistics_on_df(sms, 'spacy_lemmatize_message')
print(f"SpaCy Processing Time: {spacy_lemmatize_time}")

nltk statistics
Total number of messages: 5572
Total number of HAM: 4825
Total number of SPAM: 747
Average number of words per message: 8.979720028715004
Most frequent words:
u       1279
call     639
get      408
ur       385
gt       318
Name: count, dtype: int64
Number of words that only appear once: 3493
NLTK Processing Time: 0.35895824432373047


spacy statistics
Total number of messages: 5572
Total number of HAM: 4825
Total number of SPAM: 747
Average number of words per message: 9.069992821249103
Most frequent words:
u       1212
get      704
call     692
go       606
ur       385
Name: count, dtype: int64
Number of words that only appear once: 3278
SpaCy Processing Time: 64.40595936775208


### Lemmatization Comparison

After the lemmatization, we can see that the word statistics is a bit different between the two methods. With NLTK, there are fewer tokens, which means that NLTK refers to more words as the same lemma. Also, the frequent words appear in different amounts in each technique. That means that the two techniques refer to tokens a bit differently.
The processing time is much better with NLTK, as processing with SpaCy took significantly more time.

## Stem

### NLTK

In [43]:
nltk.download('punkt')
stemmer = nltk.PorterStemmer()
start_time = time.time()
sms['nltk_stem_message'] = sms['nltk_tokenize_message'].apply(lambda x: [stemmer.stem(word) for word in x])
nltk_stem_time = time.time() - start_time
sms

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,label,message,nltk_tokenize_message,sapcy_tokenize_message,nltk_lemmatize_message,spacy_lemmatize_message,nltk_stem_message
0,ham,go until jurong point crazy available only ...,"[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, , crazy, , available, b...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazi, avail, bugi, n, gre..."
1,ham,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, , joking, wif, u, oni, ]","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,free entry in a wkly comp to win fa cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,...","[free, entry, , wkly, comp, win, fa, cup, fi...","[free, entry, wkly, comp, win, fa, cup, final,...","[free, entry, wkly, comp, win, fa, cup, final,...","[free, entri, wkli, comp, win, fa, cup, final,..."
3,ham,u dun say so early hor u c already then say,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, early, hor, , u, c, already, ...","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,nah i don t think he goes to usf he lives aro...,"[nah, think, goes, usf, lives, around, though]","[nah, think, goes, usf, , lives, around, though]","[nah, think, go, usf, life, around, though]","[nah, think, go, usf, live, around, though]","[nah, think, goe, usf, live, around, though]"
...,...,...,...,...,...,...,...
5567,spam,this is the nd time we have tried contact u...,"[nd, time, tried, contact, u, u, pound, prize,...","[ , nd, time, tried, , contact, u, , u, ...","[nd, time, tried, contact, u, u, pound, prize,...","[nd, time, try, contact, u, u, pound, prize, c...","[nd, time, tri, contact, u, u, pound, prize, c..."
5568,ham,will b going to esplanade fr home,"[b, going, esplanade, fr, home]","[ , b, going, esplanade, fr, home]","[b, going, esplanade, fr, home]","[b, go, esplanade, fr, home]","[b, go, esplanad, fr, home]"
5569,ham,pity was in mood for that so any other s...,"[pity, mood, suggestions]","[pity, , mood, , , suggestions]","[pity, mood, suggestion]","[pity, mood, suggestion]","[piti, mood, suggest]"
5570,ham,the guy did some bitching but i acted like i d...,"[guy, bitching, acted, like, interested, buyin...","[guy, bitching, acted, like, interested, buyin...","[guy, bitching, acted, like, interested, buyin...","[guy, bitching, act, like, interested, buy, so...","[guy, bitch, act, like, interest, buy, someth,..."


### SpaCy

In [25]:
#There is no Stemming library in SpaCy

### Stem Statistics

In [44]:
print("nltk statistics")
print_statistics_on_df(sms, 'nltk_stem_message')
print(f"NLTK Processing Time: {nltk_stem_time}")
#print("\n\nspacy statistics")
#print_statistics_on_df(sms, 'spacy_stem_message')

nltk statistics
Total number of messages: 5572
Total number of HAM: 4825
Total number of SPAM: 747
Average number of words per message: 8.979720028715004
Most frequent words:
u       1212
call     693
go       459
get      458
ur       385
Name: count, dtype: int64
Number of words that only appear once: 2984
NLTK Processing Time: 1.7830605506896973


### Stemming Comparison

There is only one way to stem the text. Only with NLTK.
We can see that stemming refers to a token different from lemmatization.
We can see that in the most frequent words, the order of them and the number of their appearance is different after the stemming compared to after lemmatization.

# Web Scraping

In [46]:
import requests
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/wiki/Neuro-linguistic_programming'

response = requests.get(url)
soup_text = ''
if response.status_code == 200:
    # Parse the content of the request with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract all text from paragraph tags
    paragraphs = soup.find_all('p')
    for para in paragraphs:
        soup_text += para.get_text()
        #print(para.get_text())
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

soup_text

'\nNeuro-linguistic programming (NLP) is a pseudoscientific approach to communication, personal development and psychotherapy, that first appeared in Richard Bandler and John Grinder\'s 1975 book The Structure of Magic I. NLP asserts that there is a connection between neurological processes, language and acquired behavioral patterns, and that these can be changed to achieve specific goals in life.[1][2] According to Bandler and Grinder, NLP can treat problems such as phobias, depression, tic disorders, psychosomatic illnesses, near-sightedness,[a] allergy, the common cold,[a] and learning disorders,[3][4] often in a single session. They also say that NLP can model the skills of exceptional people, allowing anyone to acquire them.[5][b]\nNLP has been adopted by some hypnotherapists as well as by companies that run seminars marketed as leadership training to businesses and government agencies.[6][7]\nThere is no scientific evidence supporting the claims made by NLP advocates, and it has 

## Tokenize

In [None]:
nltk.download('punkt')

tokens = nltk.word_tokenize(soup_text)
stopwords = nltk.corpus.stopwords.words('english')
filtered_tokens = [token.lower() for token in tokens if token.lower() not in stopwords and token.isalpha()]
filtered_tokens

## Lemmatize

In [None]:
nltk.download('wordnet')
lemmatizer = nltk.stem.WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
lemmatized_tokens

## Stem

In [49]:
stemmer = nltk.PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
stemmed_tokens

['program',
 'nlp',
 'pseudoscientif',
 'approach',
 'commun',
 'person',
 'develop',
 'psychotherapi',
 'first',
 'appear',
 'richard',
 'bandler',
 'john',
 'grinder',
 'book',
 'structur',
 'magic',
 'nlp',
 'assert',
 'connect',
 'neurolog',
 'process',
 'languag',
 'acquir',
 'behavior',
 'pattern',
 'chang',
 'achiev',
 'specif',
 'goal',
 'life',
 'accord',
 'bandler',
 'grinder',
 'nlp',
 'treat',
 'problem',
 'phobia',
 'depress',
 'tic',
 'disord',
 'psychosomat',
 'ill',
 'allergi',
 'common',
 'cold',
 'learn',
 'disord',
 'often',
 'singl',
 'session',
 'also',
 'say',
 'nlp',
 'model',
 'skill',
 'except',
 'peopl',
 'allow',
 'anyon',
 'acquir',
 'b',
 'nlp',
 'adopt',
 'hypnotherapist',
 'well',
 'compani',
 'run',
 'seminar',
 'market',
 'leadership',
 'train',
 'busi',
 'govern',
 'agenc',
 'scientif',
 'evid',
 'support',
 'claim',
 'made',
 'nlp',
 'advoc',
 'call',
 'pseudosci',
 'scientif',
 'review',
 'shown',
 'nlp',
 'base',
 'outdat',
 'metaphor',
 'brain',
 '

In [50]:
print('Statistics before text processing\n')
print_statistics_on_text(soup_text.split())
print('\nStatistics after text processing')
print('\nStatistics after tokenize\n')
print_statistics_on_text(filtered_tokens)
print('\nStatistics after lemmatize\n')
print_statistics_on_text(lemmatized_tokens)
print('\nStatistics after stem\n')
print_statistics_on_text(stemmed_tokens)

Statistics before text processing

Total number of words: 3567
Most frequent words:
and: 156
the: 139
of: 122
to: 108
a: 84

Statistics after text processing

Statistics after tokenize

Total number of words: 2008
Most frequent words:
nlp: 97
bandler: 40
grinder: 36
scientific: 13
client: 13

Statistics after lemmatize

Total number of words: 2008
Most frequent words:
nlp: 97
bandler: 40
grinder: 36
scientific: 13
client: 13

Statistics after stem

Total number of words: 2008
Most frequent words:
nlp: 97
bandler: 40
grinder: 36
use: 15
model: 13


# WhatsApp Analysis

## Read and arrange WhatsApp txt file

In [78]:
import re

with open("/content/_chat.txt", encoding="utf-8") as f:
  lines = f.readlines()

str_text = ""
for line in lines:
  # Extract the text after the name and colon
  text = line[23:] #removing datetime
  text = re.sub(r'^.*?:', '', text)
  if re.search(r'[a-zA-Z]', text):
    continue
  text = str(text).strip()
  str_text += text

print(str_text)


מה נשמע דין שמי אלחנן עזריאל מה נשמע הבנתי שנתאימו בנינו לחונכותאהלן מה קורה אחי? כן ראיתיאתה מתחיל את הקורס כאיךו הסמסטר?הבנתי אותךךאתה רוצה להתחיל את הסמסטר ואז נקבע? רוצה לקבוע מעכשיו? איך שנוח לךמשנה לך אם זום או פרונטלי?אין לי בעיה לנסות בזום,פשוט מקוןה שזה לא יקשה עם המחשב מה שאתה חושב שנכוןסבבה אז ננסה בזום ונראה איך הולךרביעי וחמישי אני יןתר פנוי יחסיתסגור האמתי יכול ליהות האמתי נוח בחמישי אחהצחמישי ב15 מתאים?אמממ האמתי שיכול ליהות נחמד קצת יותר מאוחר 16:00 אוליכילו אני מסיים פיזיקה בשתיים ואני אוהס לשבת אחרי לסכם מחדש תחומר לעבור עליוובבקרים אתה יכול? רביעי או חמישי?האמתי כן אני יכול חמישירביעי עמוס קצתאז חמישי ? מתי נוח?האמתי שניראה לי הכי נכון, 8 או 9 אם זה בזום כי ב12 אני צריך ליהות כבר במכללה להרצאה בפיזיקהסבבה לי מתאיםך תמיד אפשר לשנותכן אני כרגע פשוט מעדיף לדעת בשביל לסדר את זה עם התוכנית שיקום הטיפולים והלימודיםותודה לךלי יש גם שיעור ב12, מאמין שלפחות בחלק מהפעמים אהיה במכללהיאלה מעולה אז אפשר לפעמיים גם לתאם שםסגוראתה רוצה להתחיל מעוד שבועיים? או מהשבוע?מתי ההרצאות שלך

## Tokenize

In [79]:
from spacy.lang.he import Hebrew
nlp = Hebrew()
tokenizer = Tokenizer(nlp.vocab)
tokens = tokenizer(str_text)
print(list(tokens))

[מה, נשמע, דין, שמי, אלחנן, עזריאל, מה, נשמע, הבנתי, שנתאימו, בנינו, לחונכותאהלן, מה, קורה, אחי?, כן, ראיתיאתה, מתחיל, את, הקורס, כאיךו, הסמסטר?הבנתי, אותךךאתה, רוצה, להתחיל, את, הסמסטר, ואז, נקבע?, רוצה, לקבוע, מעכשיו?, איך, שנוח, לךמשנה, לך, אם, זום, או, פרונטלי?אין, לי, בעיה, לנסות, בזום,פשוט, מקוןה, שזה, לא, יקשה, עם, המחשב, מה, שאתה, חושב, שנכוןסבבה, אז, ננסה, בזום, ונראה, איך, הולךרביעי, וחמישי, אני, יןתר, פנוי, יחסיתסגור, האמתי, יכול, ליהות, האמתי, נוח, בחמישי, אחהצחמישי, ב15, מתאים?אמממ, האמתי, שיכול, ליהות, נחמד, קצת, יותר, מאוחר, 16:00, אוליכילו, אני, מסיים, פיזיקה, בשתיים, ואני, אוהס, לשבת, אחרי, לסכם, מחדש, תחומר, לעבור, עליוובבקרים, אתה, יכול?, רביעי, או, חמישי?האמתי, כן, אני, יכול, חמישירביעי, עמוס, קצתאז, חמישי, ?, מתי, נוח?האמתי, שניראה, לי, הכי, נכון,, 8, או, 9, אם, זה, בזום, כי, ב12, אני, צריך, ליהות, כבר, במכללה, להרצאה, בפיזיקהסבבה, לי, מתאיםך, תמיד, אפשר, לשנותכן, אני, כרגע, פשוט, מעדיף, לדעת, בשביל, לסדר, את, זה, עם, התוכנית, שיקום, הטיפולים, והלימודיםותודה, לךלי,

## Lemmatize

In [80]:
nltk.download('wordnet')
lemmatizer = nltk.stem.WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token.text) for token in tokens]
print(lemmatized_tokens)

['מה', 'נשמע', 'דין', 'שמי', 'אלחנן', 'עזריאל', 'מה', 'נשמע', 'הבנתי', 'שנתאימו', 'בנינו', 'לחונכותאהלן', 'מה', 'קורה', 'אחי?', 'כן', 'ראיתיאתה', 'מתחיל', 'את', 'הקורס', 'כאיךו', 'הסמסטר?הבנתי', 'אותךךאתה', 'רוצה', 'להתחיל', 'את', 'הסמסטר', 'ואז', 'נקבע?', 'רוצה', 'לקבוע', 'מעכשיו?', 'איך', 'שנוח', 'לךמשנה', 'לך', 'אם', 'זום', 'או', 'פרונטלי?אין', 'לי', 'בעיה', 'לנסות', 'בזום,פשוט', 'מקוןה', 'שזה', 'לא', 'יקשה', 'עם', 'המחשב', 'מה', 'שאתה', 'חושב', 'שנכוןסבבה', 'אז', 'ננסה', 'בזום', 'ונראה', 'איך', 'הולךרביעי', 'וחמישי', 'אני', 'יןתר', 'פנוי', 'יחסיתסגור', 'האמתי', 'יכול', 'ליהות', 'האמתי', 'נוח', 'בחמישי', 'אחהצחמישי', 'ב15', 'מתאים?אמממ', 'האמתי', 'שיכול', 'ליהות', 'נחמד', 'קצת', 'יותר', 'מאוחר', '16:00', 'אוליכילו', 'אני', 'מסיים', 'פיזיקה', 'בשתיים', 'ואני', 'אוהס', 'לשבת', 'אחרי', 'לסכם', 'מחדש', 'תחומר', 'לעבור', 'עליוובבקרים', 'אתה', 'יכול?', 'רביעי', 'או', 'חמישי?האמתי', 'כן', 'אני', 'יכול', 'חמישירביעי', 'עמוס', 'קצתאז', 'חמישי', '?', 'מתי', 'נוח?האמתי', 'שניראה', 'לי', 'הכי',

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Stem

In [81]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()
stemmed_tokens = [ps.stem(token.text) for token in tokens]
print(stemmed_tokens)

['מה', 'נשמע', 'דין', 'שמי', 'אלחנן', 'עזריאל', 'מה', 'נשמע', 'הבנתי', 'שנתאימו', 'בנינו', 'לחונכותאהלן', 'מה', 'קורה', 'אחי?', 'כן', 'ראיתיאתה', 'מתחיל', 'את', 'הקורס', 'כאיךו', 'הסמסטר?הבנתי', 'אותךךאתה', 'רוצה', 'להתחיל', 'את', 'הסמסטר', 'ואז', 'נקבע?', 'רוצה', 'לקבוע', 'מעכשיו?', 'איך', 'שנוח', 'לךמשנה', 'לך', 'אם', 'זום', 'או', 'פרונטלי?אין', 'לי', 'בעיה', 'לנסות', 'בזום,פשוט', 'מקוןה', 'שזה', 'לא', 'יקשה', 'עם', 'המחשב', 'מה', 'שאתה', 'חושב', 'שנכוןסבבה', 'אז', 'ננסה', 'בזום', 'ונראה', 'איך', 'הולךרביעי', 'וחמישי', 'אני', 'יןתר', 'פנוי', 'יחסיתסגור', 'האמתי', 'יכול', 'ליהות', 'האמתי', 'נוח', 'בחמישי', 'אחהצחמישי', 'ב15', 'מתאים?אמממ', 'האמתי', 'שיכול', 'ליהות', 'נחמד', 'קצת', 'יותר', 'מאוחר', '16:00', 'אוליכילו', 'אני', 'מסיים', 'פיזיקה', 'בשתיים', 'ואני', 'אוהס', 'לשבת', 'אחרי', 'לסכם', 'מחדש', 'תחומר', 'לעבור', 'עליוובבקרים', 'אתה', 'יכול?', 'רביעי', 'או', 'חמישי?האמתי', 'כן', 'אני', 'יכול', 'חמישירביעי', 'עמוס', 'קצתאז', 'חמישי', '?', 'מתי', 'נוח?האמתי', 'שניראה', 'לי', 'הכי',

In [None]:
print('Statistics before text processing\n')
print_statistics_on_text(str_text.split())
print('\nStatistics after text processing')
print('\nStatistics after tokenize\n')
print_statistics_on_text(tokens)
print('\nStatistics after lemmatize\n')
print_statistics_on_text(lemmatized_tokens)
print('\nStatistics after stem\n')
print_statistics_on_text(stemmed_tokens)