<a href="https://colab.research.google.com/github/DeanAvram/Text-Processing/blob/main/Text_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Loading & Basic Analysis

In [None]:
import pandas as pd
import numpy as np
import string
import re

import nltk
from nltk.corpus import stopwords
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

import time



# Read the SMS spam dataset (the CSV is Latin-1 encoded) and drop every
# column that contains at least one missing value.
sms = pd.read_csv("/content/spam.csv", encoding='latin-1').dropna(axis=1, how="any")
# Rename the remaining columns; this assignment requires exactly two of them.
sms.columns = ['label', 'message']
sms.head()

In [29]:
!python -m nltk.downloader stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
def clean(text):
  """Return *text* lower-cased with every non-letter replaced by a space.

  Only the ASCII letters a-z / A-Z are kept; each other character
  (digits, punctuation, whitespace, ...) becomes a single space.
  """
  letters_only = re.sub('[^a-zA-Z]', ' ', text)
  return letters_only.lower()
# Overwrite the 'message' column with the output of clean() for each row.
sms['message'] = sms['message'].apply(clean)

In [31]:
_STOP_WORDS = None  # lazily-filled cache of the NLTK English stop words


def _english_stop_words():
  """Return the NLTK English stop words as a frozenset, loading them once."""
  global _STOP_WORDS
  if _STOP_WORDS is None:
    _STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))
  return _STOP_WORDS


def remove_stopwords(text):
  """Return the items of *text* that are not English stop words.

  Args:
    text: Iterable of tokens. Items may be plain strings (e.g. from
      nltk.word_tokenize) or spaCy Token objects; Token objects are
      compared through their ``.text`` attribute.

  Returns:
    A list holding the surviving items, in their original order and with
    their original type (strings stay strings, Tokens stay Tokens).
  """
  # This function is applied once per row, so the stop-word list is cached
  # as a set instead of being reloaded and scanned linearly on every call.
  stop_words = _english_stop_words()
  return [
      item for item in text
      if (item.text if isinstance(item, spacy.tokens.token.Token) else item) not in stop_words
  ]

In [32]:
def most_frequent_words(df, col_name):
  """Count how often each word occurs in a DataFrame column.

  Every cell of ``df[col_name]`` is converted to its string form, the
  strings are joined with spaces and lower-cased, and each ``\\w+`` run
  is treated as one word.

  Returns:
    A pandas Series indexed by word holding the occurrence counts, as
    produced by ``Series.value_counts()`` (most frequent first).
  """
  cells_as_text = df[col_name].astype(str).tolist()
  lowered_corpus = ' '.join(cells_as_text).lower()
  return pd.Series(re.findall(r'\b\w+\b', lowered_corpus)).value_counts()

In [33]:
def print_statistics_on_df(df, col_name):
  """Print summary statistics for one text column of a labelled DataFrame.

  Args:
    df: DataFrame with a 'label' column (counted for the values 'ham'
      and 'spam') and the column named by *col_name*.
    col_name: Column to analyse. A cell may be a list or spaCy Doc (its
      length is the word count) or a string (split on whitespace).

  Prints the message total, the ham / spam counts, the mean number of
  words per message, the five most frequent words and how many words
  occur exactly once.
  """
  total_sms = df.shape[0]
  # Count the labels once; .get() reports 0 instead of raising KeyError
  # when a label does not occur in df (e.g. on a filtered subset).
  label_counts = df['label'].value_counts()
  ham_count = label_counts.get('ham', 0)
  spam_count = label_counts.get('spam', 0)
  num_words = df[col_name].apply(lambda x: len(x) if isinstance(x, (list, spacy.tokens.doc.Doc)) else len(x.split()))
  frequent_words = most_frequent_words(df, col_name)
  # Number of distinct words whose total count is exactly one.
  words_seen_once = frequent_words[frequent_words == 1].count()

  print(f"Total number of messages: {total_sms}")
  print(f"Total number of HAM: {ham_count}")
  print(f"Total number of SPAM: {spam_count}")
  print(f"Average number of words per message: {np.mean(num_words)}")
  print(f"Most frequent words:\n{frequent_words.head(5)}")
  print(f"Number of words that only appear once: {words_seen_once}")

In [34]:
from collections import Counter


def print_statistics_on_text(text: list):
  """Print the size of a token list and its five most common items.

  Args:
    text: Sequence of hashable tokens; identical items are counted
      together using collections.Counter.
  """
  word_total = len(text)
  top_five = Counter(text).most_common(5)

  print(f"Total number of words: {word_total}")
  print("Most frequent words:")
  for token, occurrences in top_five:
    print(f"{token}: {occurrences}")

In [None]:
# Baseline statistics on the cleaned messages, before any tokenization.
print_statistics_on_df(sms, 'message')

# Text Processing

## Tokenize

### NLTK

In [None]:
# Download the 'punkt' resource before calling nltk.word_tokenize.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed version.
nltk.download('punkt')

# Time tokenization plus stop-word removal over the whole 'message' column;
# the measured interval therefore includes remove_stopwords().
start_time = time.time()
sms['nltk_tokenize_message'] = sms['message'].apply(lambda x: remove_stopwords(nltk.word_tokenize(x)))
nltk_tokenize_time = time.time() - start_time
sms

### SpaCy

In [None]:
# Blank English pipeline; only its vocab is used to build the tokenizer.
nlp = English()

# Tokenizer constructed from the vocab alone (no extra rules are passed in).
tokenizer = Tokenizer(nlp.vocab)
# Time tokenization plus stop-word removal, mirroring the NLTK cell.
start_time = time.time()
# The column name is spelled 'sapcy' here and in the cells that read it, so
# the spelling must stay consistent across the notebook.
sms['sapcy_tokenize_message'] = sms['message'].apply(lambda x: remove_stopwords(tokenizer(x)))
spacy_tokenize_time = time.time() - start_time
sms

### Tokenization Statistics

In [None]:
# Report the same statistics, plus the measured wall-clock time, for the
# NLTK-tokenized column and then for the SpaCy-tokenized column.
print("nltk statistics")
print_statistics_on_df(sms, 'nltk_tokenize_message')
print(f"NLTK Processing Time: {nltk_tokenize_time}")
print("\n\nspacy statistics")
print_statistics_on_df(sms, 'sapcy_tokenize_message')
print(f"SpaCy Processing Time: {spacy_tokenize_time}")

### Tokenization Comparison

The word statistics are almost the same for both libraries, which means that NLTK and SpaCy split the text into almost the same tokens.
NLTK tokenization produces a list of string tokens, whereas SpaCy produces Token objects.
The processing time is almost the same.

## Lemmatize

### NLTK

In [None]:
# Download the WordNet corpus used by WordNetLemmatizer.
nltk.download('wordnet')
lemmatizer = nltk.stem.WordNetLemmatizer()
# Time lemmatization of the already tokenized, stop-word-free NLTK column.
start_time = time.time()
# lemmatize() is called without a part-of-speech argument, so the
# lemmatizer's default applies to every word.
sms['nltk_lemmatize_message'] = sms['nltk_tokenize_message'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
nltk_lemmatize_time = time.time() - start_time
sms

### SpaCy

In [None]:
# Rebinds `nlp` (previously the blank English pipeline) to the loaded
# 'en_core_web_sm' model.
nlp = spacy.load('en_core_web_sm')
# The timed interval covers both steps below: re-joining the tokens into a
# string and running the loaded pipeline over that string.
start_time = time.time()
# Step 1: turn each row's token list back into one space-separated string.
sms['spacy_lemmatize_message'] = sms['sapcy_tokenize_message'].apply(lambda x: ' '.join(s.text for s in x))
# Step 2: run the pipeline, take each token's lemma_, then join and re-split
# on whitespace so every row ends up as a flat list of lemma strings.
sms['spacy_lemmatize_message'] = sms['spacy_lemmatize_message'].apply(lambda x: ' '.join([token.lemma_ for token in nlp((x))]).split())
spacy_lemmatize_time = time.time() - start_time
sms

### Lemmatization Statistics

In [None]:
# Report the statistics and the measured wall-clock time for the NLTK and
# SpaCy lemmatized columns, in that order.
print("nltk statistics")
print_statistics_on_df(sms, 'nltk_lemmatize_message')
print(f"NLTK Processing Time: {nltk_lemmatize_time}")
print("\n\nspacy statistics")
print_statistics_on_df(sms, 'spacy_lemmatize_message')
print(f"SpaCy Processing Time: {spacy_lemmatize_time}")

### Lemmatization Comparison

After lemmatization, the word statistics differ slightly between the two methods. With NLTK there are fewer tokens, which means that NLTK maps more words to the same lemma. The most frequent words also appear with different counts in each technique, which means that the two techniques treat tokens a bit differently.
NLTK is much faster here: processing with SpaCy took significantly more time.

## Stem

### NLTK

In [None]:
# NOTE(review): this cell only stems the already tokenized column and never
# calls a tokenizer, so this download looks unnecessary here — confirm.
nltk.download('punkt')
stemmer = nltk.PorterStemmer()
# Time Porter stemming of every token in the NLTK-tokenized column.
start_time = time.time()
sms['nltk_stem_message'] = sms['nltk_tokenize_message'].apply(lambda x: [stemmer.stem(word) for word in x])
nltk_stem_time = time.time() - start_time
sms

### SpaCy

In [25]:
# SpaCy does not provide a stemmer, so there is no SpaCy stemming step.

### Stem Statistics

In [None]:
# Report the statistics and timing for the NLTK-stemmed column. The SpaCy
# half is commented out because no 'spacy_stem_message' column is created.
print("nltk statistics")
print_statistics_on_df(sms, 'nltk_stem_message')
print(f"NLTK Processing Time: {nltk_stem_time}")
#print("\n\nspacy statistics")
#print_statistics_on_df(sms, 'spacy_stem_message')

### Stemming Comparison

There is only one way to stem the text here: with NLTK.
Stemming treats tokens differently from lemmatization.
In the most frequent words, both the order and the number of appearances after stemming differ from those after lemmatization.

# Web Scraping

In [None]:
import requests
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/wiki/Neuro-linguistic_programming'

# requests applies no timeout by default, so an unresponsive server would
# hang this cell indefinitely; fail after 30 seconds instead.
response = requests.get(url, timeout=30)
soup_text = ''
if response.status_code == 200:
    # Parse the content of the request with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract all text from paragraph tags and concatenate it in one pass
    # (str.join avoids rebuilding the string for every paragraph).
    paragraphs = soup.find_all('p')
    soup_text = ''.join(para.get_text() for para in paragraphs)
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

soup_text

## Tokenize

In [None]:
nltk.download('punkt')

tokens = nltk.word_tokenize(soup_text)
# Use a dedicated name: rebinding `stopwords` would replace the corpus reader
# imported from nltk.corpus at the top of the notebook. A set gives constant
# time membership tests for the per-token lookup below.
english_stopwords = set(nltk.corpus.stopwords.words('english'))
# Keep purely alphabetic tokens that are not stop words, lower-cased.
filtered_tokens = []
for token in tokens:
    lowered = token.lower()
    if lowered not in english_stopwords and token.isalpha():
        filtered_tokens.append(lowered)
filtered_tokens

## Lemmatize

In [None]:
# Download WordNet, then lemmatize each filtered token of the scraped page.
nltk.download('wordnet')
lemmatizer = nltk.stem.WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))
lemmatized_tokens

## Stem

In [None]:
# Apply the Porter stemmer to each filtered token of the scraped page.
stemmer = nltk.PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
stemmed_tokens

In [None]:
# Compare word statistics of the scraped text at each stage: the raw text
# split on whitespace, then the filtered, lemmatized and stemmed token lists.
print('Statistics before text processing\n')
print_statistics_on_text(soup_text.split())
print('\nStatistics after text processing')
print('\nStatistics after tokenize\n')
print_statistics_on_text(filtered_tokens)
print('\nStatistics after lemmatize\n')
print_statistics_on_text(lemmatized_tokens)
print('\nStatistics after stem\n')
print_statistics_on_text(stemmed_tokens)

# WhatsApp Analysis

## Read and arrange WhatsApp txt file

In [None]:
import re

with open("/content/_chat.txt", encoding="utf-8") as f:
  lines = f.readlines()

messages = []
for line in lines:
  # Drop the fixed-width datetime prefix (first 23 characters), then
  # everything up to and including the first colon (the sender's name).
  text = line[23:]
  text = re.sub(r'^.*?:', '', text)
  # Skip any message that contains Latin letters.
  if re.search(r'[a-zA-Z]', text):
    continue
  text = text.strip()
  if text:
    messages.append(text)

# Join with a space so the last word of one message does not fuse with the
# first word of the next, which would corrupt tokens and word counts.
str_text = ' '.join(messages)

print(str_text)


## Tokenize

In [None]:
from spacy.lang.he import Hebrew
# Rebind `nlp` and `tokenizer` to a blank Hebrew pipeline and a tokenizer
# built from its vocab, then tokenize the collected chat text.
nlp = Hebrew()
tokenizer = Tokenizer(nlp.vocab)
# `tokens` is the object returned by the tokenizer; iterating it yields Token
# objects, which the following cells read through `.text`.
tokens = tokenizer(str_text)
print(list(tokens))

## Lemmatize

In [None]:
nltk.download('wordnet')
# NOTE(review): WordNetLemmatizer is backed by the English WordNet, while the
# chat text kept above excludes Latin letters; presumably most tokens come
# back unchanged — confirm whether a Hebrew lemmatizer is needed instead.
lemmatizer = nltk.stem.WordNetLemmatizer()
# Lemmatize the string form (.text) of every token.
lemmatized_tokens = [lemmatizer.lemmatize(token.text) for token in tokens]
print(lemmatized_tokens)

## Stem

In [None]:
from nltk.stem import PorterStemmer

# NOTE(review): the Porter algorithm was designed for English suffixes;
# presumably it leaves these non-Latin tokens essentially unchanged — confirm.
ps = PorterStemmer()
# Stem the string form (.text) of every token.
stemmed_tokens = [ps.stem(token.text) for token in tokens]
print(stemmed_tokens)

In [None]:
# Compare word statistics of the chat text at each processing stage.
print('Statistics before text processing\n')
print_statistics_on_text(str_text.split())
print('\nStatistics after text processing')
print('\nStatistics after tokenize\n')
# spaCy Token objects hash by (document, position), so counting them directly
# would treat every occurrence as a distinct word with count 1. Count the
# token strings instead.
print_statistics_on_text([token.text for token in tokens])
print('\nStatistics after lemmatize\n')
print_statistics_on_text(lemmatized_tokens)
print('\nStatistics after stem\n')
print_statistics_on_text(stemmed_tokens)