<a href="https://colab.research.google.com/github/Danielberin23/NLP_Exercise2/blob/main/NLP_ex2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup Enviorment

In [2]:
!pip install nltk spacy gensim scikit-learn numpy




# **Imports**

In [15]:
import os
import numpy as np
from google.colab import drive
import pandas as pd
from collections import Counter
import nltk
import spacy
from time import time
from spacy.lang.he import Hebrew
from gensim.models import Word2Vec
from spacy.tokenizer import Tokenizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

**Load Google Drive**


In [4]:
drive.mount('/content/drive')
GOOGLE_DRIVE_PATH = '/content/drive/MyDrive'
print(os.listdir('/content/drive/MyDrive'))

Mounted at /content/drive
['Photos', 'Books', 'Android games', 'תמונה (55).jpg', 'nativ', 'קנבס לגף אוהד.png', 'samples.csv', 'Selected issues in enginnering ethics', 'Colab Notebooks', 'Copy of Warframe Mastery Checklist V2.gsheet', 'Controls and compliance checklist.gdoc', 'Botium Toys: Scope, goals, and risk assessment report.gdoc', 'Cybersecurity incident report network traffic analysis.gdoc', 'Example of a Cybersecurity Incident Report.gdoc', 'spam.csv', 'homework1-2324.pdf', 'WhatsApp Chat with Friends.txt']


# Set Corpus path

In [5]:
file_path = '/content/drive/MyDrive/WhatsApp Chat with Friends.txt'

In [6]:
#load file to text

with open(file_path, 'r') as f:
    text = f.read().replace('\n', ' ')


Applying Tokenization

In [7]:
# Load the Hebrew language model
nlp = Hebrew()

# a. White space tokenizer
white_space_tokenizer = Tokenizer(nlp.vocab, token_match=None)
white_space_tokens = white_space_tokenizer(text)

# Print basic statistics
print("White space tokenizer:")
print("- Number of tokens:", len(white_space_tokens))
print("- Most frequent tokens:", Counter(white_space_tokens).most_common(5))

# b. Regex tokenizer by digits
regex_tokenizer = nltk.RegexpTokenizer(r'\[\d{2}:\d{2},\s\d{2}/\d{2}/\d{4}\]')
regex_tokens = regex_tokenizer.tokenize(text)

# Print basic statistics
print("\nRegex tokenizer:")
print("- Number of tokens:", len(regex_tokens))
print("- Most frequent tokens:", Counter(regex_tokens).most_common(5))

# c. Word tokenizer
word_tokenizer = nltk.word_tokenize(text)

# Print basic statistics
print("\nWord tokenizer:")
print("- Number of tokens:", len(word_tokenizer))
print("- Most frequent tokens:", Counter(word_tokenizer).most_common(5))

# d. Sentence tokenizer
sentence_tokenizer = nltk.sent_tokenize(text)

# Print basic statistics
print("\nSentence tokenizer:")
print("- Number of sentences:", len(sentence_tokenizer))
print("- Most frequent sentences:", Counter(sentence_tokenizer).most_common(5))


White space tokenizer:
- Number of tokens: 13009
- Most frequent tokens: [([17:57,, 1), (28/05/2024], 1), (+972, 1), (53-235-5578:, 1), (מליאת, 1)]

Regex tokenizer:
- Number of tokens: 1166
- Most frequent tokens: [('[18:34, 10/06/2024]', 14), ('[14:05, 18/05/2024]', 12), ('[20:16, 13/06/2024]', 10), ('[14:04, 12/05/2024]', 10), ('[20:18, 13/06/2024]', 9)]

Word tokenizer:
- Number of tokens: 18419
- Most frequent tokens: [(',', 1432), (':', 1259), ('[', 1166), (']', 1166), ('Daniel', 510)]

Sentence tokenizer:
- Number of sentences: 339
- Most frequent sentences: [('ארוחת צהריים תוגש לאחר השעה 13:00 לרשומים ליום מלא.', 3), ('[17:57, 28/05/2024] +972 53-235-5578: מליאת מועצת העיר מן המניין תחל הערב, שלישי ה - 28.5, בשעה 18:00  ותשודר בשידור ישיר בעמוד הפייסבוק ובערוץ היוטיוב העירוני.', 1), ('הצטרפו אלינו:  לצפייה בפייסבוק >> https://bit.ly/3R5hs6x  לצפייה ביוטיוב >> https://bit.ly/4bwYq16  כבר מתחילים!', 1), ('הישארו מעודכנים, הצטרפו לווטסאפ של עיריית נס ציונה - https://nzc.toshavil.c

# Normalize the Corpus Stemming and Lemmatization

In [8]:
# prompt: Apply Normalization:
# a. Stemming
# b. Lemmatization

# a. Stemming
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

stemmed_tokens = [stemmer.stem(token) for token in word_tokenizer]

# Print basic statistics
print("\nStemmed tokens:")
print("- Number of tokens:", len(stemmed_tokens))
print("- Most frequent tokens:", Counter(stemmed_tokens).most_common(5))

# b. Lemmatization
from spacy.lang.he import Hebrew

nlp = Hebrew()

lemmatized_tokens = [token.lemma_ for token in nlp(text)]

# Print basic statistics
print("\nLemmatized tokens:")
print("- Number of tokens:", len(lemmatized_tokens))
print("- Most frequent tokens:", Counter(lemmatized_tokens).most_common(5))



Stemmed tokens:
- Number of tokens: 18419
- Most frequent tokens: [(',', 1432), (':', 1259), ('[', 1166), (']', 1166), ('daniel', 510)]

Lemmatized tokens:
- Number of tokens: 20230
- Most frequent tokens: [('', 20230)]


# Applying feature extraction

In [9]:


# a. BOW
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

bow_matrix = vectorizer.fit_transform(sentence_tokenizer)

# Print the shape of the BOW matrix
print("\nBOW matrix shape:", bow_matrix.shape)

# b. TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

tfidf_matrix = vectorizer.fit_transform(sentence_tokenizer)

# Print the shape of the TF-IDF matrix
print("\nTF-IDF matrix shape:", tfidf_matrix.shape)

# c. Word embedding by WORD2VEC
sentences = [sentence.split() for sentence in sentence_tokenizer]

# Train the Word2Vec model
model = Word2Vec(
    sentences=sentences,      # The corpus to train the model on
    vector_size=100,       # The size of the word vectors to be learned
    window=5,              # The size of the window of words to be considered
    min_count=5,           # The minimum frequency required for a word to be included in the vocabulary
    sg=0,                  # 0 for CBOW, 1 for skip-gram
    negative=5,            # The number of negative samples to use for negative sampling
    ns_exponent=0.75,      # The exponent used to shape the negative sampling distribution
    alpha=0.03,            # The initial learning rate
    min_alpha=0.0007,      # The minimum learning rate to which the learning rate will be linearly reduced
    epochs=30,             # The number of epochs (iterations) over the corpus
    workers=4,             # The number of worker threads to use for training the model
    seed=42,               # The seed for the random number generator
    max_vocab_size=None    # The maximum vocabulary size (None means no limit)
)


# Get the vector representation of a word
vector = model.wv['ערב']

# Find the most similar words to a given word
similar_words = model.wv.most_similar('ערב')

# Print the vector and similar words
print("Vector for 'ערב':", vector)
print("Most similar words to 'ערב':", similar_words)



BOW matrix shape: (339, 3296)

TF-IDF matrix shape: (339, 3296)
Vector for 'ערב': [ 7.60411248e-02 -2.13700846e-01 -7.89538473e-02  3.59334469e-01
  2.09458485e-01 -3.08486857e-02  1.54752672e-01  3.39391947e-01
  7.53021538e-02  3.59124243e-01 -2.40155548e-01  3.36234748e-01
  1.74592212e-01 -2.17623338e-01 -2.99412936e-01 -3.26264620e-01
 -6.87723374e-03 -7.56219476e-02  1.35136796e-02 -4.61147487e-01
  4.51603644e-02  6.58421591e-02  2.86495477e-01  1.70460746e-01
 -1.21021301e-01  4.38376702e-02 -3.61784130e-01  2.73386270e-01
 -2.71246463e-01 -1.91113457e-01  1.65447533e-01  6.04906641e-02
 -1.25873154e-02  2.53482014e-01 -7.00885877e-02 -1.04880385e-01
 -4.51149553e-01  4.28623408e-02  2.17381805e-01  1.01758309e-01
 -2.35917583e-01  1.46730170e-01  1.74670011e-01 -1.96969122e-01
  6.31330907e-02 -4.31575254e-02 -2.56253581e-04  5.39896078e-02
  4.47589040e-01  2.88655698e-01 -1.76522717e-01  1.14535047e-02
  2.31523037e-01  8.38189945e-02 -6.82044327e-02  4.93210927e-02
  9.388

# Applying Glove

*In Natural Language Processing (NLP), GloVe stands for "Global Vectors for Word Representation." It is an unsupervised learning algorithm developed at Stanford University for obtaining vector representations (also known as embeddings) for words.*

In [17]:
# prompt: Apply glove training and lose function to the data show, vector for each word , make your own glove function

# Define the glove function
def glove(text, window_size, vector_size, learning_rate, epochs):
  # Tokenize the text
  tokens = nltk.word_tokenize(text)

  # Create a dictionary of word counts
  word_counts = Counter(tokens)

  # Create a list of unique words
  unique_words = list(word_counts.keys())

  # Create a dictionary of word vectors
  word_vectors = {}

  # Initialize the word vectors randomly
  for word in unique_words:
    word_vectors[word] = np.random.uniform(-1, 1, vector_size)

  # Train the word vectors
  for epoch in range(epochs):
    for i in range(len(tokens)):
      # Get the current word
      word = tokens[i]

      # Get the context words
      context_words = []
      for j in range(max(0, i - window_size), min(len(tokens), i + window_size + 1)):
        if j != i:
          context_words.append(tokens[j])

      # Calculate the gradient
      gradient = np.zeros(vector_size)
      for context_word in context_words:
        gradient += word_vectors[context_word] - word_vectors[word]

      # Update the word vector
      word_vectors[word] -= learning_rate * gradient

  return word_vectors

# Train the glove model
window_size = 2
vector_size = 10
learning_rate = 0.01
epochs = 10

word_vectors = glove(text, window_size, vector_size, learning_rate, epochs)

for word, vector in word_vectors.items():
    # Limit decimal places for better readability
    formatted_vector = np.around(vector, decimals=3)  # Adjust decimals as needed

    print(f"Vector for '{word}':\n{formatted_vector}")  # Newline for visual clarity


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Vector for 'וכל':
[-8.50879949e+299 -1.11752199e+300  2.29804354e+300  1.47077821e+300
  1.30279002e+300  7.53302953e+299 -2.73064803e+300  1.34124435e+299
 -1.81895352e+300  2.42846473e+300]
Vector for 'התמונות':
[-1.13816619e+288 -1.49483574e+288  3.07394183e+288  1.96736337e+288
  1.74265660e+288  1.00764386e+288 -3.65260843e+288  1.79409443e+287
 -2.43309459e+288  3.24839769e+288]
Vector for 'לקחו':
[ 4.33234313e+286  5.68997865e+286 -1.17007260e+287 -7.48861916e+286
 -6.63328994e+286 -3.83551978e+286  1.39033765e+287 -6.82908411e+285
  9.26138968e+286 -1.23647790e+287]
Vector for 'מאיזה':
[ 2.17533542e+288  2.85702487e+288 -5.87511258e+288 -3.76014964e+288
 -3.33067582e+288 -1.92587285e+288  6.98109690e+288 -3.42898705e+287
  4.65028470e+288 -6.20854368e+288]
Vector for 'אתר':
[ 2.15346277e+288  2.82829795e+288 -5.81603928e+288 -3.72234194e+288
 -3.29718642e+288 -1.90650851e+288  6.91090310e+288 -3.39450915e+287
  4.

# Tagging random 5 sentences

In [71]:
def cyk_parse(sentence, grammar):
    #n = len(sentence)
    #table = [[set() for _ in range(n+1)] for _ in range(n+1)]

    # Step 1: Tokenization
    tokens = sentence.split()
    n = len(tokens)
    table = [[set() for _ in range(n+1)] for _ in range(n+1)]

    # Step 2: Initialization
    for i in range(1, n+1):
        for rule in grammar:
            if rule[1] == tokens[i-1]:
                table[i][i].add(rule[0])

    # Step 3: Rule Application
    for length in range(2, n+1):
        for i in range(1, n-length+2):
            j = i + length - 1
            for k in range(i, j):
                for rule in grammar:
                    if len(rule) == 3:
                        for left in table[i][k]:
                            for right in table[k+1][j]:
                                if rule[1] in left and rule[2] in right:
                                    table[i][j].add(rule[0])

    # Step 4: Backtracking
    if 'S' in table[1][n]:
        return True, table
    else:
        return False, table


# Defined CNF context free grammer for whatsapp using AI
grammar = [
    ("MSG", "GREETING", "CONTENT"),       # Message -> Greeting + Content
    ("MSG", "CONTENT"),                   # Message -> Content (without greeting)

    ("CONTENT", "SENTENCE", "CONTENT"),   # Content -> Sentence + Content (recursive for multiple sentences)
    ("CONTENT", "EMOJI"),                 # Content -> Emoji
    ("CONTENT", "ABBREV"),                # Content -> Abbreviation
    ("CONTENT", "WORD"),                  # Content -> Word

    ("SENTENCE", "NP", "VP"),             # Sentence -> Noun Phrase + Verb Phrase
    ("SENTENCE", "VP"),                   # Sentence -> Verb Phrase (imperative)
    ("NP", "N"),                          # Noun Phrase -> Noun
    ("NP", "ADJ", "N"),                   # Noun Phrase -> Adjective + Noun
    ("VP", "V"),                          # Verb Phrase -> Verb
    ("VP", "V", "NP"),                    # Verb Phrase -> Verb + Noun Phrase

    ("GREETING", "היי"),                   # Greeting -> "Hey"
    ("GREETING", "מה קורה"),               # Greeting -> "What's up?"

    ("N", "מה"),                         # Noun -> "ma" (what)
    ("N", "חדשות"),                     # Noun -> "hadashot" (news)
    ("N", "ארוחת"),                     # Noun -> "aruhat" (meal)

    ("V", "קורה"),                        # Verb -> "koreh" (happening - m.sg.)
    ("V", "קורה"),                        # Verb -> "koreh" (happening - f.sg.)
    ("V", "קוראים"),                      # Verb -> "korim" (happening - pl.)
    ("V", "רוצה"),                        # Verb -> "rotze" (want - m.sg.)
    ("V", "רוצה"),                        # Verb -> "rotza" (want - f.sg.)
    ("V", "רוצים"),                       # Verb -> "rotzim" (want - pl.)

    ("ADJ", "טוב"),                       # Adjective -> "tov" (good - m.sg.)
    ("ADJ", "טובה"),                      # Adjective -> "tova" (good - f.sg.)
    ("ADJ", "טובים"),                     # Adjective -> "tovim" (good - pl.)

    ("EMOJI", "😂"),                      # Emoji -> laughing emoji
    ("EMOJI", "❤️"),                      # Emoji -> heart emoji

    ("ABBREV", "חחח"),                     # Abbreviation -> "hahaha"
    ("ABBREV", "ביי"),                     # Abbreviation -> "bye"
]

# Input sentence to be parsed
random_sentence = np.random.choice(sentence_tokenizer, 5)

for sentence in random_sentence:
  # Call the CYK parser
  parsed, table = cyk_parse(sentence, grammar)

  # Print the parse table and whether the sentence was parsed or not
  if parsed:
      print("Input sentence: ", sentence)
      print("Parse table: ")
      for row in table:
          print(row)
  else:
      print("Input sentence: ", sentence)
      print("Sentence not parsed.")

Input sentence:  [14:37, 17/06/2024] יקיר מדמח הגבר: תצעק עליי מתי שתרצה אבל לא עכשיו אני רואה הקלטות של מכונה וממוחשבת [14:37, 17/06/2024] יקיר מדמח הגבר: עד ה23 [14:37, 17/06/2024] יקיר מדמח הגבר: יש זמן [14:37, 17/06/2024] Daniel: כן כן [14:38, 17/06/2024] Daniel: יש לי עד מחר את העיבוד שפה טבעית [14:38, 17/06/2024] Daniel: כמעט סיימתי, רק צריך לעשות webscraping [14:39, 17/06/2024] יקיר מדמח הגבר: תן בראש [19:08, 19/05/2024] +972 50-865-0825: מישהו מצא לינק למעבדה?
Sentence not parsed.
Input sentence:  [21:52, 05/06/2024] Daniel: כן, אמרו שהוא מבוסס על המעבדות 1 עד 3 [17:02, 09/06/2024] +972 50-402-5106: התחיל את השיעור?
Sentence not parsed.
Input sentence:  אם אתה נותן מספר זה רק ציר 1 -  x [01:06, 17/04/2024] +972 58-730-6070: כנ"ל בבלוק?
Sentence not parsed.
Input sentence:  (בparallel for יש )  גם יכל להיות הדפסה של 40 1 2 3 בעקרון ..  אז כל תשובה של הדפסה אפשרית אמורה להתקבל [10:31, 09/04/2024] +972 52-865-5268: תרשום בגוגל תראה בתמונות אני חושב שתבין הכי טוב [12:01, 11/04/2024