# Part 1: Rule-Based NLP and Regex

In this part, I will use regular expressions (regex) to generate a bill from a given text input.


In [None]:
# Free-text purchase description; the cells below split it into items and
# extract quantity, product and unit price from each one.
text = "I bought three Samsung smartphones 150 $ each, four kilos of fresh banana for 1,2 dollar a kilogram and one Hamburger with 4,5 dollar"


In [None]:
import re
from nltk.corpus import stopwords

In [None]:
# Spelled-out quantities in ascending order; the 1-based position of each
# word is the number it stands for.
_number_words = (
    'one', 'two', 'three', 'four', 'five',
    'six', 'seven', 'eight', 'nine', 'ten',
)

# Lookup table used to turn a number word into its integer value.
word_to_numbers = {
    name: value for value, name in enumerate(_number_words, start=1)
}

In [None]:
# Split the description into one chunk per purchased item. A chunk ends at:
#   * a comma that is not inside a decimal number such as "1,2"
#     (not preceded by a digit and not followed by " <digit>"), or
#   * the word " and" when it does not directly follow a digit.
# The trailing \b stops the split from firing inside words that merely start
# with "and" (e.g. " android"), which would otherwise be cut in half.
tokens = re.split(r'(?<!\d),(?! \d)|(?<!\d) and\b', text)
result = []      # cleaned item strings, one per chunk
total_bill = 0   # running sum of the item totals

In [None]:
# Normalise every item chunk: number words become digits, filler words and
# English stop words are removed, and the cleaned string goes into `result`.

# Words that carry no information for the bill.
ignored_words = {'bought', 'kilos', 'each', 'purchased'}
# Built once up front: the stop-word list does not change between items, so
# there is no reason to fetch it again on every iteration.
stop_words = set(stopwords.words('english'))

for token in tokens:
    # Replace word numbers with their numeric equivalents and drop fillers.
    # str.split() already yields whitespace-free words, so no strip() needed.
    token = ' '.join(
        str(word_to_numbers.get(word.lower(), word))
        for word in token.split()
        if word.lower() not in ignored_words
    )

    # Remove stop words
    token = ' '.join(
        word for word in token.split() if word.lower() not in stop_words
    )
    result.append(token)

In [None]:
# Parse each cleaned item as "<quantity> <product> <unit price>" and print the
# bill as a fixed-width table, followed by the grand total.

# Compiled once; the same pattern is applied to every item.
item_pattern = re.compile(r'(\d+(?:,\d+)*(?:\.\d+)?) (.+?) (\d+(?:,\d+)*(?:\.\d+)?)')
row_format = "{:<20} {:<10} {:<10} {:<10}"

# Start from zero so that re-running this cell does not add onto a stale total.
total_bill = 0

print("The Bill:")
print(row_format.format("Product", "Quantity", "Unit Price", "Total Price"))
for item in result:
    match = item_pattern.match(item)
    if not match:
        # Items that do not follow the expected layout are left off the bill.
        continue
    quantity, product, unit_price = match.groups()
    quantity = float(quantity.replace(',', ''))  # commas in the quantity are dropped
    unit_price = float(unit_price.replace(',', '.'))  # comma is the decimal separator
    total_price = quantity * unit_price
    total_bill += total_price
    # Print the details of the current item
    print(row_format.format(product, quantity, unit_price, total_price))

# Grand total over all rows printed above.
print(row_format.format("Total", "", "", total_bill))

# Part 2: Word Embedding

In [151]:
import pymongo
# Connect to the MongoDB server on localhost, port 27017
client = pymongo.MongoClient('localhost', 27017)
# Select the 'web_scraping' database and its 'aljazeera_events' collection
# (presumably scraped Al Jazeera event pages -- confirm against the scraper)
db = client['web_scraping']
collection = db['aljazeera_events']

# find() without a filter: cursor over every document in the collection
cursor = collection.find()
# Filled with document titles by the loop in the next cell
titles_list = []

In [152]:
# Collect titles from the cursor until `max_titles` of them have been gathered.
max_titles = 7
count = 0  # number of titles collected so far
for document in cursor:
    if count >= max_titles:
        break  # Exit the loop once max_titles titles have been collected
    title = document.get('title')
    if title is None:
        # A document without a 'title' field would put None into titles_list,
        # while the later text-processing cells treat every entry as a string.
        continue
    titles_list.append(title)
    count += 1

In [153]:
# Show all collected titles, then the current value of `title`
# (whatever the collection loop assigned to it last).
print(titles_list)
print("the title:",title)


['مهرجان الجزيرة بلقان للأفلام الوثائقية', 'منتدى الجزيرة الخامس عشر', 'منتدى كليات الصحافة في العالم العربي', 'مهرجان الجزيرة بلقان السادس للأفلام الوثائقية', 'الجزيرة للدراسات يبحث جدوى انخراط فلسطينيّي الداخل في مؤسسة الحكم الإسرائيلية', 'معرض اكتشف الجزيرة', 'الجزيرة تحتفل بمرور 25 عاماً على انطلاقتها']
the title: الجزيرة تحتفل بمرور 25 عاماً على انطلاقتها


## One-Hot Encoding, Bag of Words, TF-IDF:

In [172]:
import json
import re
import string
from nltk.tokenize import word_tokenize
from word2number import w2n
from nltk.corpus import stopwords
import pyarabic.araby as araby
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import word2vec
from gensim.models import fasttext
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# from bidi.algorithm import get_display
# import arabic_reshaper

ImportError: cannot import name 'triu' from 'scipy.linalg.special_matrices' (C:\Users\Dell PC\PycharmProjects\atelier1\venv\Lib\site-packages\scipy\linalg\special_matrices.py)

In [165]:
# Build the vocabulary: every distinct token of the titles that is not an
# Arabic stop word, not punctuation and not a number.
ar_punct = ''')(+`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”،.”…“–ـ”.'''
en_punct = string.punctuation
punct_lst = ar_punct + en_punct

# Fetched once: the stop-word list is the same for every token.
arabic_stop_words = set(stopwords.words('arabic'))

unique_words = set()
for title in titles_list:
    for word in araby.tokenize(title):
        # `word in punct_lst` is a substring test against the combined
        # punctuation string.
        if word in arabic_stop_words or word in punct_lst or word.isdigit():
            continue
        unique_words.add(word)
print(f"Unique words: {unique_words}")
print(f"Number of unique words: {len(unique_words)}")


Unique words: {'الوثائقية', 'الجزيرة', 'الحكم', 'يبحث', 'منتدى', 'انطلاقتها', 'عاماً', 'بمرور', 'جدوى', 'تحتفل', 'الصحافة', 'بلقان', 'للدراسات', 'العربي', 'الخامس', 'اكتشف', 'كليات', 'العالم', 'مؤسسة', 'انخراط', 'الإسرائيلية', 'معرض', 'الداخل', 'للأفلام', 'السادس', 'فلسطينيّي', 'مهرجان'}
Number of unique words: 27


In [166]:
# One-hot encode the vocabulary: the word at position i gets a vector of
# length len(unique_words) that is 1 at index i and 0 everywhere else.
vocab_size = len(unique_words)
one_hot_encoded = []
# enumerate() yields the same position that list(unique_words).index(word)
# would return, without rebuilding and scanning a list for every word.
for index, word in enumerate(unique_words):
    encoding = np.zeros(vocab_size)
    encoding[index] = 1
    one_hot_encoded.append((word, encoding))

df_onehot = pd.DataFrame(one_hot_encoded, columns=['Word', 'One-hot encoding'])
df_onehot

Unnamed: 0,Word,One-hot encoding
0,الوثائقية,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,الجزيرة,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,الحكم,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,يبحث,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,منتدى,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
5,انطلاقتها,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
6,عاماً,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
7,بمرور,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
8,جدوى,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
9,تحتفل,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [167]:
# Bag of Words: fit a CountVectorizer on the titles and show the resulting
# document-term count matrix with one column per learned term.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(titles_list)

bow_counts = X.toarray()
bow_terms = vectorizer.get_feature_names_out()
df_BoW = pd.DataFrame(data=bow_counts, columns=bow_terms)
df_BoW.head()

Unnamed: 0,25,اكتشف,الإسرائيلية,الجزيرة,الحكم,الخامس,الداخل,السادس,الصحافة,العالم,...,فلسطيني,في,كليات,للأفلام,للدراسات,مؤسسة,معرض,منتدى,مهرجان,يبحث
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,1,1,...,0,1,1,0,0,0,0,1,0,0
3,0,0,0,1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
4,0,0,1,1,1,0,1,0,0,0,...,1,1,0,0,1,1,0,0,0,1


In [168]:
# TF-IDF: fit a TfidfVectorizer on the titles and show the weighted
# document-term matrix with one column per learned term.
tr_idf_model = TfidfVectorizer()
tf_idf_vector = tr_idf_model.fit_transform(titles_list)

# Dense copy of the sparse result so it can be wrapped in a DataFrame.
tf_idf_array = tf_idf_vector.toarray()
tf_idf_terms = tr_idf_model.get_feature_names_out()
df_tf_idf = pd.DataFrame(data=tf_idf_array, columns=tf_idf_terms)
df_tf_idf.head()

Unnamed: 0,25,اكتشف,الإسرائيلية,الجزيرة,الحكم,الخامس,الداخل,السادس,الصحافة,العالم,...,فلسطيني,في,كليات,للأفلام,للدراسات,مؤسسة,معرض,منتدى,مهرجان,يبحث
0,0.0,0.0,0.0,0.275087,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.48071,0.0,0.0,0.0,0.0,0.48071,0.0
1,0.0,0.0,0.0,0.278236,0.0,0.585739,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.486214,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.431207,0.431207,...,0.0,0.357939,0.431207,0.0,0.0,0.0,0.0,0.357939,0.0,0.0
3,0.0,0.0,0.0,0.238051,0.0,0.0,0.0,0.501141,0.0,0.0,...,0.0,0.0,0.0,0.41599,0.0,0.0,0.0,0.0,0.41599,0.0
4,0.0,0.0,0.317585,0.150859,0.317585,0.0,0.317585,0.0,0.0,0.0,...,0.317585,0.263623,0.0,0.0,0.317585,0.317585,0.0,0.0,0.0,0.317585


In [169]:
# Tokenise every title into a list of words, dropping Arabic stop words,
# numbers and a fixed set of punctuation marks.

# Built once: neither collection depends on the sentence being processed.
arabic_stop_words = set(stopwords.words('arabic'))
skipped_punctuation = {'.', ',', '!', '?', ':', ';', '،', '؟', '؛'}

tokenized_corpus = []
for sentence in titles_list:
    tokenized_sentence = [
        word
        for word in araby.tokenize(sentence)
        if word not in arabic_stop_words
        and not word.isdigit()
        and word not in skipped_punctuation
    ]
    tokenized_corpus.append(tokenized_sentence)

In [170]:
# Show the per-title token lists
print(f"Tokenized corpus: {tokenized_corpus}")

Tokenized corpus: [['مهرجان', 'الجزيرة', 'بلقان', 'للأفلام', 'الوثائقية'], ['منتدى', 'الجزيرة', 'الخامس'], ['منتدى', 'كليات', 'الصحافة', 'العالم', 'العربي'], ['مهرجان', 'الجزيرة', 'بلقان', 'السادس', 'للأفلام', 'الوثائقية'], ['الجزيرة', 'للدراسات', 'يبحث', 'جدوى', 'انخراط', 'فلسطينيّي', 'الداخل', 'مؤسسة', 'الحكم', 'الإسرائيلية'], ['معرض', 'اكتشف', 'الجزيرة'], ['الجزيرة', 'تحتفل', 'بمرور', 'عاماً', 'انطلاقتها']]


In [171]:
# Train a Word2Vec model on the tokenised titles. sg=0 selects the CBOW
# training algorithm; min_count=1 keeps words that occur only once.
cbow_w2v_model = word2vec.Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=30,
    min_count=1,
    sample=1e-3,
    sg=0,
    epochs=1000,
)

NameError: name 'word2vec' is not defined