## Auto-Filler 

In [1]:
import re
import string
import numpy as np
import pandas as pd
import nltk
from nltk import bigrams, trigrams, LaplaceProbDist, SimpleGoodTuringProbDist
from hazm import sent_tokenize, word_tokenize, Normalizer, stopwords_list
from collections import Counter

In [2]:
comments_df = pd.read_csv(r'Datasets\digikala_comment.csv')
comments_df.head()

Unnamed: 0,comment
0,نسبت به قیمتش ارزش خرید داره\nجاداره، طراحیش ق...
1,چند ماهی میشه که گرفتمش‌. برای برنامه نویسی و ...
2,پراید ستون جدید
3,اقا همه چیش خوبه فقط از پایین زیاد حاشیه داره ...
4,گوسی هو اوی p10 lite سیپیو و دوربین و رمش از ا...


### Pre-Processing Function

In [3]:
def preprocess_sentences(comments_df):
    # Convert comments into sentences
    normalizer = Normalizer()
    comments_df['sentences'] = comments_df['comment'].apply(lambda comment: sent_tokenize(comment))
    comments_df.drop('comment', axis=1, inplace=True)

    # Convert DataFrame into a list of sentences
    sentences_list = comments_df['sentences'].tolist()
    flat_sentences_list = [sentence for sublist in sentences_list for sentence in sublist]

    normalized_sentences = [normalizer.normalize(s) for s in flat_sentences_list]

    # Remove Zero-Width Non-Joiners
    sentences = [sentence.replace('\u200c', ' ') for sentence in normalized_sentences]

    # Remove English characters from sentences
    cleaned_sentences = []
    english_pattern = re.compile(r'[a-zA-Z]')
    for sentence in sentences:
        cleaned_sentence = ' '.join(word for word in word_tokenize(sentence) if not english_pattern.search(word))
        cleaned_sentences.append(cleaned_sentence)

    # Remove punctuations from sentences
    punctuations = string.punctuation + '،' + '؟'
    cleaned_sentences_no_punc = []
    for sentence in cleaned_sentences:
        cleaned_sentence = ''.join(char for char in sentence if char not in punctuations)
        cleaned_sentences_no_punc.append(cleaned_sentence)

    # Remove stop words (Top unigrams are stop words)
    stopwords = stopwords_list()
    cleaned_sentences_no_stopwords = []
    for sentence in cleaned_sentences_no_punc:
        cleaned_sentence = ' '.join(word for word in word_tokenize(sentence) if word not in stopwords)
        cleaned_sentences_no_stopwords.append(cleaned_sentence)

    return cleaned_sentences_no_stopwords


In [4]:
c = preprocess_sentences(comments_df)

In [5]:
print('Number of sententences:', len(c))
for i in range(10):
    print(c[i])

Number of sententences: 463
قیمتش ارزش خرید داره جاداره طراحیش قشنگه مشکلش بندهای ضعیفش هست میشه استحکام چندانی نداشنه باشه
ماهی میشه گرفتمش
برنامه نویسی کارای گرافیکی ازش استفاده
واقعا بگین عالیه
پراید ستون
اقا چیش خوبه پایین حاشیه داره روشن گوشی میشه
نکته دیگه خاطر اطرافش یه کوچلو خمیده هست گلس یه مدتی جدا مشیه
قیمت گوشی هست چی داره دوربین رم یو گرافیک حسگر های مختلف چیزای دیگه
گوسی هو اوی ۱۰ سیپیو دوربین رمش بهتره خودتون میتونین برین مقایسه های ۱۰ گوشیو ببینین
چادر سبک زیباییه دوختشم عالیه


### Language Modeling

In [6]:
def extract_ngrams(sentences):
    # Tokenize sentences into words
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
    
    unigrams = [word for sentence in tokenized_sentences for word in sentence]
    bigrams_list = list(bigrams(unigrams))
    trigrams_list = list(trigrams(unigrams))
    
    return unigrams, bigrams_list, trigrams_list

def count_ngrams(unigrams, bigrams_list, trigrams_list):
    unigram_counts = Counter(unigrams)
    bigram_counts = Counter(bigrams_list)
    trigram_counts = Counter(trigrams_list)
    
    return unigram_counts, bigram_counts, trigram_counts

def report_most_frequent_ngrams(unigram_counts, bigram_counts, trigram_counts):
    print("Most frequent unigrams:")
    for word, count in unigram_counts.most_common(8):
        print(f"{word}: {count}")

    print("\nMost frequent bigrams:")
    for bigram, count in bigram_counts.most_common(8):
        print(f"{' '.join(bigram)}: {count}")

    print("\nMost frequent trigrams:")
    for trigram, count in trigram_counts.most_common(8):
        print(f"{' '.join(trigram)}: {count}")

In [7]:
unigrams_list, bigrams_list, trigrams_list = extract_ngrams(c)
uc, bc, tc = count_ngrams(unigrams_list, bigrams_list, trigrams_list)
report_most_frequent_ngrams(uc, bc, tc)

Most frequent unigrams:
داره: 80
گوشی: 72
هست: 58
کیفیت: 51
های: 49
استفاده: 41
یه: 41
میشه: 38

Most frequent bigrams:
دیجی کالا: 15
ممنون دیجی: 11
راضی هستم: 10
حتما پیشنهاد: 8
شگفت انگیز: 7
ازش استفاده: 6
پیشنهاد ممنون: 6
ارزش خرید: 5

Most frequent trigrams:
حتما پیشنهاد ممنون: 6
ممنون دیجی کالا: 5
پیشنهاد ممنون دیجی: 5
خریدم راضی هستم: 4
ارزش خرید داره: 3
اصلا پیشنهاد نمی: 3
نا امید شدم: 3
کیفیت صفحه نمایش: 3


### Smoothing in N-Grams

### Explanation of Smoothing Techniques in N-grams Language Models

In n-grams language models, it's common to encounter unseen n-grams, i.e., sequences of words that never occurred in the training data. When calculating the probability of such unseen n-grams, their probability would be zero, which could lead to severe issues in the model's performance, especially during evaluation.

To address this problem, smoothing techniques are used. Smoothing assigns a small non-zero probability to unseen n-grams, thereby preventing zero probabilities and making the model more robust. Smoothing techniques distribute the probability mass from observed n-grams to unseen ones in a principled manner.

---

**Laplace (Add-One) Smoothing**

Laplace smoothing, also known as Add-One smoothing, is one of the simplest smoothing techniques. In Laplace smoothing, a count of 1 is added to each observed n-gram count before calculating probabilities. This ensures that no n-gram has zero probability and prevents unseen n-grams from having zero probabilities.

Mathematically, the formula for Laplace smoothing of an n-gram is:

$ P_{\text{Laplace}}(w_n | w_{n-1}) = \frac{{\text{count}(w_{n-1}w_n) + 1}}{{\text{count}(w_{n-1}) + V}}$

Where:
- $\text{count}(w_{n-1}w_n)$ is the count of the n-gram \( $w_{n-1}w_n$ \) in the training data.
- $\text{count}(w_{n-1}) $ is the count of the preceding (n-1)-gram \( $w_{n-1} $\) in the training data.
- $ V $ is the vocabulary size, representing the total number of unique words in the training data.

https://www.nltk.org/api/nltk.probability.LaplaceProbDist.html

---

**Good-Turing Smoothing**

Good-Turing smoothing is a more sophisticated smoothing technique that estimates the probabilities of unseen n-grams based on the observed frequencies of other n-grams. It adjusts the probabilities of unseen n-grams based on the frequencies of seen n-grams with similar frequencies. This technique tends to work well when dealing with sparse data and can provide more accurate estimates than Laplace smoothing.

Good-Turing smoothing uses a statistical method called the Good-Turing frequency estimation to estimate the probability of unseen n-grams. It estimates the probability of an unseen n-gram by considering the frequency of n-grams with similar counts in the training data.

In [8]:
def calculate_probabilities(unigrams, bigrams, trigrams):
    unigram_prob_dist = LaplaceProbDist(nltk.FreqDist(unigrams), bins=len(set(unigrams)))

    bigram_prob_dist = SimpleGoodTuringProbDist(nltk.FreqDist(bigrams))
    trigram_prob_dist = SimpleGoodTuringProbDist(nltk.FreqDist(trigrams))

    return unigram_prob_dist, bigram_prob_dist, trigram_prob_dist

In [9]:
unigram_prob_dist, bigram_prob_dist, trigram_prob_dist = calculate_probabilities(unigrams_list, bigrams_list, trigrams_list)

In [10]:
unigram_prob = unigram_prob_dist.prob('بو')
bigram_prob = bigram_prob_dist.prob(('جاداره', 'طراحیش'))
trigram_prob = trigram_prob_dist.prob(('ارزش', 'خرید', 'داره'))

print(unigram_prob, bigram_prob, trigram_prob)

0.00026219192448872575 1.2534513957088445e-05 0.000217704306341319


### Perplexity

In [11]:
def calculate_perplexity(prob_dist, ngrams):
    likelihoods = [prob_dist.prob(ngram) for ngram in ngrams]
    perplexity = np.prod(likelihoods) ** (-1 / len(ngrams))
    return perplexity


def calculate_log_perplexity(prob_dist, ngrams):
    log_likelihood = sum(-np.log(prob_dist.prob(ngram)) for ngram in ngrams)
    perplexity = log_likelihood / len(ngrams)
    return perplexity


def calculate_perplexity_for_models(unigram_prob_dist, bigram_prob_dist, trigram_prob_dist, sentences):
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
    perplexities = {'Unigram': [], 'Bigram': [], 'Trigram': []}

    for sentence in tokenized_sentences:
        unigram_perplexity = calculate_log_perplexity(unigram_prob_dist, list(sentence))
        bigram_perplexity = calculate_log_perplexity(bigram_prob_dist, list(bigrams(sentence)))
        trigram_perplexity = calculate_log_perplexity(trigram_prob_dist, list(trigrams(sentence)))
        
        perplexities['Unigram'].append(unigram_perplexity)
        perplexities['Bigram'].append(bigram_perplexity)
        perplexities['Trigram'].append(trigram_perplexity)
    
    return perplexities

In [12]:
test_sentences = [
    "این لپ تاپ سخت افزار خیلی قوی داره و از پس هرکاری به راحتی بر میاد",
    "این ساعت بسیار زیبا طراحی و ساخته شده",
    "یک محصول با کیفیت ایرانی که حقیقتا جای حمایت داره",
    "بوش و ماندگاری خوب هست من خیلی دوستش دارم"
]

preprocessed_test_sentences = preprocess_sentences(pd.DataFrame({'comment': test_sentences}))
perplexities = calculate_perplexity_for_models(unigram_prob_dist, bigram_prob_dist, trigram_prob_dist, preprocessed_test_sentences)

for model, perplexity_list in perplexities.items():
    print(f"Perplexity for {model} Model:")
    for sentence_index, perplexity in enumerate(perplexity_list):
        print(f"Sentence {sentence_index + 1}: {perplexity}")
    print('-------------------------------')


Perplexity for Unigram Model:
Sentence 1: 7.284726350969215
Sentence 2: 7.020115091550757
Sentence 3: 6.349809784308699
Sentence 4: 6.753836492440416
-------------------------------
Perplexity for Bigram Model:
Sentence 1: 8.065693220532186
Sentence 2: 11.287024601969089
Sentence 3: 11.287024601969089
Sentence 4: 10.729894009691739
-------------------------------
Perplexity for Trigram Model:
Sentence 1: 7.496977197363101
Sentence 2: 13.106568675036614
Sentence 3: 13.106568675036614
Sentence 4: 13.106568675036614
-------------------------------


### Word Prediction

In [13]:
import random

def unigram_predict(sent, freq_dist, max_length=12):
    while len(sent.split()) < max_length:
        word = freq_dist.generate()
        sent = sent + " " + word
    return sent


def bigram_predict(sent, freq_dist, max_length=12):
    pass
    

def trigram_predict(sent, freq_dist, max_length=12):
    pass


# Test the function
test_sentences = [
    "کیفیت محصولات چینی زرین",
    "از لحاظ جنس جنس خوبی داره",
    "حتما پیشنهاد میکنم",
    "بعد از چند روز استفاده"
]

print('Unigram:')
for sent in test_sentences:
    sentence = unigram_predict(sent, unigram_prob_dist)
    print(sentence)

print('Bigram:')
for sent in test_sentences:
    sentence = bigram_predict(sent, bigram_prob_dist)
    print(sentence)

print('Trigram:')
for sent in test_sentences:
    sentence = trigram_predict(sent, trigram_prob_dist)
    print(sentence)


Unigram:
کیفیت محصولات چینی زرین نکرد نشدن نمیشه امکانات صورت قطعأ های فدق
از لحاظ جنس جنس خوبی داره گوشی بدین شیک یاب باش پراید
حتما پیشنهاد میکنم بهتربن روز گوشی کامنت میشه منتظر نداره ۸ کار
بعد از چند روز استفاده صفر ارزان آمپر بوی منم ایکاش حساب
Bigram:
None
None
None
None
Trigram:
None
None
None
None
