# N-grams Extractor
## Objective: 
- Extract n-grams (bi/tri-grams) from a given group of texts to provide meaningful context into the data observed. 
## Techniques: 
- N-grams extraction 
- Smoothing: Laplace, Kneser-Ney Discounting
- Intrinsic evaluation: perplexity

# A. Import data and libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import spacy

In [14]:
import os
import codecs

# folder_path = "C:/Users/Clarence/Desktop/tm_220923/output"
folder_path = "/Volumes/Samsung USB/spfcase_tm/tm_220923/output"
# output_folder = "C:/Users/Clarence/Desktop/tm_220923/output"
output_folder = "/Volumes/Samsung USB/spfcase_tm/tm_220923/output"

current_filename = None
cleaned_text_list = []
    
for filename in os.listdir(folder_path):
    if filename.endswith(".xlsx"):
        # Check if the filename has changed
        if current_filename != filename:
            # Clear the cleaned_text_list when the filename changes
            cleaned_text_list.clear()
            current_filename = filename

        # Load the Excel file into a DataFrame
        file_path = os.path.join(folder_path, filename)
        df = pd.read_excel(file_path)

        # Extract sanitized_text column
        sanitized_texts = df['cleanedtext'].dropna().astype(str).tolist()

        # Append to cleaned_text_list
        cleaned_text_list.extend([text for text in sanitized_texts])

        # Export the list as a txt file, with the same file name
        txt_filename = filename.replace(".xlsx", ".txt")
        txt_filepath = os.path.join(output_folder, txt_filename)

        with open(txt_filepath, 'w') as txt_file:
            #convert explicitly to str() to remove weird unicode characters related to \s
            t = str('\n'.join(cleaned_text_list))
            txt_file.write(t)

new_output_folder = os.path.join(output_folder, "ngram-files-021023")
if not os.path.exists(new_output_folder):
    os.makedirs(new_output_folder)

for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        txt_filepath = os.path.join(output_folder, filename)
        new_txt_filepath = os.path.join(new_output_folder, filename)
        os.rename(txt_filepath, new_txt_filepath)

In [None]:
folder_path = "/Volumes/Samsung USB/spfcase_tm/tm_220923/output/ngram-files-021023"
for filename in os.listdir(folder_path): 
    file_path = os.path.join(folder_path, filename) 
    df = pd.read_csv(file_path, sep="\n", header=None, names=["text"])
    


## N-Gram Building: Token Dictionary and Probilistic Generation 

In this code section, we look at building the text's token-count dictionary and build a probabilistic model of sequential data. 

In [2]:
import nltk, re, string, collections, math
from nltk.util import ngrams
from collections import Counter

with open("./k12c.txt", 'r', encoding='utf-8') as file: 
    text = file.read() 

#removes punctuation (except periods)
punctuationNoPeriod = "[" + re.sub("\.","",string.punctuation) + "]"
text = re.sub(punctuationNoPeriod, " ", text)

# print(text[:1000])
tokens = text.split()
enPentaGrams = ngrams(tokens, 5)

enPentaFreq = Counter(enPentaGrams)
enPentaFreq.most_common(100)

#testing for unigram probabilities
# tokenCount = Counter(tokens)
# totalToken = sum(tokenCount.values())
# probResult = {k: v / totalToken for k, v in tokenCount.items()}
# print(probResult)

totalPentaGrams = sum(enPentaFreq.values())
probResult = {k: -math.log(v / totalPentaGrams) for k, v in enPentaFreq.items()}

#gets 100 least common 5-grams most_common()[:-1001:-1] and negative log-transformed values
#least common: not very informative, trying for most common instead
for g, c in enPentaFreq.most_common(100):
    probability = probResult[g]
    print(f"5-gram: {g}, \t Count: {c}, \t Negative LogProbability: {probability}")

5-gram: ('years', 'old', 'date', 'of', 'birth'), 	 Count: 42, 	 Negative LogProbability: 8.580109862351213
5-gram: ('354', 'of', 'the', 'Penal', 'Code'), 	 Count: 41, 	 Negative LogProbability: 8.604207413930274
5-gram: ('Section', '354', 'of', 'the', 'Penal'), 	 Count: 39, 	 Negative LogProbability: 8.654217834504935
5-gram: ('of', 'the', 'Penal', 'Code', 'Chapter'), 	 Count: 39, 	 Negative LogProbability: 8.654217834504935
5-gram: ('the', 'Penal', 'Code', 'Chapter', '224'), 	 Count: 39, 	 Negative LogProbability: 8.654217834504935
5-gram: ('invited', 'to', 'sign', 'below', 'the'), 	 Count: 36, 	 Negative LogProbability: 8.734260542178472
5-gram: ('under', '14', 'years', 'of', 'age'), 	 Count: 31, 	 Negative LogProbability: 8.883792276149435
5-gram: ('do', 'you', 'have', 'to', 'say'), 	 Count: 30, 	 Negative LogProbability: 8.916582098972427
5-gram: ('under', 'Section', '354', 'of', 'the'), 	 Count: 26, 	 Negative LogProbability: 9.0596829426131
5-gram: ('122', 'Bedok', 'North', 'Stre

# N. Kneser-Ney Discounting 
Describes the augmentation of absolute discounting with handling of lower-order unigram distributions. 
KN-D bases the estimate of P(Continuation) on the number of contexts that word _w_ appears in. 
We can express the number of times _w_ appears in some novel continuation as: 
$$P_{CONTINUATION}(w) \propto \: \mid\{v : \: C(vw) > 0\} \mid$$

Implementation for our probabilistic model can be found in `kneserneymodule.py`


In [9]:
from kneserneymodule import KneserNeyLM

# Convert 5-grams to strings for training Kneser-Ney model
enPentaStrings = [' '.join(gram) for gram in enPentaGrams]

# Train Kneser-Ney language model
highest_order = 5  # Change this according to your highest order
kneser_ney_model = KneserNeyLM(highest_order, enPentaStrings)

# Display the top 100 least common 5-grams and their probabilities
for gram, count in enPentaFreq.most_common(1000):
    log_probability = kneser_ney_model._calc_probs([{' '.join(gram): count}])
    print(f"5-gram: {' '.join(gram)}, Count: {count}, Log Probability: {log_probability}")

5-gram: years old date of birth, Count: 42, Log Probability: [{'years old date of birth': 0.0}]
5-gram: 354 of the Penal Code, Count: 41, Log Probability: [{'354 of the Penal Code': 0.0}]
5-gram: Section 354 of the Penal, Count: 39, Log Probability: [{'Section 354 of the Penal': 0.0}]
5-gram: of the Penal Code Chapter, Count: 39, Log Probability: [{'of the Penal Code Chapter': 0.0}]
5-gram: the Penal Code Chapter 224, Count: 39, Log Probability: [{'the Penal Code Chapter 224': 0.0}]
5-gram: invited to sign below the, Count: 36, Log Probability: [{'invited to sign below the': 0.0}]
5-gram: under 14 years of age, Count: 31, Log Probability: [{'under 14 years of age': 0.0}]
5-gram: do you have to say, Count: 30, Log Probability: [{'do you have to say': 0.0}]
5-gram: under Section 354 of the, Count: 26, Log Probability: [{'under Section 354 of the': 0.0}]
5-gram: 122 Bedok North Street 2, Count: 24, Log Probability: [{'122 Bedok North Street 2': 0.0}]
5-gram: Bedok North Street 2 04, Count