# Generate Sparse Representations 

Version: 2.0

Environment: Python 3.6.3 and Anaconda 4.3.0 (64-bit)

Libraries used: 
* re (for regular expressions, included in Anaconda Python 3.6)
* os (to build file paths and list the text files to load)
* nltk (to tokenize the text into unigrams)
* nltk.probability (to compute frequencies)

## 1.  Import libraries 

In [None]:
import re
import os
import nltk
from nltk.tokenize import RegexpTokenizer
import multiprocessing as mp
from nltk.probability import *

## 2. Load data and set up the tokenizer

In this section, we will perform the following tasks:
* load data
* instantiate the tokenizer (the regular expression was provided with the assignment)
* open stop words list
* examine content

In [None]:
# tokenizer pattern provided with the assignment: a run of word characters,
# optionally followed by one hyphen- or apostrophe-joined part (e.g. "don't")
tokenizer = RegexpTokenizer(r"\w+(?:[-']\w+)?")
# load all text files and append to a list
#file_Path = "C:\\Users\\AshSu\\Downloads\\meeting_transcripts_student\\meeting_transcripts_student\\txt_files"
file_Path = "./txt_files"
# list to append each text to; every entry is the list of raw lines of one file
All_texts = []

# os.listdir returns the entries in arbitrary order, so sort them to make the
# position of each document in All_texts reproducible between runs
for txt_object in sorted(os.listdir(file_Path)):
    file = os.path.join(file_Path, txt_object)
    # skip sub-directories (e.g. notebook checkpoint folders): open() cannot read them
    if not os.path.isfile(file):
        continue
    with open(file, 'r') as Text_File:
        # load all lines; readlines() keeps the trailing "\n" of each line
        text = Text_File.readlines()
        All_texts.append(text)

In [None]:
# Read the stop-word file in one go and split it into one entry per line
with open('stopwords_en.txt','r') as Stopword_file:
    stop_Words = Stopword_file.read().split('\n')

In [None]:
# print a few of the loaded documents (positions 50 to 99 of All_texts, each a
# list of raw lines) to see that loading worked properly
print(All_texts[50:100])


In [None]:
# Inspect how topic breaks show up in the raw lines: print every line that
# begins with ten asterisks, wrapped in a list so a trailing "\n" stays visible
for text_file in All_texts:
    for word in text_file:
        if not word.startswith("**********"):
            continue
        print([word])

In [None]:
# Capture every topic break. A break line takes one of two forms:
# exactly ten asterisks, or ten asterisks followed by a newline.
# Either way only the ten asterisks are printed.
for text_file in All_texts:
    for word in text_file:
        if word in ('**********', '**********\n'):
            print([word[:10]])


## 2. Tokenize

In this section, we will perform the following tasks:
* apply the tokenizer provided
* examine content

In [None]:
def Tokenize_text(List_oftextfiles, tokenize=None):
    """Tokenize every line of every text, keeping topic-break markers.

    Args:
        List_oftextfiles: iterable of texts; each text is an iterable of raw
            line strings (for example the result of ``readlines()``).
        tokenize: optional callable mapping a string to a list of tokens.
            When omitted, the module-level ``tokenizer.tokenize`` is used.

    Returns:
        A list with one entry per text. Each entry is a list with one entry
        per line: ``['**********']`` for a topic-break line, otherwise the
        tokens of the lower-cased line (an empty list when nothing matches).
    """
    if tokenize is None:
        tokenize = tokenizer.tokenize

    Unigrams = []
    # iterate through all the text files that have been joined together in a list
    for text_file in List_oftextfiles:
        # collect one token list per line of this text
        Wordtokens = []
        for word in text_file:
            # A topic break is a line of exactly 10 asterisks, with or without
            # a trailing newline. Both forms are handled in ONE branch so the
            # marker is never also passed to the tokenizer (which would add a
            # spurious empty token list after it).
            if word in ('**********', '**********\n'):
                Wordtokens.append(['**********'])
            else:
                # tokens must be lower case, so lower the line before tokenizing
                Wordtokens.append(tokenize(word.lower()))
        Unigrams.append(Wordtokens)

    return Unigrams

In [None]:
# tokenize a sample of the documents (positions 50 to 99 of All_texts); the
# result is not stored, so it is only visible as the notebook cell output
Tokenize_text(All_texts[50:100])

In [None]:
# tokenize every loaded document
tokenized_text = Tokenize_text(All_texts)

# walk the tokenized result (the loop previously referenced an undefined name):
# print each tokenized line, followed by each of its tokens
for text in tokenized_text:
    for row in text:
        print(row)
        for word in row:
            print(word)


## 3. Remove all Stop words 

In this section, we will perform the following tasks:
* build a function that removes stop words


In [None]:
def text_stopwords(texts, stopwords=None):
    """Remove stop words from tokenized texts and drop lines left empty.

    Args:
        texts: iterable of texts; each text is an iterable of lines and each
            line is an iterable of tokens.
        stopwords: optional iterable of words to remove. When omitted, the
            module-level ``stop_Words`` is used.

    Returns:
        A list with one entry per text; each entry is a list of the
        non-empty token lists that remain after stop-word removal.
    """
    if stopwords is None:
        stopwords = stop_Words
    # build the lookup set once: membership tests on a set are O(1), whereas
    # testing every token against a list scans the whole list each time
    stopword_set = set(stopwords)

    # outer list: one entry per text
    Text_withoutSW = []
    for text in texts:
        # inner list: one entry per line that still has tokens
        TokensList_withoutSW = []
        for line in text:
            # borrowed from lecture material "Exploring Pre-Processed text and Generating Features"
            wordsSet = [word for word in line if word not in stopword_set]
            # lines that consisted only of stop words (or were empty) are skipped
            if wordsSet:
                TokensList_withoutSW.append(wordsSet)
        Text_withoutSW.append(TokensList_withoutSW)

    return Text_withoutSW

## 4. Find words that have a frequency greater than 132 

In this section, we will perform the following tasks:
* build a function that collects all the words that appear in more than 132 of the texts (a count of texts, not of total occurrences)
* build another function that removes these words

In [None]:
def Unique_words(Alltext):
    """Return the set of distinct tokens across all texts.

    The raw texts are tokenized with ``Tokenize_text`` and filtered with
    ``text_stopwords``; the topic-break marker ``'**********'`` is excluded.

    Args:
        Alltext: the raw texts, in the form accepted by ``Tokenize_text``.

    Returns:
        A set of token strings.
    """
    One_text = text_stopwords(Tokenize_text(Alltext))
    New_set = set()
    for text in One_text:
        for line in text:
            # compare by value: `is` tests object identity, which is not
            # guaranteed to hold for two equal strings, so the marker could
            # slip into the vocabulary
            New_set.update(word for word in line if word != "**********")

    return New_set

In [None]:
def findfrequency(All_text, Unique, threshold=132):
    """Return the words of ``Unique`` that occur in more than ``threshold`` texts.

    A word is counted once per text, no matter how often it occurs inside
    that text. The topic-break marker ``'**********'`` is never counted.

    Args:
        All_text: iterable of texts; each text is an iterable of rows and each
            row is an iterable of tokens.
        Unique: iterable of candidate words.
        threshold: a word is returned when the number of texts containing it
            is strictly greater than this value (default 132).

    Returns:
        A list of the qualifying words, in the iteration order of ``Unique``.
    """
    # one pass over the texts: number of texts in which each word occurs
    document_frequency = {}
    for text in All_text:
        words_in_text = {word for row in text for word in row}
        words_in_text.discard('**********')
        for word in words_in_text:
            document_frequency[word] = document_frequency.get(word, 0) + 1

    return [word for word in Unique if document_frequency.get(word, 0) > threshold]
        

In [None]:
# words that occur in more than 132 of the tokenized, stop-word-free texts
Common_Words = findfrequency(text_stopwords(Tokenize_text(All_texts)), Unique_words(All_texts))

In [None]:
# inspect the list of common words
print(Common_Words)

In [None]:
# Walk the raw documents and print the indices at each nesting level:
# x = document index, ID = line index, index = character index within the line
# (each line is a raw string here, so the innermost loop yields characters)
blah_blah = All_texts
for x, text in enumerate(blah_blah):
    for ID, line in enumerate(text):
        # label the line with its own index; `index` is only defined once the
        # inner loop has started, so using it here raised a NameError
        print(ID, line)
        for index, word in enumerate(line):
            print(x, ID, index)

In [None]:
def find_index(AllText):
    """Locate every occurrence of the common words in the tokenized texts.

    The common words are those returned by ``findfrequency`` for the
    tokenized, stop-word-free texts. Positions refer to the output of
    ``Tokenize_text(AllText)``.

    Args:
        AllText: the raw texts, in the form accepted by ``Tokenize_text``.

    Returns:
        A list of ``[text_index, line_index, token_index]`` lists, grouped by
        common word in the order ``findfrequency`` returned them.
    """
    # tokenize once and reuse the result for both the frequency count and the
    # position search
    tokenized = Tokenize_text(AllText)
    Common_Words = findfrequency(text_stopwords(tokenized), Unique_words(AllText))

    # single pass over the tokens, collecting the positions of each common word
    positions = {com_word: [] for com_word in Common_Words}
    for idx, text in enumerate(tokenized):
        for idy, line in enumerate(text):
            for idz, word in enumerate(line):
                if word in positions:
                    # store the three indices as one entry; chaining them as
                    # [idx][idy][idz] would index into a one-element list
                    positions[word].append([idx, idy, idz])

    some_list = []
    for com_word in Common_Words:
        some_list.extend(positions[com_word])

    return some_list


                    


In [None]:
# Print the position (text index, line index, token index) of every occurrence
# of a common word. The search runs over the tokenized texts: iterating a raw
# line yields single characters, which never equal a whole word.
Tokenized_texts = Tokenize_text(All_texts)
Common_Words = findfrequency(text_stopwords(Tokenized_texts), Unique_words(All_texts))
some_list = []
for com_word in Common_Words:
    for idx, text in enumerate(Tokenized_texts):
        for idy, line in enumerate(text):
            for idz, word in enumerate(line):
                if com_word == word:
                    print(idx, idy, idz)

In [None]:
# NOTE(review): the preceding cell only initialises some_list and never appends
# to it, so when run right after that cell this prints an empty list
print(some_list)

In [None]:
# positions of the common words in all texts, as computed by find_index
some_otherlist = find_index(All_texts)
    
                    

In [None]:
def Vocabulary(Unique):
    """Build the vocabulary string, one ``word:index`` entry per line.

    Args:
        Unique: iterable of distinct word strings (typically a set).

    Returns:
        The entries joined with ``"\\n"``; an empty string when ``Unique`` is
        empty. Indexes start at 0 and follow the sorted order of the words.
    """
    # a set has no defined order, so sort first: otherwise the index given to
    # each word could change from one run to the next
    entries = [word + ":" + str(index) for index, word in enumerate(sorted(Unique))]

    return '\n'.join(entries)
    

In [None]:
# collect the distinct tokens of all texts, then format them as "word:index" lines
ok = Unique_words(All_texts)
Vocab = Vocabulary(ok)

In [None]:
# bare expression: shows the vocabulary string as the notebook cell output
Vocab

In [None]:

# save the vocabulary string to vocab.txt ('w' mode replaces any existing content)
with open('vocab.txt', 'w') as VocabFile:
    VocabFile.write(Vocab)

## 5. References
* Bird, S., Klein, E., & Loper, E. (2009). Natural language processing with Python. Beijing: O'Reilly.
* Is there a more Pythonic way to prevent adding a duplicate to a list? (n.d.). Retrieved from https://stackoverflow.com/questions/19834806/is-there-a-more-pythonic-way-to-prevent-adding-a-duplicate-to-a-list
