In [43]:
import json

In [44]:
def readFile(filepath):
    """Load and return the parsed contents of a JSON file.

    Parameters
    ----------
    filepath : str
        Path of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file. The rest of this
        notebook treats the result as a list of story dicts.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # Pin the encoding: without it open() falls back to the platform's locale
    # encoding, which can mis-decode non-ASCII story text on some systems.
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)

In [45]:
# Load the parsed contents of the cleaned "scale" JSON file.
stories = readFile('Cleaned/cleaned_scale.json')

In [46]:
def frequency(stories):
    """Count how often each non-numeric word occurs across all stories.

    Parameters
    ----------
    stories : iterable of dict
        Each story must have a 'cleaned_text' entry that yields string
        tokens when iterated.

    Returns
    -------
    list of (str, int)
        (word, count) pairs ordered from most to least frequent. Words with
        equal counts keep the order in which they were first encountered.
    """
    counts = {}

    for story in stories:
        for token in story['cleaned_text']:
            # Purely numeric tokens are excluded from the tally.
            if not token.isnumeric():
                counts[token] = counts.get(token, 0) + 1

    ranked = list(counts.items())
    ranked.sort(key=lambda pair: pair[-1], reverse=True)
    return ranked

In [108]:
def formatFrequencyList(frequency_list, remove_count=False):
    """Convert (word, count) tuples into an export- or label-friendly list.

    Parameters
    ----------
    frequency_list : iterable of (word, count)
        Pairs such as those returned by ``frequency``.
    remove_count : bool, optional
        When truthy, return only the words. When falsy (the default), return
        one ``{'word': ..., 'count': ...}`` dict per pair. The default lets
        callers that pass only ``frequency_list`` keep working.

    Returns
    -------
    list
        A list of words, or a list of word/count dicts, in input order.
    """
    if remove_count:
        return [entry[0] for entry in frequency_list]

    return [{'word': entry[0], 'count': entry[1]} for entry in frequency_list]

In [99]:
def removeDuplicates(collection):
    """Return the stories of ``collection`` with repeated titles dropped.

    Parameters
    ----------
    collection : iterable of dict
        Stories; each must have a hashable 'title' entry.

    Returns
    -------
    list of dict
        The first story seen for each title, in original order. The story
        objects themselves are not copied.
    """
    seen_titles = set()
    unique_stories = []

    for entry in collection:
        key = entry['title']
        if key not in seen_titles:
            seen_titles.add(key)
            unique_stories.append(entry)

    return unique_stories

In [100]:
def createCollection(filepaths):
    """Load stories from one JSON file or from several merged files.

    Parameters
    ----------
    filepaths : str or list of str
        A single path, or a list of paths to read with ``readFile``.

    Returns
    -------
    list
        For a single path string: the file's contents exactly as loaded
        (no de-duplication is applied on this path).
        For a list of paths: the concatenated contents of every file, with
        stories whose title repeats removed via ``removeDuplicates``.

    Raises
    ------
    ValueError
        If ``filepaths`` is ``None`` or an empty list.
    """
    # The earlier check `filepaths == [] or None` never tested for None
    # (`or None` is always falsy), so None fell through to the loop below and
    # failed with an unrelated TypeError. Test for it explicitly.
    if filepaths is None:
        raise ValueError('var input filepaths is None')

    if filepaths == []:
        raise ValueError('var input filepaths is an Empty Array')

    if isinstance(filepaths, str):
        return readFile(filepaths)

    stories = []
    for filepath in filepaths:
        stories += readFile(filepath)

    return removeDuplicates(stories)

In [101]:
def extract(filepaths, keyword, export_file_extension):
    """Build a word-frequency list from story files and write it out as JSON.

    Parameters
    ----------
    filepaths : str or list of str
        Input path(s), passed to ``createCollection``.
    keyword : str
        Inserted into the output file name.
    export_file_extension : str
        Appended to the output file name (include the leading dot).

    Returns
    -------
    None
        The result is written to
        ``'Frequency/frequency_list_' + keyword + export_file_extension``.
        ``open`` does not create directories, so ``Frequency/`` must already
        exist.
    """
    stories = createCollection(filepaths)
    frequency_list = frequency(stories)

    # formatFrequencyList needs the remove_count flag; calling it with a
    # single argument raised TypeError. False keeps the counts in the export.
    formatted = formatFrequencyList(frequency_list, False)

    output_path = 'Frequency/frequency_list_' + keyword + export_file_extension
    with open(output_path, 'w') as data:
        json.dump(formatted, data)

In [51]:
# Build the word-frequency list from both cleaned files and export it as JSON

extract(['Cleaned/cleaned_scale.json', 'Cleaned/cleaned_not_scale.json'], 'scale_all', '.txt')

In [52]:
# Helper Functions

In [53]:
def display(frequency_list, list_size):
    """Print the first ``list_size`` entries of ``frequency_list``, one per line.

    Parameters
    ----------
    frequency_list : list
        Entries to print, e.g. the (word, count) tuples from ``frequency``.
    list_size : int
        Maximum number of entries to print. Values larger than the list are
        clamped to its length; zero or negative values print nothing.

    Returns
    -------
    None

    Notes
    -----
    In an IPython/Jupyter kernel this name shadows the built-in rich
    ``display`` helper for the rest of the session.
    """
    # Slicing instead of indexing avoids an IndexError when fewer than
    # list_size entries exist; max() keeps negative sizes from slicing
    # off the end of the list.
    for entry in frequency_list[:max(list_size, 0)]:
        print(entry)

In [109]:
# inclusive: contents of both files merged, with repeated titles dropped.
inclusive = createCollection(['Cleaned/cleaned_scale.json', 'Cleaned/cleaned_not_scale.json'])
# contains / not_contains: a single path string is returned exactly as loaded,
# so these two collections are NOT de-duplicated.
contains = createCollection('Cleaned/cleaned_scale.json')
not_contains = createCollection('Cleaned/cleaned_not_scale.json')

In [115]:
def label(stories, frequency_list):
    """Flag, per story, which of the frequent words appear in its text.

    Parameters
    ----------
    stories : iterable of dict
        Each story must have a 'cleaned_text' entry.
    frequency_list : iterable of (word, count)
        Pairs such as those returned by ``frequency``; only the words are used.

    Returns
    -------
    list of dict
        One dict per story, in input order, mapping every word of
        ``frequency_list`` to True if it occurs in that story's
        'cleaned_text' and False otherwise.
    """
    # Reduce the (word, count) tuples to bare words so they can be dict keys.
    words = formatFrequencyList(frequency_list, True)

    all_lists = []

    for story in stories:
        # One set per story gives constant-time lookups instead of a linear
        # scan of the text for every word.
        # Assumes 'cleaned_text' is a collection of word tokens (as
        # frequency() iterates it), not a single string.
        present = set(story['cleaned_text'])
        all_lists.append({word: word in present for word in words})

    return all_lists

In [121]:
# Examples 

In [124]:
# Keep only the first entry of the file. Note: this rebinds the notebook-level
# `stories` name to a one-element slice.
stories = readFile('Cleaned/cleaned_scale.json')[0:1]

# First ten (word, count) pairs from frequency(contains), i.e. the ten most frequent words.
frequency_list = frequency(contains)[0:10]

# Show which of those ten words occur in the selected story.
label(stories, frequency_list)

[{'one': True,
  'also': True,
  'scale': True,
  'people': True,
  'year': True,
  'like': True,
  'years': True,
  'time': True,
  'many': True,
  'could': True}]

In [125]:
# Each printed tuple is (word, count): index 1 is the occurrence count

display(frequency(contains), 10)

('one', 1489)
('also', 1422)
('scale', 1412)
('people', 1373)
('year', 1368)
('like', 1358)
('years', 1331)
('time', 1283)
('many', 1258)
('could', 1248)
