# Functions
## Please note: the functions below are from the MBOD Comm313 class!
________

In [6]:
import requests
import time
import random
import json
import calendar
import datetime as dt
import os
import zipfile

from collections import Counter

from IPython.display import HTML
from bs4 import BeautifulSoup

%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import math
import re

import nltk


In [7]:
def tokenize(text, lowercase=False, strip_chars=''):
    '''Split a string into whitespace-delimited tokens, optionally normalizing it first.

    Args:
        text        -- the string to tokenize
        lowercase   -- when True, lowercase the text before splitting (default: False)
        strip_chars -- characters to delete from the text before splitting,
                       e.g. punctuation (default: empty string, nothing removed)

    Return:
        A list of token strings
    '''

    # The third argument of str.maketrans lists the characters that
    # translate() should delete outright.
    deletion_table = str.maketrans('', '', strip_chars)

    # Lowercasing happens before deletion, so strip_chars is matched
    # against the already-lowercased text.
    normalized = text.lower() if lowercase else text

    return normalized.translate(deletion_table).split()

In [1]:
#characters_to_remove = '!,.()[]"'

In [9]:
def get_ngram_tokens(tokens, n=1):
    '''Build the list of n-grams obtained by sliding a window of size n over tokens.

    Args:
        tokens -- a list of tokens
        n      -- number of consecutive tokens joined into each n-gram (default: 1)

    Returns:
        A list of n-gram strings, each made of n tokens joined by a single space.
        When n is smaller than 2 or larger than the number of tokens, the
        original tokens list is returned unchanged.
    '''

    if 2 <= n <= len(tokens):
        # One n-gram per valid window start position.
        last_start = len(tokens) - n
        return [" ".join(tokens[start:start + n]) for start in range(last_start + 1)]

    return tokens

In [10]:
def collocates(tokens, kw, win=(4, 4)):
    '''Return the collocates found in a window around every instance of a keyword.

    Args:
          tokens -- a list of tokens
          kw     -- keyword string to find and get collocates for
          win    -- a two-item sequence: number of tokens to take to the left (index 0)
                    and to the right (index 1) of each hit; default: (4, 4).
                    A side whose width is below 1 contributes nothing.

    Returns:
          a flat list of the tokens that fall inside the window around each
          instance of the keyword (left context first, then right context, hit by hit)
    '''
    left_width, right_width = win[0], win[1]

    context = []
    for hit, token in enumerate(tokens):
        if token != kw:
            continue

        if left_width >= 1:
            # Clamp at 0: a negative slice start would wrap around to the end
            # of the list and drop (or corrupt) the left context for hits
            # that sit near the beginning of the text.
            context.extend(tokens[max(0, hit - left_width):hit])

        if right_width >= 1:
            context.extend(tokens[hit + 1:hit + 1 + right_width])

    return context

In [11]:
def get_colls(texts, kw, win=(4, 4)):
    '''Create a collocate frequency list for instances of a keyword across several texts.

    Args:
        texts  -- the tokenized texts: either a mapping whose values are token
                  lists (the keys are ignored) or a plain iterable of token lists
        kw     -- keyword string to find and get collocates for
        win    -- a two-item sequence: number of tokens to left (index 0) and
                  right (index 1) to use; default: (4, 4)

    Returns:
        a 3-tuple (collocate_table, kw_freq, total_tokens) where
          collocate_table -- list of tuples (collocate, freq_with_kw, coll_total_freq)
          kw_freq         -- number of occurrences of kw over all texts,
                             or None when kw never occurs
          total_tokens    -- total number of tokens over all texts
    '''
    # Mappings contribute their values; anything else is taken to already be
    # an iterable of token lists.
    token_lists = texts.values() if hasattr(texts, 'values') else texts

    word_dist = Counter()
    colls = Counter()
    for tokens in token_lists:
        word_dist.update(tokens)
        colls.update(collocates(tokens, kw, win))

    coll_table = [(str(coll), freq, word_dist[coll]) for coll, freq in colls.items()]

    return coll_table, word_dist.get(kw), sum(word_dist.values())

In [12]:
def make_kwic(kw, text, win=4):
    '''A basic KWIC (keyword in context) function for a text

    Args:
        kw   -- keyword string; one KWIC line is produced per token equal to it
        text -- a list of tokens for the text
        win  -- number of context tokens kept on each side of the keyword (default: 4)

    Return:
        list of lines of form [ [left context words], kw, [right context words]]
        where both context lists hold exactly win items; contexts cut short by
        the start or end of the text are padded with empty strings (on the far
        left for the left context, on the far right for the right context)
    '''

    lines = []
    for pos, word in enumerate(text):
        if word != kw:
            continue

        # Clamp at 0: a negative slice start would wrap around to the end of
        # the text and lose the real left context for hits near the beginning.
        left = text[max(0, pos - win):pos]
        right = text[pos + 1:pos + win + 1]

        if len(left) < win:
            left = [''] * (win - len(left)) + left
        if len(right) < win:
            right = right + [''] * (win - len(right))

        lines.append([left, word, right])

    return lines

In [13]:
def print_kwic(kwic, win=None):
    '''Print a KWIC object, one aligned line per keyword hit.

    Args:
        kwic -- a list of KWIC lines of the form [ [left words], kw, [right words]]
        win  -- number of context words to show on each side; when None, the
                length of the first line's left context is used

    Prints each line as "<left context>  <kw>  <right context>" with the left
    context right-aligned in a field win*10 characters wide. Nothing is
    printed for an empty kwic.
    '''

    if not kwic:
        return

    if win is None:
        win = len(kwic[0][0])

    for line in kwic:
        # Keep the win words nearest the keyword on either side.
        left_text = ' '.join(line[0][-win:])
        pad_width = win * 10
        keyword = line[1]
        right_text = ' '.join(line[2][:win])

        print("{: >{}}  {}  {}".format(left_text, pad_width, keyword, right_text))

In [14]:
def sort_kwic(kwic, order=None):
    ''' sort a kwic list using the passed positional arguments

    Args:
        kwic   -- a list of lists [ [left tokens], kw, [right tokens]]
        order  -- a single positional argument or a list/tuple of them, each of
                  form side-pos, e.g. L1, R3, L4, where the position counts
                  outward from the keyword (default: None).
                  Terms that do not match this form are skipped.

    Returns:
        kwic sorted for each positional argument in reverse, i.e. ['R1','L1'] sorts
        first by L1 and then R1 (so R1 ends up as the primary key). The kwic list
        is sorted in place and also returned; order itself is left unmodified.

    Raises:
        IndexError -- when a position is larger than a line's context width
    '''
    if order is None:
        return kwic

    # Work on a private copy so the caller's list is never reversed in place.
    terms = list(order) if isinstance(order, (list, tuple)) else [order]

    for sort_term in reversed(terms):
        match = re.fullmatch(r'([LR])([1-9]\d*)', sort_term) if isinstance(sort_term, str) else None
        if match is None:
            # Not a valid side-pos term: ignore it rather than sort on a bogus key.
            continue

        side, distance = match.group(1), int(match.group(2))
        if side == 'L':
            # L1 is the word right before the keyword, i.e. the last item of
            # the left context; negative indexing works for any window size.
            column, index = 0, -distance
        else:
            column, index = 2, distance - 1

        # list.sort is stable, which is what makes the successive sorts compose.
        kwic.sort(key=lambda line, column=column, index=index: line[column][index])

    return kwic

_____________
# Custom Function for Dr Turow

In [None]:
def JTurow_POS_Extract(directory_with_txt_file, strip_chars=None):
    '''
    Etienne Jacquot - ASC IT SYSADMIN
    09/05/19

    Function for Dr Turow, extracting NLP data from Factiva articles related to Smart Devices:
    This is a draft work-in-progress, as in not entirely robust in terms of entering desired POS tags...

    Input - directory_with_txt_file:
        This should be a string value, the directory with the article file(s) in question
        (with or without a trailing path separator). Every file ending in .txt is read.
        Make sure each file is converted from the .rtf Factiva download to .txt utf-16

    Input - strip_chars (optional):
        String of characters to delete from the text before tokenizing. When omitted,
        a module-level ``characters_to_remove`` is used if one is defined, otherwise
        a default punctuation set.

    Output - total_most_common_nltk_NVJ:
        This is a Counter() object keyed by (token, POS_tag) tuples, counting the
        Nouns, Verbs, and Adjectives found in the articles contained in the text file(s)

    Raises:
        FileNotFoundError -- when the directory contains no .txt file
    '''

    if strip_chars is None:
        # Backward compatibility: earlier revisions read the notebook-level
        # global ``characters_to_remove``; fall back to a default punctuation
        # set when that global has not been defined.
        strip_chars = globals().get('characters_to_remove', '!,.()[]"')

    # Read file(s) of text data
    txt_files = sorted(item for item in os.listdir(directory_with_txt_file) if item.endswith('.txt'))
    if not txt_files:
        raise FileNotFoundError('No .txt file found in ' + str(directory_with_txt_file))

    lines_per_file = []
    for item in txt_files:
        print('Opening the following text file:', item, '\n')
        # os.path.join works whether or not the directory ends with a separator,
        # and the with-block guarantees the file handle is closed.
        with open(os.path.join(directory_with_txt_file, item), 'r', encoding="UTF-16") as handle:
            lines_per_file.append(handle.read().splitlines())

    # Capture data as a list of dictionaries, one per article
    print('Extracting data...\n')
    articles_total = []
    for lines in lines_per_file:
        article_txt = []
        for line in lines:
            article_txt.append(line)
            # Each article ends with this Document Line
            if line.startswith('Document ') and len(line) == 34:
                articles_total.append({'document_ID': line.split()[1],
                                       'article_text': article_txt,
                                       'tokens': [],
                                       'POS_tag': []
                                       })
                article_txt = []
        # Lines after the last Document line of a file belong to no article
        # and are dropped.
    print('Total number of articles in text file:', len(articles_total), '\n')

    # Extracting tokens from article texts and updating the dictionary
    print('Tokenizing text in each article (lowercase & removing the following chars:', strip_chars, ')\n')
    for article in articles_total:
        for line in article['article_text']:
            article['tokens'] += tokenize(line, strip_chars=strip_chars, lowercase=True)

    # Applying POS tagging
    print('Applying POS tags on tokens, please be patient...\n')
    # Every token is handed to the tagger on its own (a one-word "sentence"),
    # so the tagger sees no neighbouring words; the result for a given token is
    # therefore looked up once and reused for its later occurrences.
    tag_cache = {}
    for article in articles_total:
        for word in article['tokens']:
            if word not in tag_cache:
                tag_cache[word] = nltk.pos_tag([word])[0]
            article['POS_tag'].append(tag_cache[word])

    # Identifying words that match target POS tag
    print('Extracting most frequent tokens with POS Tags: NN*, V*, and JJ*', '\n')
    target_prefixes = ('NN', 'JJ', 'V')
    total_tagged = 0
    NNJJV_nltk_tokens = []
    other_tokens = []
    for article in articles_total:
        for tagged in article['POS_tag']:
            total_tagged += 1
            # Including Nouns, Verbs, Adjectives; everything else is set aside
            if tagged[1].startswith(target_prefixes):
                NNJJV_nltk_tokens.append(tagged)
            else:
                other_tokens.append(tagged)

    # Sanity check: every tagged token landed in exactly one of the two lists
    if total_tagged - len(NNJJV_nltk_tokens) == len(other_tokens):
        print('Successfully extracted all words that match target POS tagging!\n')

    # Counting most common words that match targeted POS
    print('Counting most frequent Nouns, Verbs, & Adjectives in file:\n')
    total_most_common_nltk_NVJ = Counter(NNJJV_nltk_tokens)
    print('Completed!\n')

    return total_most_common_nltk_NVJ