In [1]:
import pandas as pd
from collections import Counter 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.util import ngrams

### Reading in the data

In [2]:
# Load the validation filings table from the project's data/ directory
# (path is relative to the notebook's working directory) and preview it.
filings = pd.read_csv('data/nc_validation_filings.csv')
filings.head()

Unnamed: 0,ticker,accession_number
0,ACAD,0001564590-18-003526
1,ACC,0001283630-18-000024
2,AFSI,0001365555-18-000052
3,AHL,0001267395-18-000024
4,AKRX,0001628280-18-002518


In [3]:
# Load the labelled share-repurchase paragraphs (path is relative to the
# notebook's working directory).
purchase = pd.read_csv('data/share_repurchase_paragraphs.csv')
# Keep only the Apple rows as a small, familiar sample to eyeball.
aapl = purchase.loc[purchase['ticker'].eq('AAPL')]
aapl

Unnamed: 0,ticker,accession_number,data_key_friendly_name,text,data_value,reported_data_value,reported_units,paragraph_text
15,AAPL,0000320193-17-000070,Share Repurchase Authorization,share repurchase authorization,210000000000,210.0,billions,"Share Repurchase Program In May 2017, the Co..."
16,AAPL,0000320193-17-000070,Share Repurchase Utilization,utilized,166000000000,166.0,billions,"Share Repurchase Program In May 2017, the Co..."
17,AAPL,0000320193-17-000070,Amount Spent on Share Repurchases,Total open market common stock repurchases,18001000000,18001.0,millions,Number of Average ...
18,AAPL,0000320193-17-000070,Amount Spent on Share Repurchases,February 2017 ASR,3000000000,3000.0,millions,Number of Average ASR ...
19,AAPL,0000320193-17-000070,Share Repurchase Count,May 2017 ASR,20108000,20108.0,thousands,Number of Average ASR ...
20,AAPL,0000320193-17-000070,Share Repurchase Count,August 2017 ASR,15069000,15069.0,thousands,Number of Average ASR ...
21,AAPL,0000320193-17-000070,Share Repurchase Count,Total open market common stock repurchases,134832000,134832.0,thousands,Number of Average ...
22,AAPL,0000320193-17-000070,Share Repurchase Count,November 2016 ASR,51157000,51157.0,thousands,Number of Average ASR ...
23,AAPL,0000320193-17-000070,Share Repurchase Count,February 2017 ASR,20949000,20949.0,thousands,Number of Average ASR ...
24,AAPL,0000320193-17-000070,Share Repurchase Intention,Total,44023000000,44023.0,millions,Total Number of Approximate Dollar Valu...


### Exploratory Data Analysis

In [4]:
# Distinct label values found in the data_key_friendly_name column,
# in order of first appearance.
key_text = purchase.data_key_friendly_name.unique()
key_text

array(['Share Repurchase Authorization Date',
       'Share Repurchase Authorization', 'Share Repurchase Intention',
       'Share Repurchase Count', 'Amount Spent on Share Repurchases',
       'Share Repurchase Utilization', 'Unknown Share Repurchase Data'], dtype=object)

In [23]:
# Short labelled phrases (the `text` column) for rows tagged
# 'Share Repurchase Authorization'; the original row index is preserved.
text = purchase.loc[
    purchase["data_key_friendly_name"].eq('Share Repurchase Authorization'),
    'text',
]
text

1       The 2015 share repurchase program authorizes t...
6                   share repurchase programs aggregating
13                                     repurchase program
15                         share repurchase authorization
32                                             authorized
34      the Company's board of directors authorized a ...
38                          authorization was in addition
39                              authorized the repurchase
46                                             repurchase
50      In the aggregate, the Board of Directors has a...
58          total stock repurchase authorization of up to
80              the Company's Board of Directors approved
87      our Board of Directors authorized an additiona...
94      publicly announced share repurchase authorizat...
100                   the Board of Directors authorized a
109     we announced that our Board of Directors had a...
115                   the Board of Directors authorized a
122           

### Processing the data: tokenizing, removing stop words, and picking the most common words for each `data_key_friendly_name`

In [6]:
# Lower-case each authorization phrase and split it into word tokens
# (one token list per phrase).
tokens = [word_tokenize(phrase.lower()) for phrase in text]
def flatten(lst):
    """Return a new flat list with every nested ``list`` in ``lst`` expanded.

    Only ``list`` instances are expanded; any other element (strings,
    tuples, numbers, ...) is kept as a single item. Element order is
    preserved and the input is not modified.

    Parameters
    ----------
    lst : iterable
        Items to flatten; may contain lists nested to any depth.

    Returns
    -------
    list
        The non-list items of ``lst`` in depth-first, left-to-right order.
    """
    flat = []
    # An explicit stack of iterators replaces recursion, so nesting depth is
    # not bounded by the interpreter's recursion limit, and appending to one
    # result list keeps the work linear (repeated list concatenation via
    # sum(..., []) re-copies the accumulator on every step).
    pending = [iter(lst)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend into the nested list; resume this level afterwards.
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            # Current level exhausted.
            pending.pop()
    return flat
# Merge the per-phrase token lists into one flat list of tokens.
list_tokens = flatten(tokens)
# NLTK's English stop-word list, held as a set for fast membership tests.
english_stops = set(stopwords.words('english'))
# Drop stop words only; punctuation and numeric tokens are kept.
no_stops = list(filter(lambda tok: tok not in english_stops, list_tokens))
# Token frequencies for the authorization phrases; show the 20 most common.
repurchase_text = Counter(no_stops)
repurchase_text.most_common(20)



[('repurchase', 260),
 ('board', 227),
 ('authorized', 202),
 ('directors', 188),
 ('program', 126),
 ('company', 93),
 ('share', 78),
 ("'s", 66),
 ('approved', 63),
 ('stock', 54),
 ('additional', 52),
 ('authorization', 50),
 (',', 42),
 ('new', 37),
 ('announced', 31),
 ('purchase', 29),
 ('common', 25),
 ('aggregate', 19),
 ('authorizing', 18),
 ('total', 16)]

In [8]:
# Full paragraph text (the `paragraph_text` column) for rows tagged
# 'Share Repurchase Authorization'; the original row index is preserved.
para_text = purchase.loc[
    purchase["data_key_friendly_name"].eq('Share Repurchase Authorization'),
    'paragraph_text',
]
para_text.head()

1       On May 28, 2015 we  announced that our board  ...
6       4. Share Repurchase Programs and Dividends   S...
13      The Company's stock repurchase program allows ...
15      Share Repurchase Program   In May 2017, the Co...
32      On February 15, 2018, AbbVie's board of direct...
34      In November 2016, the  Company's board of dire...
38      In September 2014, the board of directors auth...
39      In September 2014, the board of directors auth...
46      Stock Repurchase Program   To facilitate  our ...
50      Common Stock Repurchases   The Company's commo...
58      In January 2017, the Company's Board of Direct...
80      On September 25, 2017, the Company's Board of ...
87      Our Board of Directors has authorized the repu...
94      Shares purchased pursuant to the November 14, ...
100     13. Stockholders' Equity   Stock Repurchase Pr...
109     April 2016 Repurchase Program   On April 28, 2...
115     Common Stock Repurchase   In May 2014, the Boa...
122     Common

In [7]:
# alpha_token =  [t for t in list_tokens if t.isalpha()]
# alpha_token

In [20]:
# Lower-case each authorization paragraph and split it into word tokens
# (one token list per paragraph).
para_tokens = [word_tokenize(paragraph.lower()) for paragraph in para_text]
# word_tokenize returns a flat list of strings per paragraph, so a single
# level of flattening is enough. The comprehension is linear in the number of
# tokens and avoids redefining the notebook's flatten() helper in this cell.
list_tokens = [token for paragraph_tokens in para_tokens for token in paragraph_tokens]
# NLTK's English stop-word list, held as a set for fast membership tests.
english_stops = set(stopwords.words('english'))
# Drop stop words only; punctuation and numeric tokens are kept.
no_stops = [t for t in list_tokens if t not in english_stops]
# Token frequencies over the full paragraphs; show the 20 most common.
repurchase_para = Counter(no_stops)
repurchase_para.most_common(20)


[(',', 2207),
 ('$', 1063),
 ('repurchase', 1046),
 ('stock', 788),
 ('program', 759),
 ('.', 728),
 ('2017', 573),
 ('common', 548),
 ('board', 529),
 ('company', 525),
 ('million', 522),
 ('share', 511),
 ('billion', 501),
 ('shares', 491),
 ('directors', 441),
 ('authorized', 412),
 ("'s", 349),
 ('2016', 256),
 ('december', 254),
 ('repurchases', 244)]

In [26]:
# Labelled phrases (the `text` column) for three more label values, each
# selected with a boolean mask on data_key_friendly_name.
intent_text = purchase.loc[
    purchase["data_key_friendly_name"].eq('Share Repurchase Intention'), 'text'
]
util_text = purchase.loc[
    purchase["data_key_friendly_name"].eq('Share Repurchase Utilization'), 'text'
]
unk_repurchase_text = purchase.loc[
    purchase["data_key_friendly_name"].eq('Unknown Share Repurchase Data'), 'text'
]


In [None]:
# bag_repurchase = ['Board', 'Directors','authoriz','purchase','approve','program']

In [27]:
# Lower-case each 'Share Repurchase Intention' phrase and split it into word
# tokens (one token list per phrase).
intent_tokens = [word_tokenize(phrase.lower()) for phrase in intent_text]
# word_tokenize returns a flat list of strings per phrase, so a single level
# of flattening is enough. The comprehension is linear in the number of
# tokens and avoids redefining the notebook's flatten() helper in this cell.
list_tokens = [token for phrase_tokens in intent_tokens for token in phrase_tokens]
# NLTK's English stop-word list, held as a set for fast membership tests.
english_stops = set(stopwords.words('english'))
# Drop stop words only; punctuation and numeric tokens are kept.
no_stops = [t for t in list_tokens if t not in english_stops]
# NOTE(review): this rebinds `intent_text` from the Series of phrases to a
# Counter. Re-running this cell on its own would then tokenize the Counter's
# keys instead of the phrases; re-run the cell that selects `intent_text`
# first. The name is kept because later cells may rely on it.
intent_text = Counter(no_stops)
intent_text.most_common(20)

[('repurchase', 104),
 ('2017', 76),
 (',', 72),
 ('program', 62),
 ('remaining', 58),
 ('december', 49),
 ('authorization', 44),
 ('available', 40),
 ('31', 39),
 ('share', 38),
 ('remained', 34),
 ('board', 34),
 ('authorized', 34),
 ('stock', 29),
 ('total', 26),
 ('company', 22),
 ('directors', 22),
 ('1', 21),
 ('-', 17),
 ('common', 16)]

In [28]:
# Lower-case each 'Share Repurchase Utilization' phrase and split it into
# word tokens (one token list per phrase).
util_tokens = [word_tokenize(phrase.lower()) for phrase in util_text]
# word_tokenize returns a flat list of strings per phrase, so a single level
# of flattening is enough. The comprehension is linear in the number of
# tokens and avoids redefining the notebook's flatten() helper in this cell.
list_tokens = [token for phrase_tokens in util_tokens for token in phrase_tokens]
# NLTK's English stop-word list, held as a set for fast membership tests.
english_stops = set(stopwords.words('english'))
# Drop stop words only; punctuation and numeric tokens are kept.
no_stops = [t for t in list_tokens if t not in english_stops]
# NOTE(review): this rebinds `util_text` from the Series of phrases to a
# Counter. Re-running this cell on its own would then tokenize the Counter's
# keys instead of the phrases; re-run the cell that selects `util_text`
# first. The name is kept because later cells may rely on it.
util_text = Counter(no_stops)
util_text.most_common(20)

[('repurchased', 14),
 ('total', 12),
 (',', 12),
 ('cost', 11),
 ('aggregate', 9),
 ('2017', 8),
 ('company', 7),
 ('approximately', 5),
 ('program', 5),
 ('repurchase', 5),
 ('december', 4),
 ('purchased', 4),
 ('share', 4),
 ('shares', 3),
 ('31', 3),
 ('authorization', 3),
 ('inception', 2),
 ('amount', 2),
 ('board', 2),
 ('directors', 2)]

In [29]:
# Lower-case each 'Unknown Share Repurchase Data' phrase and split it into
# word tokens (one token list per phrase).
unk_tokens = [word_tokenize(phrase.lower()) for phrase in unk_repurchase_text]
# word_tokenize returns a flat list of strings per phrase, so a single level
# of flattening is enough. The comprehension is linear in the number of
# tokens and avoids redefining the notebook's flatten() helper in this cell.
list_tokens = [token for phrase_tokens in unk_tokens for token in phrase_tokens]
# NLTK's English stop-word list, held as a set for fast membership tests.
english_stops = set(stopwords.words('english'))
# Drop stop words only; punctuation and numeric tokens are kept.
no_stops = [t for t in list_tokens if t not in english_stops]
# Token frequencies for the unknown-label phrases; show the 20 most common.
unk_text = Counter(no_stops)
unk_text.most_common(20)

[('repurchase', 79),
 ('board', 58),
 ('authorized', 56),
 ('directors', 53),
 ('program', 31),
 ('company', 27),
 ('shares', 25),
 ("'s", 21),
 ('authorization', 18),
 ('share', 15),
 (',', 13),
 ('approved', 12),
 ('available', 12),
 ('purchase', 12),
 ('repurchased', 11),
 ('total', 11),
 ('december', 11),
 ('2017', 9),
 ('stock', 9),
 ('additional', 8)]

### N-gram analysis of the `text` field by `data_key_friendly_name`

In [30]:
    
def get_ngrams(text, n):
    """Return the word n-grams of ``text`` as space-joined strings.

    Parameters
    ----------
    text : str
        The phrase to tokenize with ``word_tokenize``.
    n : int
        Number of consecutive tokens per n-gram.

    Returns
    -------
    list of str
        One string per n-gram, tokens joined by a single space.
    """
    # Tokenize the argument itself rather than a module-level variable, so
    # the result does not depend on how the caller names its loop variable.
    n_grams = ngrams(word_tokenize(text), n)
    return [' '.join(grams) for grams in n_grams]


# NOTE: n=3 is passed, so these are trigrams even though the variable names
# say "bigrams"; the names are kept so other cells that use them still work.
# The phrases are not lower-cased here, so 'Board of Directors' and
# 'board of directors' are counted as different n-grams.
bigrams_text = [get_ngrams(phrase, 3) for phrase in text]
list_bigrams = flatten(bigrams_text)
english_stops = set(stopwords.words('english'))
# Entries are space-joined n-grams, so this only removes an entry that is
# exactly equal to a stop word; n-grams containing stop words are kept.
no_stops = [gram for gram in list_bigrams if gram not in english_stops]
# N-gram frequencies for the authorization phrases; show the 20 most common.
repurchase_text_bi = Counter(no_stops)
repurchase_text_bi.most_common(20)


  after removing the cwd from sys.path.


[('Board of Directors', 158),
 ('of Directors authorized', 88),
 ('of up to', 84),
 ('the repurchase of', 55),
 ('our Board of', 54),
 ('share repurchase program', 52),
 ("'s Board of", 46),
 ('authorized the repurchase', 42),
 ('the Board of', 41),
 ("the Company 's", 38),
 ('repurchase up to', 38),
 ('of Directors approved', 37),
 ('repurchase of up', 37),
 ('to repurchase up', 35),
 ('Directors authorized a', 34),
 ("Company 's Board", 33),
 ('Directors authorized the', 31),
 ('stock repurchase program', 31),
 ('Directors approved a', 30),
 ('board of directors', 28)]