In [21]:
# imports
import matplotlib.pyplot as plt
from typing import List, Tuple, Dict
import numpy as np
import spacy
import os
import pandas as pd
import collections
%matplotlib inline

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
### Creating DataFrame

idir = "/home/akhilesh/civicdatalab/samantar_parsers/data/txts/"
df = pd.DataFrame()
all_data = []
for filename in os.listdir(idir):
    if filename.endswith('.txt'):
        with open(idir + filename, 'r') as f:
            data = f.readlines()
        stripped_data = list(map(lambda x: x.strip(), data))
        clean_data = ' '.join(stripped_data)
        corr = (filename.split(".")[0], clean_data)
        all_data.append(corr)

df = pd.DataFrame(all_data)
df.columns = ['year', 'content']

In [4]:
df.head()

Unnamed: 0,year,content
0,1973_-_1974_,SPEECH OF SHRI Y.B.CHAVAN MINISTER OF FINANCE ...
1,1976_-_1977_,SPEECH OF SHRI C.SUBRAMANIAM MINISTER OF FINAN...
2,1958_-_1959_,SPEECH OF SHRI JAWAHAR LAL NEHRU PRIME MINIST...
3,2001_-_2002_,1 Budget 2001-2002 Speech of Shri Yash...
4,1995_-_1996_,1 Budget 1995-96 Speech of Shri Manmohan...


In [5]:
df['tokenised'] = df.content.apply(lambda x: list(nlp.tokenizer(x)))

In [9]:
df['tokenised'] = df['tokenised'].apply(lambda x: str(x))

In [11]:
from measures.freq_based import sichel_s
from measures.freq_based import michea_m
from measures.freq_based import honore_h
from measures.freq_based import entropy
from measures.freq_based import yule_k
from measures.freq_based import simpson_d
from measures.freq_based import herdan_vm
from measures.freq_based import hdd

In [17]:
measures = dict(
    sichel_s=sichel_s,
    michea_m=michea_m,
    honore_h=honore_h,
    entropy=entropy,
    yule_k=yule_k,
    simpson_d=simpson_d,
    herdan_vm=herdan_vm,
    hdd=hdd
)

In [13]:
def get_freq_spectrum(tokens: List[str]) -> Dict[int, int]:
    freq_list = collections.Counter(tokens)
    freq_spectrum = dict(collections.Counter(freq_list.values()))
    return freq_spectrum

In [25]:
window_size = 100
def get_measure(tokens: List[str], measure: str) -> np.float64:
    func = measures[measure]
    total_measure = []
    if tokens:
    #     freq_spectrum = get_freq_spectrum(tokens)
    #     vocab_size = len(set(tokens))
    #     total_measure.append(func(vocab_size, freq_spectrum))
        for i in range(int(len(tokens) / window_size)):
            chunk = tokens[i * window_size : (i * window_size) + window_size]
            freq_spectrum = get_freq_spectrum(tokens)
            vocab_size = len(set(tokens))
            txt_len = len(tokens)
            result = func(vocab_size, freq_spectrum)
            total_measure.append(result)
        return np.mean(total_measure)
    return None

In [22]:
df['sichel_s'] = df['tokenised'].apply(lambda x: get_measure(x, 'sichel_s'))

  out=out, **kwargs)


In [27]:
df.to_csv('sichel_s.csv')

In [28]:
window_size = 100
def get_other_measure(tokens: List[str], measure: str) -> np.float64:
    func = measures[measure]
    total_measure = []
#     freq_spectrum = get_freq_spectrum(tokens)
#     vocab_size = len(set(tokens))
#     total_measure.append(func(vocab_size, freq_spectrum))
    for i in range(int(len(tokens) / window_size)):
        chunk = tokens[i * window_size : (i * window_size) + window_size]
        freq_spectrum = get_freq_spectrum(tokens)
#         vocab_size = len(set(tokens))
        txt_len = len(tokens)
        result = func(txt_len, freq_spectrum)
        total_measure.append(result)
    return np.mean(total_measure)

In [29]:
df['entropy'] = df['tokenised'].apply(lambda x: get_measure(x, 'entropy'))

In [30]:
df['yule_k'] = df['tokenised'].apply(lambda x: get_measure(x, 'yule_k'))

In [31]:
df['simpson_d'] = df['tokenised'].apply(lambda x: get_measure(x, 'simpson_d'))

In [32]:
df['hdd'] = df['tokenised'].apply(lambda x: get_measure(x, 'hdd'))

In [33]:
df.to_csv('most_of_freq_measures.csv')

In [36]:
def get_all_other_measure(tokens: List[str], measure: str) -> np.float64:
    func = measures[measure]
    total_measure = []
#     freq_spectrum = get_freq_spectrum(tokens)
#     vocab_size = len(set(tokens))
#     total_measure.append(func(vocab_size, freq_spectrum))
    for i in range(int(len(tokens) / window_size)):
        chunk = tokens[i * window_size : (i * window_size) + window_size]
        freq_spectrum = get_freq_spectrum(tokens)
        vocab_size = len(set(tokens))
        txt_len = len(tokens)
        result = func(txt_len, vocab_size, freq_spectrum)
        total_measure.append(result)
    return np.mean(total_measure)

In [38]:
df['honore_h'] = df['tokenised'].apply(lambda x: get_all_other_measure(x, 'honore_h'))

  out=out, **kwargs)


In [39]:
df['herdan_vm'] = df['tokenised'].apply(lambda x: get_all_other_measure(x, 'herdan_vm'))

In [41]:
df.to_csv('all_freq_measures.csv')

In [42]:
df.head()

Unnamed: 0,year,content,tokenised,sichel_s,entropy,yule_k,simpson_d,hdd,honore_h,herdan_vm
0,1973_-_1974_,SPEECH OF SHRI Y.B.CHAVAN MINISTER OF FINANCE ...,"[SPEECH, OF, SHRI, Y.B.CHAVAN, MINISTER, OF, F...",0.038961,-4174.927699,948283800.0,96062.553315,0.853017,1241.155909,0.275718
1,1976_-_1977_,SPEECH OF SHRI C.SUBRAMANIAM MINISTER OF FINAN...,"[SPEECH, OF, SHRI, C.SUBRAMANIAM, MINISTER, OF...",0.04,-5782.814368,1595972000.0,161735.77982,0.886989,1234.412281,0.273806
2,1958_-_1959_,SPEECH OF SHRI JAWAHAR LAL NEHRU PRIME MINIST...,"[SPEECH, OF, SHRI, JAWAHAR, LAL, NEHRU, , PRI...",0.025316,-3065.451087,607143800.0,61482.409932,0.875933,1231.231366,0.282953
3,2001_-_2002_,1 Budget 2001-2002 Speech of Shri Yash...,"[1, , Budget, , 2001, -, 2002, , Speech, ...",0.011628,-8626.306678,4572081000.0,462566.124487,0.911511,1267.230452,0.365504
4,1995_-_1996_,1 Budget 1995-96 Speech of Shri Manmohan...,"[1, , Budget, , 1995, -, 96, , Speech, , o...",0.050633,-7449.724035,3572864000.0,361846.910094,0.914448,1235.173825,0.364495


In [44]:
df.loc[df['hdd'].argmin()]

The current behaviour of 'Series.argmin' is deprecated, use 'idxmin'
instead.
The behavior of 'argmin' will be corrected to return the positional
minimum in the future. For now, use 'series.values.argmin' or
'np.argmin(np.array(values))' to get the position of the minimum
row.
  """Entry point for launching an IPython kernel.


year                                          1977_-_1978_(I)_
content      SPEECH OF SHRI H.M.PATEL MINISTER OF FINANCE  ...
tokenised    [SPEECH, OF, SHRI, H.M.PATEL, MINISTER, OF, FI...
sichel_s                                                     0
entropy                                               -154.563
yule_k                                             8.20423e+06
simpson_d                                              831.744
hdd                                                   0.692829
honore_h                                               963.709
herdan_vm                                             0.268495
Name: 32, dtype: object