#### LSA

In [60]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.decomposition import TruncatedSVD
# If nltk stop word is not downloaded
# nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import numpy as np

In [4]:
df = pd.read_csv('reg2-dynamic.csv').values.tolist()
rules = []
for i in df:
    rules.append(i[0])

In [7]:
df = pd.DataFrame()
df["clean_documents"] = rules

In [9]:
stop_words = stopwords.words('english')

# tokenization
tokenized_doc = df['clean_documents'].fillna('').apply(lambda x: x.split())

# remove stop-words
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

# de-tokenization
detokenized_doc = []
for i in range(len(df)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

df['clean_documents'] = detokenized_doc

In [37]:
# TF-IDF vector
vectorizer = TfidfVectorizer(stop_words='english', smooth_idf=True)
X = vectorizer.fit_transform(df['clean_documents'])

# SVD represent documents and terms in vectors 
svd_model = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=100, random_state=122)
lsa = svd_model.fit_transform(X)

# Documents - Topic vector
pd.options.display.float_format = '{:,.16f}'.format
topic_encoded_df = pd.DataFrame(lsa, columns = ["topic_1", "topic_2"])
topic_encoded_df["documents"] = df['clean_documents']
display(topic_encoded_df[["documents", "topic_1", "topic_2"]])

Unnamed: 0,documents,topic_1,topic_2
0,18. (1) The authorised officer executing warra...,0.1014579883848737,-0.0185909005733574
1,(3)Research analyst research entity shall cond...,0.2799278597262391,-0.1094861439204884
2,(3)An investment adviser shall conduct yearly ...,0.3070436214896546,-0.0696399573883059
3,(3)All members board directors senior manageme...,0.2690732306662852,-0.1761671030611074
4,(2)The listed entity shall conduct meetings au...,0.2757193972610606,-0.0116788213238189
5,(5)(a) The board directors shall lay code cond...,0.343169367381966,-0.2018099176756506
6,4.(1) The listed entity listed securities shal...,0.3471973228039493,-0.0962431546837303
7,"23.A foreign portfolio investor shall, times, ...",0.2624174860285858,-0.1391336749518223
8,"(3)Every listed company, 36[intermediary] pers...",0.3369119359214588,-0.2449234323376657
9,“Every person required handle unpublished pric...,0.5305418132188672,-0.420646883082613


In [56]:
small_count_vectorizer = CountVectorizer(stop_words='english', max_features=40000)
small_text_sample = df['clean_documents'].sample(n=10000, random_state=0, replace=True).values

print('Headline before vectorization: {}'.format(small_text_sample[123]))

small_document_term_matrix = small_count_vectorizer.fit_transform(small_text_sample)

print('Headline after vectorization: \n{}'.format(small_document_term_matrix[123]))

Headline before vectorization: (3)The appointment managing director shall term exceeding five years: Provided post completion first term Managing Director, depository shall conduct appointment process afresh:
Headline after vectorization: 
  (0, 484)	2
  (0, 412)	1
  (0, 133)	1
  (0, 70)	2
  (0, 309)	2
  (0, 172)	2
  (0, 505)	2
  (0, 203)	1
  (0, 548)	1
  (0, 382)	1
  (0, 127)	1
  (0, 162)	1
  (0, 405)	1
  (0, 58)	1


In [42]:
# Define helper functions
def get_keys(topic_matrix):
    '''
    returns an integer list of predicted topic 
    categories for a given topic matrix
    '''
    keys = topic_matrix.argmax(axis=1).tolist()
    return keys

def keys_to_counts(keys):
    '''
    returns a tuple of topic categories and their 
    accompanying magnitudes for a given list of keys
    '''
    count_pairs = Counter(keys).items()
    categories = [pair[0] for pair in count_pairs]
    counts = [pair[1] for pair in count_pairs]
    return (categories, counts)

In [73]:
lsa_keys = get_keys(lsa)
lsa_keys

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1]

In [53]:
lsa_categories, lsa_counts = keys_to_counts(lsa_keys)

In [63]:
small_document_term_matrix[1]

<1x549 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [70]:
def get_top_n_words(n, keys, document_term_matrix, count_vectorizer):
    '''
    returns a list of n_topic strings, where each string contains the n most common 
    words in a predicted category, in order
    '''
    top_word_indices = []
    for topic in range(2):
        temp_vector_sum = 0
        for i in range(len(keys)):
            if keys[i] == topic:
                temp_vector_sum += document_term_matrix[i]
        temp_vector_sum = temp_vector_sum.toarray()
        top_n_word_indices = np.flip(np.argsort(temp_vector_sum)[0][-n:],0)
        top_word_indices.append(top_n_word_indices)   
    top_words = []
    for topic in top_word_indices:
        topic_words = []
        for index in topic:
            temp_word_vector = np.zeros((1,document_term_matrix.shape[1]))
            temp_word_vector[:,index] = 1
            the_word = count_vectorizer.inverse_transform(temp_word_vector)[0][0]
            topic_words.append(the_word.encode('ascii').decode('utf-8'))
        top_words.append(" ".join(topic_words))         
    return top_words

In [71]:
top_n_words_lsa = get_top_n_words(10, lsa_keys, small_document_term_matrix, small_count_vectorizer)

for i in range(len(top_n_words_lsa)):
    print("Topic {}: ".format(i+1), top_n_words_lsa[i])

Topic 1:  shall valuer valuation reit information regulations invit conduct board ensure
Topic 2:  shall conduct person manager trustee provided warrant unit regulations pertaining


In [38]:
var_explained = svd_model.explained_variance_ratio_.sum()
var_explained

0.10548284864443293

In [13]:
# Features or words used as features 
dictionary = vectorizer.get_feature_names()

# Term-Topic matrix
encoding_matrix = pd.DataFrame(svd_model.components_, index = ["topic_1","topic_2"], columns = (dictionary)).T

In [14]:
encoding_matrix

Unnamed: 0,topic_1,topic_2
06,0.0053882313816039,-0.0018722823637486
09,0.0053882313816040,-0.0018722823637485
10,0.0082635485846299,0.0023583478843440
10inserted,0.0053882313816040,-0.0018722823637485
11,0.0043806509590642,0.0036636312776291
...,...,...
witnesses,0.0058158011895448,-0.0013358428187164
writing,0.0159811140989248,0.0010494484462032
year,0.1978473145580819,0.3801991246753730
yearly,0.0895625273990487,0.1296274452022161


In [39]:
r = topic_encoded_df[["documents", "topic_1", "topic_2"]].values.tolist()
len(r)

33

In [74]:
r0 = []
r1 = []
t = 0
for i in lsa_keys:
    if(i==0):
        r0.append(r[t])
    else:
        r1.append(r[t])
    t=t+1

In [75]:
r0

[['18. (1) The authorised officer executing warrant authority, shall,- (a)identify either name official identification documents; (b)show warrant authority person occupies premises person charge vessel, vehicle aircraft authorised searched request provide copy warrant authority: Provided person present copy warrant shall pasted prominent place premises, vessel, vehicle aircraft authorised searched; (c)conduct search seizure any, presence witnesses prepare panchnama prescribed Form-G Schedule witnesses available. (d)deliver copy seizure memo prepared regulation 14 person whose possession control documents seized;',
  0.10145798838487373,
  -0.018590900573357407],
 ['(3)Research analyst research entity shall conduct annual audit respect compliance regulations member Institute Chartered Accountants India Institute Company Secretaries India.',
  0.27992785972623907,
  -0.10948614392048844],
 ['(3)An investment adviser shall conduct yearly audit respect compliance regulations member Institu

In [29]:
si = '(5)(a) The board directors shall lay code conduct members board directors senior management listed entity. (b)The code conduct shall suitably incorporate duties independent directors laid Companies Act, 2013.'
sj = '10.While arriving settlement terms, factors indicated Schedule-II may considered, including limited, following: (a)conduct applicant specified proceeding, investigation, inspection audit; (b)the role played applicant case alleged default committed group persons; (c)nature, gravity impact alleged defaults; (d)whether proceeding applicant non-compliance securities laws pending concluded; (e)the extent harm and/or loss investors’ and/or gains made applicant; (f)processes introduced since alleged default minimize future defaults lapses; (g)compliance schedule proposed applicant; (h)economic benefits accruing person non-compliance delayed compliance; (i)conditions necessary deter future non-compliance another person; (j)satisfaction claim investors regarding payment money due delivery securities them; (k)any enforcement action taken applicant violation; (l)any factors necessary facts circumstances case. 9'

In [22]:
sj = 'CS company is awesome code of conduct'

In [30]:
s = []
s.append(si)
s.append(sj)
sentences = s
k=[]
j=0
for i in s:
    k = i.split()
    sentences[j] = k
    j=j+1

In [2]:
%store -r X1w2v
%store -r X2w2v

In [31]:
#Vector value calculation for regulations
X1 = X1w2v  #Vector values
X2 = X2w2v   #Tokens
X0 = []
val1 = []
for i in sentences:
    val = [0] * 100
    c = 0
    for j in i:
        p=0
        while(p<len(X2)):
            if(j == X2[p]):
                break
            p=p+1
        val2 = X1[p]
        for ct in range(0, len(val)): 
            val1.append(val[ct] + val2[ct])
        val = val1
        val1 = []
        c=c+1
    d = []
    for l in val:
        l = l/c
        d.append(l)
    val = d
    X0.append(val)

In [32]:
#Cosine similarity

from numpy import dot
from numpy.linalg import norm

In [33]:
#Cluster0
cos0 = []
i=0
while(i<len(X0)-1):
    j=len(X0)-1
    while(j>i):
        cos_sim = dot(X0[i], X0[j])/(norm(X0[i])*norm(X0[j]))
        cos0.append(cos_sim)
        j=j-1
    i=i+1
cos0

[0.9973332098020811]