In [1]:
from docx import Document
import pandas as pd
import numpy as np
import os
import re
import matplotlib.pyplot as plt


## Parsing data from Doc

In [2]:
rules={1:'termination|terminate',
       2:'payment|pay|bill|billing',
       3:'solicitation|solicit',
       4:'insurance',
       5:'remedy'
      }

In [3]:
def doc_tolist(doc,pdf_path,rules):
    df_list=[]
    for i,d in enumerate(doc):
        for r_idx in range(1,6):
            res=re.search(rules[r_idx],d.lower())
            if res and len(d)>30:
                df_list.append([i,d,pdf_path,rules[r_idx]])
    return df_list

def get_doc_list(dl,path):
#     print(path) 
    document=None
    document = Document(path)
    doc=[x.text for x in document.paragraphs if not x.text.isspace() and len(x.text)>0]
    rules={1:'termination|terminate',
       2:'payment|pay|bill|billing',
       3:'solicitation|solicit',
       4:'insurance',
       5:'remedy'
      }
    dl+=doc_tolist(doc,path,rules)
    return dl

In [4]:
dl=[]
root_path='../data/contract/docs'
for d_name in os.listdir(root_path):
    if d_name!='.DS_Store':
        temp_path=os.path.join(root_path,d_name)
        dl=get_doc_list(dl,temp_path)
df=pd.DataFrame(dl,columns=['index','content','doc','label'])

## Data processing

In [5]:
set(df.label)

{'insurance',
 'payment|pay|bill|billing',
 'remedy',
 'solicitation|solicit',
 'termination|terminate'}

In [6]:
plain_text_insurance=' '.join(df[df.label=='insurance'].content)
plain_text_pay=' '.join(df[df.label=='payment|pay|bill|billing'].content)
plain_text_remedy=' '.join(df[df.label=='remedy'].content)
plain_text_solicitation=' '.join(df[df.label=='solicitation|solicit'].content)
plain_text_termination=' '.join(df[df.label=='termination|terminate'].content)

In [None]:
plain_text=[plain_text_insurance,plain_text_pay,\
            plain_text_remedy,plain_text_solicitation,plain_text_termination]

### data pipeline

In [205]:
plain_text=[replain_text=[re.sub("[0-9]+|[,\.\-%#\\“\”'\/\$\)\(]|\t",'',x.lower()) \
                          for x in plain_text].sub("[0-9]+|[,\.\-%#\\“\”'\/\$\)\(]|\t",'',x.lower()) for x in plain_text]

## Model selection

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from nltk.stem.snowball import SnowballStemmer
from nltk import WordNetLemmatizer, pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.stem.porter import *
from nltk.stem import *

In [173]:
vectorizer = TfidfVectorizer(max_df=0.2,stop_words='english')
vectors = vectorizer.fit_transform(plain_text)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df_content = pd.DataFrame(denselist, columns=feature_names)

In [174]:
df_content.shape

(5, 1148)

In [175]:
stemmer = SnowballStemmer('english', ignore_stopwords=False)

class StemmedTfidfVectorizer(TfidfVectorizer):
    
    def __init__(self, stemmer, *args, **kwargs):
        super(StemmedTfidfVectorizer, self).__init__(*args, **kwargs)
        self.stemmer = stemmer
        
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (self.stemmer.stem(word) for word in analyzer(doc.replace('\n', ' ')))

In [182]:
vectorizer_stem_u = StemmedTfidfVectorizer(stop_words='english',stemmer=stemmer, sublinear_tf=True)
vectors= vectorizer_stem_u.fit_transform(plain_text)
feature_names = vectorizer_stem_u.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df_content_stem = pd.DataFrame(denselist, columns=feature_names)

In [183]:
df_content_stem

Unnamed: 0,abandon,abid,abil,abl,abovement,absenc,absent,absente,absolut,abstain,...,workplac,workweek,world,write,written,xi,xvi,xxxxx,year,york
0,0.0,0.0,0.0,0.0,0.0,0.032655,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.034049,0.03831,0.0,0.0,0.0,0.027471,0.0
1,0.021612,0.021612,0.041284,0.033202,0.015821,0.0,0.010596,0.0,0.015821,0.015821,...,0.012764,0.015821,0.015821,0.036465,0.033898,0.0,0.0,0.015821,0.029437,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.030985,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.041217,0.024344,0.0,0.0,0.0,...,0.0,0.0,0.0,0.048868,0.066394,0.0,0.0,0.0,0.082826,0.0
4,0.015558,0.015558,0.0,0.0,0.0,0.021867,0.012915,0.019284,0.0,0.0,...,0.037127,0.0,0.0,0.042266,0.045319,0.019284,0.019284,0.0,0.048173,0.019284


### Model
loss=croos_entropy(logist(w*v/len(v)),y)

In [6]:
from gensim import corpora,similarities,models  
# import jieba  

In [7]:
stemmer = PorterStemmer()
corpora_documents = []  
plain_text_list=[x.split() for x in df.content]
for p in plain_text_list:
    corpora_documents.append([ stemmer.stem(w) for w in p])

In [9]:
dictionary = corpora.Dictionary(corpora_documents)  
corpus = [dictionary.doc2bow(text) for text in corpora_documents]  
tfidf_model = models.TfidfModel(corpus) 
corpus_tfidf = tfidf_model[corpus]  
similarity = similarities.Similarity('Similarity-tfidf-index', \
                                     corpus_tfidf, num_features=700)  

In [10]:
def compare_similarity(text):
    test_data_1 = text
    test_data_1=re.sub("[0-9]+|[,\.\-%#\\“\”'\/\$]|\)|\(|\t|$",' ',\
                       test_data_1.lower()).split()
    test_data_1=[stemmer.stem(x) for x in test_data_1]
    test_corpus_1 = dictionary.doc2bow(test_data_1)
    similarity.num_best = 5  
    test_corpus_tfidf_1=tfidf_model[test_corpus_1]
    return similarity[test_corpus_tfidf_1]

In [11]:
label_dict={0:'insurance',
           1:'payment',
           2:'remedy',
           3:'non-solicitation',
           4:'termination'}

In [None]:
for it, row in df.iterrows():
#     print(row.content,row.label)
    print(compare_similarity(row.content)[0],row.label)
    break

In [10]:
a='This agreement may be terminated immediately by TSI for poor performance, poor conduct, poor work habits or inability to work as a team member on the part of “Subcontractor” employees/consultants, as determined solely by TSI or TSI’s end client. In addition, this agreement may be terminated by TSI upon 5 days written notification by TSI or TSI’s end client without specific reason. Notwithstanding the above, it is TSI’s intent and desire for this agreement to run full term and be extended. termination|terminate'

In [11]:
a=a.lower()

In [None]:
test_data_1 = a
test_data_1=re.sub("[0-9]+|[,\.\-%#\\“\”'\/\$]|\)|\(|\t|$",' ',\
                       test_data_1.lower()).split()
test_data_1=[stemmer.stem(x) for x in test_data_1]
test_corpus_1 = dictionary.doc2bow(test_data_1)
test_corpus_tfidf_1=tfidf_model[test_corpus_1]
similarity[test_corpus_tfidf_1]

In [20]:
test_corpus_tfidf_1

[(2, 0.01987096278224971),
 (5, 0.07971343185561044),
 (6, 0.013474138621096083),
 (7, 0.05693109334891422),
 (8, 0.05961288834674913),
 (9, 0.07011514245336456),
 (10, 0.044187304662291184),
 (13, 0.05653219836959198),
 (14, 0.11095470241044654),
 (15, 0.08910377361071227),
 (17, 0.16183973859487163),
 (19, 0.02627645192172076),
 (20, 0.09019027741823409),
 (21, 0.15632767633908629),
 (22, 0.05622090070065206),
 (23, 0.17477623721177266),
 (24, 0.13194000541124246),
 (25, 0.027187750230281686),
 (26, 0.025476514469645443),
 (27, 0.06968918960034994),
 (28, 0.13787911546639992),
 (29, 0.11943055459371357),
 (30, 0.09784712198812737),
 (31, 0.004606680835567912),
 (32, 0.03484459480017497),
 (33, 0.029237016189472956),
 (34, 0.09019027741823409),
 (36, 0.524328711635318),
 (38, 0.17477623721177266),
 (39, 0.07593711292599305),
 (40, 0.09640808789724943),
 (41, 0.17477623721177266),
 (42, 0.04402274755206306),
 (43, 0.051541226024235366),
 (44, 0.0097501523836012),
 (45, 0.04713536811922