In [29]:
import pandas as pd
import spacy
from spacy.matcher import Matcher
from spacy import displacy
import textacy
import re
from tqdm import tqdm
from tqdm import tqdm_gui
import warnings
warnings.filterwarnings('ignore')

In [2]:
nlp = spacy.load('en_core_web_lg')

In [39]:
data = pd.read_excel('l1_12_sentiment_modified.xlsx',sheet_name="BusinessDevelopmentSupport")

In [40]:
def pos_regex_matches(doc, pattern):
    # standardize and transform the regular expression pattern...
    pattern = re.sub(r"\s", "", pattern)
    pattern = re.sub(r"<([A-Z]+)\|([A-Z]+)>", r"( (\1|\2))", pattern)
    pattern = re.sub(r"<([A-Z]+)>", r"( \1)", pattern)

    tags = " " + " ".join(tok.pos_ for tok in doc)

    for m in re.finditer(pattern, tags):
        yield doc[tags[0 : m.start()].count(" ") : tags[0 : m.end()].count(" ")]

In [41]:
def get_all_noun_chunks(texts, pattern, lemma=False):
    all_results = set()
    for i in tqdm(range(len(texts))):
        text = texts[i]
        if type(text)==str:
            doc = nlp(text)
            lists = pos_regex_matches(doc, pattern)
            for list in lists:
                if lemma:
                    all_results.add(' '.join(['%s' % w.lemma_ for w in list.as_doc()]))
                else:
                    all_results.add(list.text)
    return all_results

In [42]:
all_adj_noun = get_all_noun_chunks(data.L1_Verbatim.values,pattern = r'<ADJ>*<NOUN>+')
all_noun = get_all_noun_chunks(data.L1_Verbatim.values,pattern = r'<NOUN>+')
all_adj_noun_lemma = get_all_noun_chunks(data.L1_Verbatim.values,pattern = r'<ADJ>*<NOUN>+',lemma=True)
all_noun_lemma = get_all_noun_chunks(data.L1_Verbatim.values,pattern = r'<NOUN>+',lemma=True)

100%|██████████| 200/200 [00:01<00:00, 132.13it/s]
100%|██████████| 200/200 [00:01<00:00, 136.78it/s]
100%|██████████| 200/200 [00:01<00:00, 117.58it/s]
100%|██████████| 200/200 [00:01<00:00, 132.93it/s]


In [43]:
all_adj_noun_lemma

{'account',
 'account statement',
 'account statment request',
 'acitivite',
 'active account app',
 'activity',
 'advantage',
 'advisor portal',
 'amc',
 'amcs',
 'anything',
 'app',
 'application',
 'application form',
 'applicationn',
 'assistence',
 'aum',
 'aurangabad',
 'awareness',
 'awareness programe',
 'awerence traning program',
 'awereness presention',
 'bad manager',
 'belgaum branch service',
 'benefit',
 'big event',
 'bikaner branch',
 'birla',
 'birla sunlife',
 'branch',
 'branch rm visit',
 'brokerage statment',
 'bsl',
 'bsl dividend yield',
 'bsl equity advantage fund',
 'bsl equity fund',
 'bsl frontline equity fund',
 'bsl mf',
 'bsl mf partner programe',
 'bsl office',
 'bsl tax relief',
 'bsl trb programe',
 'bslamc',
 'bsll',
 'bslmf',
 'bslmf end',
 'bslmf fund',
 'business',
 'bussiness improvement',
 'calculation',
 'call',
 'cam',
 'certification',
 'cheque',
 'client',
 'co -',
 'colour',
 'commmon application form',
 'common application form',
 'common m

In [44]:
all_noun_lemma_2 = get_all_noun_chunks(data.L1_Verbatim.values,pattern = r'<NOUN>',lemma=True)

100%|██████████| 200/200 [00:01<00:00, 119.13it/s]


In [45]:
all_noun_lemma_2

{'-',
 'account',
 'acitivite',
 'activity',
 'address',
 'advantage',
 'advisor',
 'alert',
 'amc',
 'amcs',
 'anything',
 'app',
 'application',
 'applicationn',
 'area',
 'assistence',
 'aum',
 'aurangabad',
 'awarene',
 'awareness',
 'awerence',
 'awereness',
 'basis',
 'belgaum',
 'benefit',
 'bikaner',
 'birla',
 'blackrock',
 'branch',
 'brokerage',
 'bsl',
 'bslamc',
 'bsll',
 'bslmf',
 'business',
 'bussiness',
 'calculation',
 'call',
 'cam',
 'certification',
 'change',
 'chart',
 'cheque',
 'chunawala',
 'class',
 'client',
 'co',
 'colour',
 'communication',
 'company',
 'conatct',
 'concall',
 'condition',
 'conference',
 'conversation',
 'copy',
 'crash',
 'customer',
 'day',
 'debt',
 'dehradhun',
 'detail',
 'dhanbad',
 'dinner',
 'dispatch',
 'distributor',
 'dividend',
 'dsp',
 'dynamic',
 'easyness',
 'education',
 'email',
 'end',
 'engagement',
 'equity',
 'event',
 'everything',
 'experience',
 'explanation',
 'face',
 'facebook',
 'facility',
 'fact',
 'facthshe

In [75]:
dicts = {
    "activity":["acitivite"],
    "amc":["amcs"],
    "application":["applicationn","app","apps"],
    "awareness":["awarene","awareness","awerence",],
    "ifa":["ifas"],
    "information":["infomration"]
}

In [77]:
temp_dict = {}

for key, values in dicts.items():
    for value in values:
        temp_dict[value]=key

In [79]:
def _make_dict_cleaning(s, w_dict):
    s = w_dict.get(s, s)
    return s

In [86]:
t_data = data.L1_Verbatim.values

In [87]:
t_data = [' '.join([_make_dict_cleaning(i,temp_dict) for i in t.split()]) for t in t_data]

In [88]:
temp_dict

{'acitivite': 'activity',
 'amcs': 'amc',
 'applicationn': 'application',
 'app': 'application',
 'apps': 'application',
 'awarene': 'awareness',
 'awareness': 'awareness',
 'awerence': 'awareness',
 'ifas': 'ifa',
 'infomration': 'information'}

In [89]:
t_data

['and i am not getting the factsheets on time if you send me one factsheet twice in a month then it would be better .',
 '( 4 ) training programme of birla is also good .',
 'i am not receiving the fund updates on time nor i am getting the nfo forms if you arrange to send the forms then it would be better and also if you send the one pager of the good performing schemes it would be convenient for me to make understand regarding the same to my clients , where as other amc are providing the same .',
 'and if you send me some sip and common application forms then its would be better .',
 'if he visit twice in a week then it would be very convenient for me because it is not possible for me to visit frequently to krishna nagar to collect the forms',
 'so i told him to speed post the forms and soa on my home address so that i can deliver the soa on time to the investors .',
 'suggestion : the trainings programs should be arranged regularly , atleast once is a quarter .',
 'the topics related

In [93]:
all_noun_lemma_1 = get_all_noun_chunks(t_data,pattern = r'<NOUN>',lemma=True)
print(len(all_noun_lemma_2))
all_noun_lemma_2 = get_all_noun_chunks(data.L1_Verbatim.values,pattern = r'<NOUN>',lemma=True)
print(len(all_noun_lemma_2))

100%|██████████| 200/200 [00:01<00:00, 126.23it/s]
  6%|▋         | 13/200 [00:00<00:01, 124.44it/s]

281


100%|██████████| 200/200 [00:01<00:00, 125.23it/s]

281





In [96]:
all_noun_lemma_2.difference(all_noun_lemma_1)

{'amcs', 'app', 'applicationn', 'awerence', 'ifas', 'infomration'}

In [91]:
all_noun_lemma_2

{'-',
 'account',
 'acitivite',
 'activity',
 'address',
 'advantage',
 'advisor',
 'alert',
 'amc',
 'anything',
 'application',
 'area',
 'assistence',
 'aum',
 'aurangabad',
 'awarene',
 'awareness',
 'awereness',
 'basis',
 'belgaum',
 'benefit',
 'bikaner',
 'birla',
 'blackrock',
 'branch',
 'brokerage',
 'bsl',
 'bslamc',
 'bsll',
 'bslmf',
 'business',
 'bussiness',
 'calculation',
 'call',
 'cam',
 'certification',
 'change',
 'chart',
 'cheque',
 'chunawala',
 'class',
 'client',
 'co',
 'colour',
 'communication',
 'company',
 'conatct',
 'concall',
 'condition',
 'conference',
 'conversation',
 'copy',
 'crash',
 'customer',
 'day',
 'debt',
 'dehradhun',
 'detail',
 'dhanbad',
 'dinner',
 'dispatch',
 'distributor',
 'dividend',
 'dsp',
 'dynamic',
 'easyness',
 'education',
 'email',
 'end',
 'engagement',
 'equity',
 'event',
 'everything',
 'experience',
 'explanation',
 'face',
 'facebook',
 'facility',
 'fact',
 'facthsheet',
 'factsheet',
 'feedback',
 'file',
 'fing