In [2]:
"""
 - Converts text into numeric vector of distinct words
 - Related documents from a product line become a single corpus
   [If only one document is available then that would be the corpus as well]
 - When converting text to numeric data, build a dictionary that maps each distinct word to its
   numeric value and its synonyms (synonyms come from WordNet; enchant is used for spell-checking)
 - Use the corpus for computing TF-IDF scores
 - The top 5 words with the highest score will likely be in the main subject of the corpus
   So manually select the appropriate word that identifies the topic of the corpus from the top words
 - Starting from the topic word phrase, find the phrases that surround it (word window classification)
   and extract all valid, non-repeating phrases; these are the candidate features.
 - Manually mark all features and non-features based on the expert's FM.
 - Then apply Logistic regression processing and get the best fit model with the least misclassification rate.
 - Compute the relations score table based on the heuristics mentioned in the paper
 - A connection between two features whose relation scores are both greater than 0.9 is considered
   a Mandatory relation. Otherwise it is considered an Optional relation.

"""

In [20]:
# import library
import numpy as np
import re
import enchant
import itertools
import math

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk import pos_tag
from itertools import chain
from nltk.corpus import wordnet

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [7]:
class StatisticalFMEvaluator:
    """Turns a list of plain-text documents into a numeric corpus and candidate feature phrases.

    Typical call order: ``generateCorpus()`` -> ``computeTfIdfScores()`` ->
    ``getFeatureTermsList(topic, ngram)``.

    Data layout (as built by ``generateCorpus``):
      * ``corpus``       -- one entry per document, each a list of sentences, each sentence a
                            list of word ids; consecutive documents are separated by ``"#d"``.
      * ``doc_tags_arr`` -- one flat list of ``(token, pos_tag)`` tuples per document, with the
                            same ``"#d"`` separators.
      * ``corpus_dict``  -- ``word -> (word_id, [word and the variants merged into it])``.
    """

    # Class-level defaults, kept for backward compatibility only. Every instance receives its
    # own containers in __init__, so these shared objects are never mutated.
    corpus = [] # stores the document array in numeric representation
    docs = [] # string array that stores plain text of each document
    corpusname = '' # for creating unique output file names for different case studies
    corpus_dict = {} # dictionary of all distinct word in the corpus
    wordCounter = 0 # counter for tracking the total number of distinct words in the corpus
    doc_bow_arr = [] # array with document bag of words format
    doc_tags_arr = [] # list to store lists of post tags with the corresponding word for every document
    features_lst = [] # list of all possible feature terms in the corpus
    tags_lst = [] # list of all tags

    # enchant dictionary used for spell checking and spelling suggestions
    eng_dict = enchant.Dict("en_US")

    def __init__(self, doc_arr, name):
        """Store the raw documents and give this instance its own, empty working state.

        Args:
            doc_arr: list of document strings.
            name: corpus name (stored in ``corpusname``).
        """
        self.docs = doc_arr
        self.corpusname = name
        # Fresh containers per instance: the methods below append to / insert into these,
        # which would otherwise leak state between evaluators through the class attributes.
        self.corpus = []
        self.corpus_dict = {}
        self.wordCounter = 0
        self.doc_bow_arr = []
        self.doc_tags_arr = []
        self.features_lst = []
        self.tags_lst = []

    def updateDictionary(self, wlist):
        """Register the words of one sentence and return their numeric ids.

        A word that is already a dictionary key reuses its id. A word that matches (or is a
        WordNet synonym of) a stored variant is merged into that entry and reuses its id.
        Anything else gets a new id. Empty / whitespace-only strings are skipped.

        Args:
            wlist: list of word strings.

        Returns:
            list of int ids, one per accepted word, in input order.
        """
        corpus_counter = []
        for w in wlist:
            if not w or w.isspace():
                continue
            if w in self.corpus_dict:
                corpus_counter.append(self.corpus_dict[w][0])
                continue

            # not a key: look for it among the variants / synonyms of existing entries
            foundKey = self.getKeyByValue(w)
            if foundKey == '-1':
                # genuinely new word
                self.wordCounter = self.wordCounter + 1
                self.corpus_dict[w] = (self.wordCounter, [w])
                corpus_counter.append(self.wordCounter)
            else:
                word_id, variants = self.corpus_dict[foundKey]
                if w not in variants:
                    self.corpus_dict[foundKey] = (word_id, list(variants) + [w])
                corpus_counter.append(word_id)

        return corpus_counter

    def getFeatureTermsList(self, topic, ngram):
        """Build the candidate phrase list and return positions around the topic phrases.

        Consecutive tokens that are longer than 3 characters and tagged as a noun or verb are
        joined into a phrase (space separated) and stored in ``features_lst``; the matching
        comma-joined tags go to ``tags_lst``. A phrase holds at most ``2 + 2*ngram`` tokens.
        Both lists are rebuilt on every call, so repeated calls do not duplicate entries.

        Args:
            topic: substring that identifies a topic phrase.
            ngram: controls the maximum phrase length (see above).

        Returns:
            list of int positions into ``features_lst`` as produced by ``getRanges`` for every
            phrase containing ``topic`` (overlapping windows yield repeated positions).
        """
        selectedTags = ['NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
        features = []
        feature_tags = []
        phrases = []
        tags = []

        def flush():
            # close the phrase currently being collected, if any
            if phrases:
                features.append(" ".join(phrases))
                feature_tags.append(",".join(tags))
                del phrases[:]
                del tags[:]

        for tagTuples in self.doc_tags_arr:
            if tagTuples == "#d":
                flush() # a phrase must not run across a document boundary
                continue
            for (w, t) in tagTuples:
                if len(w) > 3 and t in selectedTags and len(phrases) <= 1 + 2*ngram:
                    phrases.append(w)
                    tags.append(t)
                else:
                    # the token that ends a phrase is discarded (also when it only
                    # exceeded the length limit)
                    flush()
        flush() # keep a phrase that reaches the very end of the input

        self.features_lst = features
        self.tags_lst = feature_tags

        # enumerate (rather than list.index) so duplicate phrases keep their own positions
        indexes = [i for i, ph in enumerate(self.features_lst) if topic in ph]
        return self.getRanges(indexes)

    def getRanges(self, idx):
        """Expand each position into the window ``[ind-5, ind+5)`` and concatenate the windows.

        Positions that do not exist in ``features_lst`` (negative, or past its end) are left
        out so the result can be used to index ``features_lst`` directly.

        Args:
            idx: iterable of int positions.

        Returns:
            flat list of int positions; overlapping windows are not de-duplicated.
        """
        upper = len(self.features_lst)
        allIdx = []
        for ind in idx:
            allIdx.append(list(range(max(0, ind-5), min(upper, ind+5))))
        return list(itertools.chain.from_iterable(allIdx))

    def getDocTagsArray(self):
        """Return the per-document ``(token, tag)`` lists (with ``"#d"`` separators)."""
        return self.doc_tags_arr

    def getAllFeaturesLst(self):
        """Return the candidate phrases built by the last ``getFeatureTermsList`` call."""
        return self.features_lst

    def getAllFeaturesTagsLst(self):
        """Return the comma-joined tag string for each candidate phrase."""
        return self.tags_lst

    def getAlphaString(self, wordString):
        """Return ``wordString`` with every character outside A-Z / a-z removed."""
        # keyword arguments: passing count/flags positionally to re.sub is deprecated
        return re.sub('[^A-Za-z]+', '', wordString, count=0, flags=re.I)

    def getKeyByValue(self, wval):
        """Find the dictionary key whose variants contain ``wval`` or a synonym of it.

        Returns:
            the first matching key in dictionary order, or the string ``"-1"`` if none matches.
        """
        opkey = "-1"
        for k, (keyid, ws) in self.corpus_dict.items():
            if wval in ws or self.isSynonym(wval, ws):
                return k
        return opkey

    def isSynonym(self, wrd, wrdLst):
        """Return True if ``wrd`` shares a WordNet lemma name with any word in ``wrdLst``.

        Every word in ``wrdLst`` is checked (not only the first one); an empty list or a word
        without synsets yields False.
        """
        lemmas = set(chain.from_iterable(s.lemma_names() for s in wordnet.synsets(wrd)))
        if not lemmas:
            return False
        for _w in wrdLst:
            lem = set(chain.from_iterable(s.lemma_names() for s in wordnet.synsets(_w)))
            if lemmas.intersection(lem):
                return True
        return False

    def printCorpus(self):
        """Print every top-level corpus entry (documents and ``"#d"`` markers)."""
        for _arr in self.corpus:
            print(_arr)

    def printDictionary(self):
        """Print the word dictionary."""
        print(self.corpus_dict)

    def getString(self, _idx):
        """Return the dictionary key whose id equals ``_idx`` (IndexError if there is none)."""
        return [k for k,(_k,_v) in self.corpus_dict.items() if _k == _idx][0]

    def getCorpusDictionary(self):
        """Return the word dictionary."""
        return self.corpus_dict

    def getDocBowArr(self):
        """Return the sentence strings produced by the last ``computeTfIdfScores`` call."""
        return self.doc_bow_arr

    def computeTfIdfScores(self):
        """Compute summed TF-IDF scores for the words of the first document.

        Each sentence of ``corpus[0]`` is turned back into a space-joined string of dictionary
        keys; the sentences are vectorised once with ``TfidfVectorizer(stop_words='english')``
        and each term's scores are summed over all sentences.

        NOTE(review): only ``corpus[0]`` is used, so later documents do not contribute --
        confirm that this is intended for multi-document corpora.

        Returns:
            dict ``term -> summed score`` in descending score order; ``{}`` when there is
            nothing to score.
        """
        if not self.corpus:
            return {}

        # id -> key lookup built once (first key wins, like getString)
        id_to_word = {}
        for key, (word_id, _variants) in self.corpus_dict.items():
            id_to_word.setdefault(word_id, key)

        docArr = [' '.join(id_to_word[x] for x in sen_ls)
                  for sen_ls in self.corpus[0] if sen_ls != "#d"]
        if not docArr:
            return {}
        self.doc_bow_arr = docArr

        # fit once on all sentences; only the final fit ever determined the result
        vectorizer = TfidfVectorizer(stop_words = 'english')
        vectors = vectorizer.fit_transform(docArr)
        # newer scikit-learn exposes get_feature_names_out(); fall back for old releases
        name_getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
        feature_names = name_getter()
        scores = zip(feature_names, np.asarray(vectors.sum(axis=0)).ravel())
        sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
        return dict(sorted_scores)

    def generateCorpus(self):
        """Tokenise, spell-check and encode every document in ``docs``.

        For each document (lower-cased) every sentence is word-tokenised and POS-tagged.
        Tokens longer than 2 characters are stripped to letters and spell-checked with
        enchant; accepted or corrected words are sent through ``updateDictionary`` and the
        resulting id list becomes one sentence of the document. Results are appended to
        ``corpus`` and ``doc_tags_arr``, with ``"#d"`` between documents.
        """
        for doc in self.docs:
            print("####################### DOCUMENT ####################")
            # POS-tag tuples of every sentence in this document
            tags_tuple_lst = []
            # encoded sentences of this document
            wordArr = []

            doc_lwr = doc.lower()
            for sentence in sent_tokenize(doc_lwr):
                wordsList = []
                flag_final = False # True once the last fragment was resolved to a word
                clean_word = ''    # last cleaned fragment; may be carried into the next token

                words = word_tokenize(sentence)
                tags_tuple_lst.append(pos_tag(words))

                for word in words:
                    if len(word) > 2:
                        if not flag_final:
                            # previous fragment was not resolved: glue it to this token
                            word = clean_word+word

                        clean_word = self.getAlphaString(word)
                        if clean_word:
                            if self.eng_dict.check(clean_word):
                                wordsList.append(clean_word)
                                flag_final = True
                            else:
                                # accept the top suggestion only if it is longer than the input
                                word_sugg = self.eng_dict.suggest(clean_word)
                                if word_sugg and len(word_sugg[0]) > len(clean_word):
                                    wordsList.append(word_sugg[0])
                                    flag_final = True
                                else:
                                    flag_final = False

                if not flag_final and clean_word: # take the middle suggested word
                    sugg = self.eng_dict.suggest(clean_word)
                    if sugg:
                        wordsList.append(sugg[len(sugg)//2]) # floor division
                dictKey = self.updateDictionary(wordsList)
                if dictKey:
                    wordArr.append(dictKey)

            # update corpus
            if len(self.corpus) > 0:
                self.corpus.append("#d") # end of document marker
            self.corpus.append(wordArr)
            if len(self.doc_tags_arr) > 0:
                self.doc_tags_arr.append("#d") # end of document marker
            self.doc_tags_arr.append(list(itertools.chain.from_iterable(tags_tuple_lst)))

    def getSimilarMeaningListOfWords(self, words):
        """Return one noun/verb POS tag for each given word that is known to the dictionary.

        Each word is resolved to its dictionary key (directly, or through
        ``getKeyByValue``); unknown words are skipped. For every resolved key the tags it
        carries in the first document are collected, and the alphabetically first tag
        starting with ``'N'`` or ``'V'`` is reported. Keys without such a tag are skipped.

        Args:
            words: iterable of word strings.

        Returns:
            list of tag strings (possibly shorter than ``words``).
        """
        wordSeq = []
        for w in words:
            foundKey = w if w in self.corpus_dict else self.getKeyByValue(w)
            if foundKey != '-1':
                # store the key itself so it can be compared with the tagged tokens
                wordSeq.append(foundKey)

        # just pick the tags of the words from the tags array
        taggedTokens = self.doc_tags_arr[0] if self.doc_tags_arr else []
        resultTags = []
        for _wrd in wordSeq:
            aTgs = set(_t for (_w, _t) in taggedTokens if _w == _wrd)
            # select the one which starts with 'N' or 'V'
            selectedTags = sorted(i for i in aTgs if i.startswith('N') or i.startswith('V'))
            if selectedTags:
                resultTags.append(selectedTags[0])
        return resultTags

    def getFeatureIndex(self, phrase):
        """Return the position of the candidate phrase sharing the most words with ``phrase``.

        Ties go to the first such phrase. Returns -1 when there are no candidate phrases.

        NOTE(review): when no phrase shares any word the result is still 0 (the first
        phrase) -- confirm whether -1 is wanted in that case.
        """
        if not self.features_lst:
            return -1
        ph_wrds = set(phrase.split(' '))
        intRes = [len(ph_wrds.intersection(feature.split(' '))) for feature in self.features_lst]
        return intRes.index(max(intRes)) # first occurrence of the best overlap

In [None]:
class RegressionAnalysis:
    """Builds POS-tag count data for candidate phrases and fits a logistic model on it.

    ``computeRegressionData`` produces one row per candidate phrase (tag counts plus a 0/1
    ``Feature`` label); ``modelGeneration_Coefficients`` fits the model from a CSV of such
    data; ``PredictFeatureProbability`` scores a tag sequence with the fitted coefficients.
    """

    # Class-level defaults kept for backward compatibility; __init__ sets per-instance values.
    featureSet = [] # list of possible features terms
    allProbFeatures = [] # List of all possible feature terms
    allFeaturesTags = [] # List of all possible feature terms POS tags
    regression_data = pd.DataFrame() # Dataframe to store the regression analysis input data
    data_columns = [] # List of columns used in the regression data
    predictor_coefficients = {} # dictionary that stores all then significant variable coefficients

    def __init__(self, features, allFeatures, allFeatureTags):
        """
        Args:
            features: row positions (ints) of the phrases marked as features.
            allFeatures: list of all candidate phrases.
            allFeatureTags: list of comma-joined POS-tag strings, one per candidate phrase.
        """
        self.featureSet = features
        self.allProbFeatures = allFeatures
        self.allFeaturesTags = allFeatureTags
        self.regression_data = pd.DataFrame()
        self.data_columns = []
        self.predictor_coefficients = {}

    @staticmethod
    def _normalizeTag(tag):
        """Map tags containing '$' to column-safe names ('PRP$'->'PRPD', 'WP$'->'WPD', '$'->'POS')."""
        return tag.replace('PRP$','PRPD').replace('WP$','WPD').replace('$', 'POS')

    def computeRegressionData(self):
        """Build the tag-count table used as regression input.

        Columns are ``'Feature'`` followed by the distinct (normalised) tags in sorted order,
        so the layout is the same on every run. Each row counts how often every tag occurs
        in the corresponding entry of ``allFeaturesTags``; ``Feature`` is 1 when the row
        position is listed in ``featureSet``. The column list is printed and stored in
        ``data_columns``; the frame is stored in ``regression_data``.

        Returns:
            pandas.DataFrame with one row per entry of ``allFeaturesTags``.
        """
        tag_rows = [[self._normalizeTag(_tag) for _tag in _atag.split(",")]
                    for _atag in self.allFeaturesTags]
        tag_columns = sorted(set(itertools.chain.from_iterable(tag_rows)))
        _columns = ['Feature'] + tag_columns
        self.data_columns = _columns
        print(_columns)

        # Build plain records first and create the frame once: chained
        # `df.loc[i][col] = ...` assignment is not guaranteed to write into the frame.
        feature_rows = set(self.featureSet)
        records = []
        for statIdx, row_tags in enumerate(tag_rows):
            record = dict.fromkeys(_columns, 0)
            for _tag in row_tags:
                record[_tag] += 1
            if statIdx in feature_rows:
                record['Feature'] = 1
            records.append(record)

        statdf = pd.DataFrame(records, columns=_columns)
        self.regression_data = statdf
        return statdf

    def modelGeneration_Coefficients(self, fileName):
        """Fit a logistic regression on the CSV at ``fileName`` and store its coefficients.

        The CSV must contain a ``Feature`` column (target); every other column is a
        predictor. 80% of the rows (``random_state=3``) are used for fitting. The resulting
        ``column name -> coefficient`` mapping is stored in ``predictor_coefficients`` and
        printed. Coefficients are keyed by the CSV's own column names, so they stay correct
        even if the file's column order differs from ``data_columns``.

        NOTE(review): the fitted intercept is not stored -- confirm that is intended.
        """
        reg_data = pd.read_csv(fileName)
        predictors = reg_data.drop(['Feature'], axis=1)
        data_input_train, data_input_test, data_target_train, data_target_test = train_test_split(
            predictors, reg_data['Feature'], test_size=0.2, random_state=3)
        reg = LogisticRegression(solver='lbfgs')
        pd.options.display.max_rows = None # global pandas display option, kept from the original
        reg.fit(data_input_train, data_target_train)

        self.predictor_coefficients = dict(zip(predictors.columns, reg.coef_[0]))
        print(self.predictor_coefficients)

    def PredictFeatureProbability(self, pos_tag_seq):
        """Return the inverse-logit of the summed coefficients of the tags in ``pos_tag_seq``.

        Tags without a fitted coefficient contribute 0; repeated tags are added repeatedly.

        Args:
            pos_tag_seq: iterable of tag names (as used for the regression columns).

        Returns:
            float in [0, 1].
        """
        predicted_sum = 0
        for _tag in pos_tag_seq:
            if _tag != 'Feature':
                predicted_sum = predicted_sum + self.predictor_coefficients.get(_tag, 0)
        return self.inv_logit(predicted_sum)

    def inv_logit(self, p):
        """Logistic function ``1 / (1 + e**-p)``, evaluated without overflowing for large ``|p|``."""
        if p >= 0:
            return 1 / (1 + math.exp(-p))
        # for negative p, exp(-p) could overflow; use the equivalent e**p / (1 + e**p)
        e = math.exp(p)
        return e / (1 + e)

    # common function
    def isSeqIn(self, a, b):
        """Locate sequence ``a`` inside sequence ``b``.

        Returns the start position of the first contiguous occurrence of ``a`` in ``b``.
        If there is none, returns the position of the first element of ``b`` that contains
        (``in``) an element of ``a``, trying the elements of ``a`` in order. Returns -1 when
        nothing matches.
        """
        width = len(a)
        for start in range(len(b) - width + 1):
            if b[start:start + width] == a:
                return start
        # scan the words and check if the word contains the feature word
        for w in a:
            for i, p in enumerate(b):
                if w in p:
                    return i # return the first or default
        return -1

In [None]:
class RelationsPredictor:
    """Scores each candidate feature by how close it sits to a root-feature phrase.

    A root-feature phrase is any entry of ``featureSet`` that contains ``rootFeatureWord``.
    Closeness is measured in list positions.
    """

    # Class-level defaults kept for backward compatibility; __init__ sets per-instance values.
    featureSet = [] # list of possible features terms
    rootFeatureWord = '' # The word that MUST be in the root feature term
    resultDistanceScores = [] # List storing the distance score values

    def __init__(self, topic, allFeatures):
        """
        Args:
            topic: word that identifies a root-feature phrase.
            allFeatures: ordered list of candidate feature phrases.
        """
        self.rootFeatureWord = topic
        self.featureSet = allFeatures
        # per-instance list, so results never leak between predictors
        self.resultDistanceScores = []

    def distance(self, lst, K):
        """Return the smallest absolute difference between ``K`` and the values in ``lst``.

        Raises:
            ValueError: if ``lst`` is empty.
        """
        lst = np.asarray(lst)
        if lst.size == 0:
            raise ValueError("distance() needs at least one reference index")
        return np.abs(lst - K).min()

    def computeDistanceScores(self):
        """Compute a closeness score in [0, 1] for every entry of ``featureSet``.

        For each position the distance to the nearest root-feature phrase is taken; the
        distances are min-max scaled and inverted, so 1 means "is / is nearest to a root
        phrase" and 0 means "farthest away". The raw distances of this call are stored in
        ``resultDistanceScores`` (replacing those of any earlier call).

        Returns:
            list of scores, one per entry of ``featureSet``; ``[]`` for an empty
            ``featureSet``. If all distances are equal every score is 1.0.

        Raises:
            ValueError: if no entry of ``featureSet`` contains ``rootFeatureWord``.
        """
        if len(self.featureSet) == 0:
            self.resultDistanceScores = []
            return []

        # find index of all phrases with the root feature word
        idxs = [i for i, s in enumerate(self.featureSet) if self.rootFeatureWord in s]
        if not idxs:
            raise ValueError(
                "no feature contains the root feature word %r" % (self.rootFeatureWord,))

        # distance of every position to its closest root feature term
        distances = [self.distance(idxs, ind) for ind in range(len(self.featureSet))]
        self.resultDistanceScores = distances

        amin, amax = min(distances), max(distances)
        spread = amax - amin
        if spread == 0:
            # every feature is equally close (all of them are root phrases): avoid 0/0
            return [1.0] * len(distances)
        return [1 - ((val - amin) / spread) for val in distances]