## Feature highlighting

In [1]:
from joblib import load, dump

#### Split the pipeline into its individual steps

In [2]:
# Load the saved pipeline from disk.
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary code --
# only load artifacts that come from a trusted source.
pipe = load("final-LR-pipe.joblib")

# Display the pipeline's steps by name.
pipe.named_steps

{'preprocessor': Preproc(),
 'vectorizer': TfidfVectorizer(min_df=6, ngram_range=(1, 3)),
 'selector': SelectPercentile(percentile=26,
                  score_func=<function chi2 at 0x000001B928805AF0>),
 'classifier': LogisticRegression(C=5, solver='liblinear')}

In [3]:
# Pull the preprocessing step out of the pipeline so it can be called on its own.
preproc = pipe.named_steps["preprocessor"]
# Uncomment to persist this step as a separate file:
# dump(preproc, "pipe_broken/preproc.joblib")

In [4]:
# Pull the vectorizer step out of the pipeline (its feature names are needed later).
vec = pipe.named_steps["vectorizer"]
# Uncomment to persist this step as a separate file:
# dump(vec, "pipe_broken/vectorizer.joblib")

In [5]:
# Pull the feature-selection step out of the pipeline (its support mask is needed later).
sel = pipe.named_steps["selector"]
# Uncomment to persist this step as a separate file:
# dump(sel, "pipe_broken/selector.joblib")

In [6]:
# Pull the classifier step out of the pipeline (its coefficients are needed later).
clas = pipe.named_steps["classifier"]
# Uncomment to persist this step as a separate file:
# dump(clas, "pipe_broken/classifier.joblib")

#### Score each selected feature

In [7]:
# Mask over the vectorizer's features marking the ones the selector kept;
# used below to index the array of feature names.
support = sel.get_support()

# Pair every kept feature name with its classifier coefficient.
# Only the first row of coef_ is used (assumes a binary classifier -- TODO confirm).
# Materialised as a list on purpose: a bare zip is a one-shot iterator, so any cell
# that loops over it would silently see nothing when re-run.
scored_words = list(zip(clas.coef_[0], vec.get_feature_names_out()[support]))

In [11]:
import math

# feature -> (negative score, positive score), i.e. (e**-coef, e**coef).
# The tuple order matters: it is later indexed with the predicted sentiment label.
score_mapping = {}

for coef, word in scored_words:
    score_mapping[word] = (math.e ** -coef, math.e ** coef)

In [12]:
# Preview only: the mapping holds one entry per selected feature, so displaying it
# whole floods the notebook output. Show the first 20 entries instead.
dict(list(score_mapping.items())[:20])

{'abandoned': (5.287904285118154, 0.18911083599117298),
 'abbreviations': (1.6457301875736339, 0.6076330175812963),
 'abdul': (1.4552399775027962, 0.6871718860527789),
 'ability relate': (0.4346520233317583, 2.3006910041155537),
 'ability teach': (1.5055439832687887, 0.6642117474567778),
 'ability teach material': (0.762415836516405, 1.3116201842935917),
 'ability teach not': (0.7796306286609388, 1.2826586889198524),
 'able': (0.0701250679050754, 14.26023574556316),
 'able an': (0.8410495534133033, 1.1889905843735538),
 'able answer': (4.475195777924251, 0.22345391120829006),
 'able apply': (4.625587187493768, 0.21618876900725312),
 'able figure': (2.612965409491146, 0.38270694145727024),
 'able good': (0.5329678297032402, 1.8762858549207486),
 'able hear': (0.7814965413627321, 1.2795961940615286),
 'able help': (0.7950754443569574, 1.2577422772863798),
 'able relate': (0.6831160811348957, 1.4638800455972998),
 'able teach': (4.800069150874615, 0.2083303320365273),
 'able teach materia

To highlight the most important features in a given comment:

* preprocess the comment
* determine which features (the uni-, bi-, and trigrams known to the model) are present in the comment
* use the score mapping and the predicted sentiment to pick the features to highlight
  * highlight the top *n* features from the comment
* perform the highlighting

In [13]:
import pandas as pd
import nltk

In [14]:
def combineNGrams(features: list):
    """Turn a list of n-gram tuples into a list of space-separated strings.

    Each tuple of words becomes one string with the words joined by a single
    space and any leading/trailing whitespace removed. Order is preserved.
    """
    return [" ".join(ngram).strip() for ngram in features]

In [15]:
def getImportantFeatures(docs: pd.DataFrame, n=5):
    """Return, for every row of `docs`, its (up to) `n` highest-scoring features.

    PREREQ (must already exist in the notebook namespace):
      * pipe           -- the full pipeline, used to predict each comment's sentiment
      * preproc        -- the preprocessor step split from pipe
      * score_mapping  -- feature -> (negative score, positive score)
      * combineNGrams  -- helper that joins n-gram tuples into strings

    Parameters
    ----------
    docs : pd.DataFrame
        Comments to analyse, in whatever layout pipe.predict / preproc.preproc accept.
    n : int, default 5
        Maximum number of features returned per comment.

    Returns
    -------
    list of list of str
        One inner list per row of docs, ordered from highest to lowest score.
        Only features present in score_mapping are returned, each at most once,
        so an inner list can be shorter than n.
    """
    # Both calls work on the whole frame, so run them once instead of once per row.
    sentiments = pipe.predict(docs)
    cleaned = preproc.preproc(docs)

    all_imp_feats = []

    for i in range(len(docs.index)):
        sentiment = sentiments[i]           # used to pick which half of the score tuple applies
        tokens = cleaned[i].split()         # split so the n-grams are built from whole words

        # every unigram, bigram and trigram of the comment, as space-joined strings
        all_feats = []
        for size in (1, 2, 3):
            all_feats += combineNGrams(list(nltk.ngrams(tokens, size)))

        # score of every feature of the comment that the model actually knows
        comm_scores = {}
        for feat in all_feats:
            score = score_mapping.get(feat)
            if score is not None:
                # tuple is (negative, positive); assumes the predicted label is 0/1
                # in that same order -- TODO confirm against the training labels
                comm_scores[feat] = score[sentiment]

        # Rank the features themselves rather than looking scores back up by value:
        # a value lookup returns the same feature again for every tied score.
        # sorted() is stable, so ties keep their order of first appearance.
        ranked = sorted(comm_scores, key=comm_scores.get, reverse=True)
        all_imp_feats.append(ranked[:n])

    return all_imp_feats

In [16]:
# Two sample reviews used to exercise the highlighting code.
docs = [
    "He has his own grading criteria, which may throw you off. Tests are divided into weekly quiz, which you can redo them for better grade. PAs are difficult and mimir grading provides limited info, but he do provide fast and helpful feedback via office hour or mail. I was too late when I realized that, so contact him quickly if PA is hurting you.",
    "The other ratings saying he's a nice person are kind of confusing, because when I took his class, he constantly seemed angry, and would frequently yell at and belittle students who asked questions he thought they should already know the answers to. (Without explaining the answer, of course. Female students seemed to get the worst of it.)",
]

# One row per review; the review text lives in the "comment" column.
df = pd.DataFrame(
    {
        "firstName": ["David", "Philip"],
        "lastName": ["Kebo", "Ritchey"],
        "comment": docs,
    }
)

In [18]:
# Top 10 features for each row of df (one inner list per comment).
yooo_pog = getImportantFeatures(df, n=10)
yooo_pog

[['helpful',
  'provides',
  'better grade',
  'fast helpful',
  'office',
  'grading criteria',
  'weekly',
  'quickly',
  'info',
  'better'],
 ['worst',
  'confusing',
  'already',
  'constantly',
  'asked',
  'know answers',
  'yell',
  'asked questions',
  'angry',
  'frequently']]

In [35]:
# Work with the first comment only: its preprocessed text and its selected features.
comm, feats = preproc.preproc(df)[0], yooo_pog[0]
print(comm, feats, sep="\n")

his grading criteria which throw tests divided weekly which redo better grade pas difficult mimir grading provides limited info provide fast helpful feedback via office hour mail late realized contact quickly pa hurting
['helpful', 'provides', 'better grade', 'fast helpful', 'office', 'grading criteria', 'weekly', 'quickly', 'info', 'better']


### Indices in preprocessed comment to highlight

In [36]:
import re

# (start, end) character span of each feature's first occurrence in comm.
locs = []
for feat in feats:
    # Features are built from whitespace-separated tokens, so only accept a match
    # that is delimited by whitespace or the ends of the string. A plain substring
    # search would also hit the feature inside a longer word.
    match = re.search(r"(?<!\S)" + re.escape(feat) + r"(?!\S)", comm)
    if match is None:
        # Not present verbatim: skip it rather than record a span built from -1,
        # which would point at unrelated text.
        continue
    locs.append(match.span())

locs

[(141, 148),
 (106, 114),
 (65, 77),
 (136, 148),
 (162, 168),
 (4, 20),
 (47, 53),
 (201, 208),
 (123, 127),
 (65, 71)]

Slice the preprocessed comment at each location to check that the locations are correct.

In [37]:
# Text found at each recorded span; it should read back as the feature itself.
# Slicing is used instead of indexing character by character: with per-character
# indexing a negative start index wraps around to the end of the string and
# fabricates a "word", whereas a slice simply comes back empty.
words = [comm[start:end] for start, end in locs]

words

['helpful',
 'provides',
 'better grade',
 'fast helpful',
 'office',
 'grading criteria',
 'weekly',
 'quickly',
 'info',
 'better']