# Classifier Model

In [None]:
from scipy import sparse
from sklearn import linear_model
from collections import Counter
import numpy as np
import pandas as pd
import operator
import nltk
import math
from scipy.stats import norm
from sklearn.metrics import plot_confusion_matrix
from pandas import option_context

In [None]:
from google.colab import drive
#drive.flush_and_unmount()
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
!python -m nltk.downloader punkt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def load_ordinal_data(filename, ordering):
    """Read a tab-separated file of (id, label, text) rows for ordinal training.

    Args:
        filename: path of a UTF-8 file whose lines are ``id<TAB>label<TAB>text``.
        ordering: list of label strings, from smallest to largest.

    Returns:
        A tuple ``(X, Y, orig_Y)``:
        X is the list of raw text fields (third column, as read),
        Y holds one list per entry of ``ordering``; ``Y[i][j]`` is 1 when
        example j's label ranks strictly above ``ordering[i]``, else 0,
        orig_Y is the list of whitespace-stripped label strings.

    Raises:
        IndexError: a line has fewer than three tab-separated columns.
        ValueError: a label does not occur in ``ordering``.
    """
    texts = []
    gold_labels = []
    thresholds = [[] for _ in ordering]

    with open(filename, encoding="utf-8") as handle:
        for row in handle:
            fields = row.split("\t")
            label = fields[1].strip()
            texts.append(fields[2])

            rank = ordering.index(label)
            # One binary target per threshold: "is the label above level i?"
            for level, column in enumerate(thresholds):
                column.append(1 if rank > level else 0)
            gold_labels.append(label)

    return texts, thresholds, gold_labels



In [None]:
class OrdinalClassifier:
    """Ordinal classifier made of one binary logistic regression per threshold.

    For K ordinal values, K-1 binary models are trained; model i is fit on
    ``trainY[i]`` (the "label ranks above ordinal_values[i]" target). The
    per-class scores are recovered from differences of those models'
    positive-class probabilities.
    """

    def __init__(self, ordinal_values, feature_method, trainX, trainY, devX, devY, testX, testY, orig_trainY, orig_devY, orig_testY):
        """Store the labels and featurize the three splits.

        Args:
            ordinal_values: label strings ordered from smallest to largest.
            feature_method: callable mapping one text to a {feature: value} dict.
            trainX, devX, testX: lists of raw texts.
            trainY, devY, testY: per-threshold binary label lists.
            orig_trainY, orig_devY, orig_testY: original label strings.
        """
        self.ordinal_values=ordinal_values
        self.feature_vocab = {}
        self.feature_method = feature_method
        # Features seen in fewer training documents than this are dropped.
        self.min_feature_count=2
        self.log_regs = [None]* (len(self.ordinal_values)-1)
        # Last threshold model trained; printWeights reads this one.
        self.log_reg = None

        self.trainY=trainY
        self.devY=devY
        self.testY=testY

        self.orig_trainY=orig_trainY
        self.orig_devY=orig_devY
        self.orig_testY=orig_testY

        # The training split must be processed first: it builds the vocabulary.
        self.trainX = self.process(trainX, training=True)
        self.devX = self.process(devX, training=False)
        self.testX = self.process(testX, training=False)

    def featurize(self, data):
        """Apply ``feature_method`` to every text and return the list of dicts."""
        return [self.feature_method(text) for text in data]

    def process(self, X_data, training = False):
        """Featurize texts and return them as a sparse (documents x features) matrix.

        When ``training`` is true the feature vocabulary is rebuilt from
        ``X_data``, keeping features that occur in at least
        ``min_feature_count`` documents. Otherwise the existing vocabulary is
        used and unknown features are ignored.
        """
        data = self.featurize(X_data)

        if training:
            feature_doc_count = Counter()
            for feats in data:
                for feat in feats:
                    feature_doc_count[feat] += 1

            # Start from an empty vocabulary so a repeated training call cannot
            # leave stale entries whose ids collide with the new ones.
            self.feature_vocab = {}
            for feat, doc_count in feature_doc_count.items():
                if doc_count >= self.min_feature_count:
                    self.feature_vocab[feat] = len(self.feature_vocab)

        X = sparse.dok_matrix((len(data), len(self.feature_vocab)))
        for idx, feats in enumerate(data):
            for feat, value in feats.items():
                if feat in self.feature_vocab:
                    X[idx, self.feature_vocab[feat]] = value

        return X

    def train(self):
        """Fit one logistic regression per threshold, tuning C on the dev split.

        For each threshold the model with the highest dev accuracy among
        C in [0.1, 1, 10, 100] is kept; the first such model wins ties.
        """
        for idx in range(len(self.log_regs)):
            # Start below any possible accuracy so a model is always selected,
            # even when every candidate scores 0 on the dev split.
            best_dev_accuracy = -1.0
            best_model = None
            for C in [0.1, 1, 10, 100]:
                log_reg = linear_model.LogisticRegression(C = C, max_iter=1000)
                log_reg.fit(self.trainX, self.trainY[idx])
                development_accuracy = log_reg.score(self.devX, self.devY[idx])
                if development_accuracy > best_dev_accuracy:
                    best_dev_accuracy = development_accuracy
                    best_model = log_reg

            self.log_regs[idx] = best_model
            self.log_reg = best_model

    def _ordinal_scores(self, X):
        """Return a (documents x ordinal values) array of per-class scores."""
        # exceed[i][j]: positive-class probability of threshold model i on document j.
        exceed = np.array([model.predict_proba(X)[:, 1] for model in self.log_regs])

        scores = np.zeros((exceed.shape[1], len(self.ordinal_values)))
        scores[:, 0] = 1 - exceed[0]
        for level in range(1, len(exceed)):
            scores[:, level] = exceed[level - 1] - exceed[level]
        scores[:, -1] = exceed[-1]
        return scores

    def _predict_indices(self, X):
        """Return, for each document, the index of its highest-scoring ordinal value."""
        return list(np.argmax(self._ordinal_scores(X), axis=1))

    def test(self):
        """Return accuracy on the test split against ``orig_testY``."""
        predictions = self._predict_indices(self.testX)
        correct = 0
        for predicted, gold_label in zip(predictions, self.orig_testY):
            if predicted == self.ordinal_values.index(gold_label):
                correct += 1
        return correct / len(predictions)

    def prediction(self):
        """Predict the test split.

        Returns:
            ``(counts, predictions)``: a Counter of predicted class indices and
            the list of predicted class indices, one per test document.
        """
        predictions = self._predict_indices(self.testX)
        return Counter(predictions), predictions

    def printWeights(self, n=10):
        """Print the ``n`` strongest features per class of ``self.log_reg``.

        Each output line is ``class<TAB>weight<TAB>feature``. For a binary model
        the largest weights are listed for ``classes_[1]`` and the smallest for
        ``classes_[0]``.
        """
        reverse_vocab = [None] * len(self.log_reg.coef_[0])
        for feat, fid in self.feature_vocab.items():
            reverse_vocab[fid] = feat

        def ascending(weights):
            # (feature, weight) pairs ordered from smallest to largest weight.
            return sorted(zip(reverse_vocab, weights), key = operator.itemgetter(1))

        # binary
        if len(self.log_reg.classes_) == 2:
            ranked = ascending(self.log_reg.coef_[0])

            cat = self.log_reg.classes_[1]
            for feature, weight in list(reversed(ranked))[:n]:
                print("%s\t%.3f\t%s" % (cat, weight, feature))
            print()

            cat = self.log_reg.classes_[0]
            for feature, weight in ranked[:n]:
                print("%s\t%.3f\t%s" % (cat, weight, feature))
            print()

        # multiclass
        else:
            for i, cat in enumerate(self.log_reg.classes_):
                ranked = ascending(self.log_reg.coef_[i])
                for feature, weight in list(reversed(ranked))[:n]:
                    print("%s\t%.3f\t%s" % (cat, weight, feature))
                print()

In [None]:
def binary_bow_featurize(text):
    """Return a binary bag-of-words dict: each lowercased token maps to 1."""
    return {token.lower(): 1 for token in nltk.word_tokenize(text)}

In [None]:
def feature1(text):
    """Return binary bigram features.

    Each key is two consecutive lowercased tokens joined by a single space;
    every value is 1.
    """
    tokens = [token.lower() for token in nltk.word_tokenize(text)]
    return {left + " " + right: 1 for left, right in zip(tokens, tokens[1:])}

This feature implements the idea of bigrams. A single word may not be able to express the meaning precisely. Each key is a combination of two consecutive words.

In [None]:
# Hand-picked cue words for each urgency level.
high = ["asap", "password", "suggestion", "think", "userid", "can", "comments", "login"]
medium = ["looking", "send", "respond", "forward", "let", "problems"]
low = ["call", "no", "haha", "fyi", "note", "pls", "thanks", "link", "www"]
def feature2(text):
    """Count how many tokens of ``text`` fall in each urgency cue-word list.

    Returns a dict with up to three keys, ``"high_f"``, ``"medium_f"`` and
    ``"low_f"``; a key is present only when at least one token matched, and
    its value is the number of matching tokens.
    """
    feats = {}
    for word in nltk.word_tokenize(text):
        word = word.lower()
        # Read and write the same key so repeated matches accumulate.
        if word in high:
            feats["high_f"] = feats.get("high_f", 0) + 1
        if word in medium:
            feats["medium_f"] = feats.get("medium_f", 0) + 1
        if word in low:
            feats["low_f"] = feats.get("low_f", 0) + 1
    return feats

feature2 looks at each word in the text: if the word appears in one of our hand-picked urgency word lists, it adds 1 to the count for that category (high, medium, or low).

In [None]:
def generate_N_grams(text, ngram=1):
    """Return the space-joined n-grams of ``text``, splitting on single spaces.

    Args:
        text: the string to split; no other tokenization is applied.
        ngram: number of consecutive words per n-gram.

    Returns:
        A list of strings, one per window, in order of appearance. The list is
        empty when the text has fewer than ``ngram`` words.
    """
    tokens = text.split(" ")
    # Zipping the token list against its own shifted copies yields the windows.
    shifted = [tokens[offset:] for offset in range(ngram)]
    return [" ".join(window) for window in zip(*shifted)]

This feature corresponds to the "time sensitive" point we have among our high urgency points. Words/phrases are all high frequency time related words that we discovered during annotation process.

In [None]:
def time_featurize(text):
    """Flag time-sensitive wording.

    Always sets ``'bias_term'`` to 1, and sets ``'timesensitive_'`` to 1 when a
    lowercased token or space-split bigram matches one of the cue lists.
    """
    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    unigram_cues = {"asap", "now", "deadline", "urgent"}
    bigram_cues = {"done by", "have by", "send by", "right now"}

    feats = {'bias_term': 1}
    matched = any(token in unigram_cues for token in tokens) or any(
        gram in bigram_cues for gram in generate_N_grams(lowered, 2)
    )
    if matched:
        feats['timesensitive_'] = 1
    return feats

This feature corresponds to the "technology related question" point we have among our high urgency points. Words/phrases are all high frequency technology question related words that we discovered during annotation process. 

In [None]:
def tech_featurize(text):
    """Set ``'techissue_'`` to 1 when the text contains a technology-issue cue.

    Unigram cues are matched against lowercased tokens, bigram cues against
    space-split bigrams of the lowercased text.
    """
    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    unigram_cues = {"password", "login", "system", "urgent"}
    bigram_cues = {"log in", "user id", "forgot password", "system update", "doesn't work"}

    if any(token in unigram_cues for token in tokens):
        return {'techissue_': 1}
    if any(gram in bigram_cues for gram in generate_N_grams(lowered, 2)):
        return {'techissue_': 1}
    return {}

This feature corresponds to the “professional/business related topic” point we have among our high and medium urgency points. Words/phrases are all high frequency professional topic related words that we discovered during annotation process. 

In [None]:
def prof_featurize(text):
    """Set ``'prof_'`` to 1 when the text contains a professional-topic cue.

    Unigram cues are matched against lowercased tokens, the bigram cue against
    space-split bigrams of the lowercased text.
    """
    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    unigram_cues = ("case", "deal", "client", "trade", "contract")
    bigram_cues = ("could you",)

    feats = {}
    token_hit = any(token in unigram_cues for token in tokens)
    if token_hit or any(gram in bigram_cues for gram in generate_N_grams(lowered, 2)):
        feats['prof_'] = 1
    return feats

This feature corresponds to the "asking question" point we have among our high/medium urgency points. The words/phrases are all high-frequency question-related ones that we discovered during the annotation process.

In [None]:
def question_featurize(text):
    """Set ``'question_'`` to 1 when the text contains a question cue.

    Unigram cues (including the ``?`` token) are matched against lowercased
    tokens, bigram cues against space-split bigrams of the lowercased text.
    """
    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    unigram_cues = {"?", "help", "favor", "what", "when", "know", "how", "why", "where"}
    bigram_cues = {"could you", "do you", "what is", "can you", "is that"}

    found = False
    for token in tokens:
        if token in unigram_cues:
            found = True
            break
    if not found:
        found = any(gram in bigram_cues for gram in generate_N_grams(lowered, 2))
    return {'question_': 1} if found else {}

This feature corresponds to the "asking for people's opinion" point we have among our high urgency points. The words/phrases are all high-frequency opinion-related ones that we discovered during the annotation process.

In [None]:
def opinions_featurize(text):
    """Set ``'opinion_'`` to 1 when the text asks for someone's opinion.

    Each cue phrase is compared with the space-split n-grams of the lowercased
    text, where n is the number of words in that phrase.
    """
    lowered = text.lower()
    cue_phrases = [
        "have suggestions", "your opinion", "your thoughts", "your suggestion",
        "offer suggestion",
        # Two words long, so it has to be looked up among bigrams, not trigrams.
        "thoughts on",
        "have any suggestions", "what's your thought", "what you think",
    ]

    # Group the cues by word count so every phrase is checked against n-grams
    # of its own length.
    cues_by_length = {}
    for phrase in cue_phrases:
        cues_by_length.setdefault(len(phrase.split(" ")), set()).add(phrase)

    feats = {}
    for length, phrases in cues_by_length.items():
        if any(gram in phrases for gram in generate_N_grams(lowered, length)):
            feats['opinion_'] = 1
            break
    return feats

This feature corresponds to the "action required" point we have among our high/medium urgency points. The words are all high-frequency action-related words that we discovered during the annotation process.

In [None]:
def action_featurize(text):
    """Set ``'action_'`` to 1 when a lowercased token is an action cue word."""
    action_cues = {"send", "receive", "copy", "respond", "reply"}
    tokens = nltk.word_tokenize(text.lower())
    if any(token in action_cues for token in tokens):
        return {'action_': 1}
    return {}

This feature corresponds to the "casual tone" point we have among our low urgency points. Words/phrases are all high frequency ones that indicates casuality that we discovered during annotation process. 

In [None]:
def casual_featurize(text):
    """Set ``'casual_'`` to 1 when the text contains a casual-tone cue.

    Unigram cues are matched against lowercased tokens, bigram cues against
    space-split bigrams of the lowercased text.
    """
    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    unigram_cues = {"haha", "lol", "lmao", "wtf", "wth", "hell", "fuck", "damn", "sorta"}
    bigram_cues = {"sort of", "what's up"}

    feats = {}
    if not unigram_cues.isdisjoint(tokens):
        feats['casual_'] = 1
    elif not bigram_cues.isdisjoint(generate_N_grams(lowered, 2)):
        feats['casual_'] = 1
    return feats


This feature corresponds to the "personal events(not professional)" point we have among our medium/low urgency points. Words are all high frequency personal topic related words that we discovered during annotation process. 


In [None]:
def personal_featurize(text):
    """Set ``'personal_'`` to 1 when the text contains a personal-topic cue.

    Unigram cues are matched against lowercased tokens, bigram cues against
    space-split bigrams of the lowercased text.
    """
    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    # NOTE(review): the tokenizer may split "how's" into two tokens, in which
    # case that unigram cue can never match - confirm.
    unigram_cues = ("reunion", "how's")
    bigram_cues = ("hang out", "how's life", "catch up", "get together")

    token_hit = any(token in unigram_cues for token in tokens)
    bigram_hit = any(gram in bigram_cues for gram in generate_N_grams(lowered, 2))
    return {'personal_': 1} if (token_hit or bigram_hit) else {}

This feature covers other important words and phrases that indicate low urgency when annotating.

In [None]:
def low_featurize(text):
    """Set ``'low_'`` to 1 when the text contains one of the low-urgency cues.

    Unigram cues are matched against lowercased tokens, bigram cues against
    space-split bigrams of the lowercased text.
    """
    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    unigram_cues = frozenset(["print", "text", "fyi", "file", "resend"])
    bigram_cues = frozenset(["call me", "good job", "my comment"])

    feats = {}
    candidates = [(tokens, unigram_cues), (generate_N_grams(lowered, 2), bigram_cues)]
    for items, cues in candidates:
        if any(item in cues for item in items):
            feats['low_'] = 1
    return feats

This feature covers other important words and phrases that indicate medium urgency when annotating.

In [None]:
def medium_featurize(text):
    """Set ``'med_'`` to 1 when the text contains a medium-urgency closing phrase.

    Each cue phrase is compared with the space-split n-grams of the lowercased
    text, where n is the number of words in that phrase.
    """
    lowered = text.lower()
    cue_phrases = [
        "unless anyone has other opinions",
        "looking forward to your response",
        "let me know if there's any question",
        "looking forward to hearing back from you",
        # Ten space-separated words; deriving n from the phrase keeps the
        # n-gram length in step with the phrase it is compared against.
        "if you have any other question, reply to this email",
    ]

    cues_by_length = {}
    for phrase in cue_phrases:
        cues_by_length.setdefault(len(phrase.split(" ")), set()).add(phrase)

    feats = {}
    for length, phrases in cues_by_length.items():
        if any(gram in phrases for gram in generate_N_grams(lowered, length)):
            feats['med_'] = 1
            break
    return feats

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
def feature4(text):
    """Return a {term: idf weight} dict computed from this single text.

    The lowercased text is vectorized with ``CountVectorizer`` and a
    ``TfidfTransformer`` is fit on the resulting one-row count matrix.

    NOTE(review): the transformer only ever sees one document, so the IDF
    weights are presumably identical for every term - confirm this feature
    carries any signal before enabling it.
    """
    count = CountVectorizer()
    lower_text = text.lower()
    word_count = count.fit_transform([lower_text])
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_transformer.fit(word_count)

    # get_feature_names() was removed in scikit-learn 1.2; use the replacement
    # when it exists and fall back to the old name on older releases.
    get_names = getattr(count, "get_feature_names_out", None) or count.get_feature_names
    df_idf = pd.DataFrame(tfidf_transformer.idf_, index=get_names(), columns=["idf_weights"])
    feats = df_idf.sort_values(by=['idf_weights']).to_dict()['idf_weights']
    return feats

In [None]:
def combiner_function(text):
    """Merge the feature dicts of the active featurizers into one dict.

    Keys are feature names and values are feature values. Featurizers run in
    the order listed, so a later one overwrites an earlier one on a shared key.
    """
    # Defined in this notebook but currently left out: feature1, tech_featurize,
    # prof_featurize, action_featurize, low_featurize.
    active_featurizers = (
        binary_bow_featurize,
        question_featurize,
        feature2,
        time_featurize,
        medium_featurize,
        opinions_featurize,
        casual_featurize,
        personal_featurize,
    )

    all_feats = {}
    for featurizer in active_featurizers:
        for name, value in featurizer(text).items():
            all_feats[name] = value
    return all_feats

In [None]:
def confidence_intervals(accuracy, n, significance_level):
    """Return the (lower, upper) normal-approximation interval for an accuracy.

    Args:
        accuracy: observed proportion correct.
        n: number of evaluated examples.
        significance_level: central coverage of the interval; the caller in
            this notebook passes .95 for a 95% interval.
    """
    # Probability mass left in each tail outside the interval.
    tail_mass = (1 - significance_level) / 2
    z_score = -1 * norm.ppf(tail_mass)
    margin = z_score * math.sqrt(accuracy * (1 - accuracy) / n)
    return accuracy - margin, accuracy + margin

In [None]:
def run(trainingFile, devFile, testFile, ordinal_values):
    """Train the ordinal classifier and report its test accuracy.

    Loads the three splits, builds an ``OrdinalClassifier`` with
    ``combiner_function`` as the featurizer, trains it, and prints the test
    accuracy with a 95% confidence interval.

    Args:
        trainingFile, devFile, testFile: paths of the tab-separated data files.
        ordinal_values: label strings ordered from smallest to largest.

    Returns:
        The trained ``OrdinalClassifier``.
    """
    # Each split is read exactly once.
    trainX, trainY, orig_trainY = load_ordinal_data(trainingFile, ordinal_values)
    devX, devY, orig_devY = load_ordinal_data(devFile, ordinal_values)
    testX, testY, orig_testY = load_ordinal_data(testFile, ordinal_values)

    simple_classifier = OrdinalClassifier(ordinal_values, combiner_function, trainX, trainY, devX, devY, testX, testY, orig_trainY, orig_devY, orig_testY)
    simple_classifier.train()
    accuracy = simple_classifier.test()

    # testY[0] has one entry per test example, so its length is the test set size.
    lower, upper = confidence_intervals(accuracy, len(testY[0]), .95)
    print("Test accuracy for best dev model: %.3f, 95%% CIs: [%.3f %.3f]\n" % (accuracy, lower, upper))
    return simple_classifier

In [None]:
# Group id; selects the data folder that holds this group's splits.
gid = 23
trainingFile = f"/gdrive/MyDrive/info_159_Project/ap4/ap_data/{gid}/train.txt"
devFile = f"/gdrive/MyDrive/info_159_Project/ap4/ap_data/{gid}/dev.txt"
testFile = f"/gdrive/MyDrive/info_159_Project/ap4/ap_data/{gid}/test.txt"

# Ordinal values must be given *as strings*, ordered from smallest to largest,
# e.g. ordinal_values = ["G", "PG", "PG-13", "R"].
ordinal_values = ["low", "medium", "high"]

# Train the classifier and print its test accuracy.
cl = run(trainingFile, devFile, testFile, ordinal_values)

Test accuracy for best dev model: 0.825, 95% CIs: [0.772 0.878]



# Analysis

### Analysis on Unbalanced Dataset

In [None]:
def run_predict(trainingFile, devFile, testFile, ordinal_values):
    """Train the ordinal classifier and return its test-set predictions.

    Returns:
        ``(pred_count, pred_list, orig_testY)``: the Counter of predicted class
        indices, the list of predicted class indices, and the original test
        label strings.
    """
    train_split = load_ordinal_data(trainingFile, ordinal_values)
    dev_split = load_ordinal_data(devFile, ordinal_values)
    test_split = load_ordinal_data(testFile, ordinal_values)

    # Every split is an (X, Y, orig_Y) triple.
    model = OrdinalClassifier(
        ordinal_values, combiner_function,
        train_split[0], train_split[1],
        dev_split[0], dev_split[1],
        test_split[0], test_split[1],
        train_split[2], dev_split[2], test_split[2],
    )
    model.train()
    pred_count, pred_list = model.prediction()

    return pred_count, pred_list, test_split[2]


In [None]:
pred_count, pred_list, orig_testY = run_predict(trainingFile, devFile, testFile, ordinal_values)

In [None]:
pred_count # 0 is low, 1 is medium, 2 is high

Counter({0: 151, 1: 34, 2: 15})

In [None]:
pred_list.count(0), pred_list.count(1), pred_list.count(2)

(151, 34, 15)

In [None]:
orig_testY.count("low"), orig_testY.count("medium"), orig_testY.count("high")

(150, 30, 20)

I think our dataset is very unbalanced: the dataset counted above has 200 examples, but over 50% of them are categorized as low, and only a small percentage are categorized as medium or high. Since there are so many low examples, our model will pick up more noise, which will result in many miscategorized labels involving the low category. I think our dataset is a good candidate for oversampling: because the low category dominates, the model might overfit to it. Using oversampling, we can resample the minority classes (medium and high, in our case) so that the training set contains more medium and high examples.

Moreover, we can use class weights to handle the imbalanced dataset by setting a weight for each of our three categories. We can apply a smaller weight to the low category because it has the most data, and bigger weights to the medium and high categories. This balances the influence of the three classes during training.

### Analysis on Confusion Matrix and F1 Score

In [None]:
# Translate predicted class indices (0, 1, 2) back into their label strings;
# any other value is skipped.
new_pred = [
    ("low", "medium", "high")[pred]
    for pred in pred_list
    if pred in (0, 1, 2)
]


In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(orig_testY, new_pred)

array([[  9,   9,   2],
       [  4, 135,  11],
       [  2,   7,  21]])

In [None]:
# confusion matrix 
y_actu = pd.Series(orig_testY, name='Actual')
y_pred = pd.Series(new_pred, name='Predicted')
df_confusion = pd.crosstab(y_actu, y_pred)
df_confusion

Predicted,high,low,medium
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
high,9,9,2
low,4,135,11
medium,2,7,21


In [None]:
precision_low = 135/150
precision_low

0.9

In [None]:
precision_medium = 21/30 
precision_medium

0.7

In [None]:
precision_high = 9/20 
precision_high

0.45

In [None]:
recall_low = 135/151
recall_low

0.8940397350993378

In [None]:
recall_medium = 21/34
recall_medium

0.6176470588235294

In [None]:
recall_high = 9/15
recall_high

0.6

In [None]:
f1_low = (2 * precision_low * recall_low)/(precision_low + recall_low)
f1_low

0.8970099667774086

In [None]:
f1_medium = (2 * precision_medium * recall_medium)/(precision_medium + recall_medium)
f1_medium

0.65625

In [None]:
f1_high = (2 * precision_high * recall_high)/(precision_high + recall_high)
f1_high

0.5142857142857143

Based on the counts of each label in the true testY list, we noticed we had 150 low labels, 30 medium labels, and 20 high labels. This is an unbalanced class distribution, so only examining the accuracy of the model or the precision may not give us a sense of the model. We decided to calculate the F1 score which is a better calculation for the unbalanced classes model. For the low class, we got an F1 score around 0.89, 0.65 for the medium class and 0.51 for the high class. The model performed better for low class text since the F1 score for low class is pretty close to 1. It was doing ok for medium class but not as good as the low class. Based on the confusion matrix and F1 score, the model didn’t predict well on the high class. The model misclassified about 45% of the high class text into low class and 10% of the high class into medium class. It also misclassified about 23% of medium class text into low class and 6% of medium class into high class. For low class text, the model misclassifies about 7% into medium class and 2% into high class. Based on these result, we figured out that our model often mislabeled the high class text into low class. We may probably need to look into our guideline to make the edge between high and low class clearer. This may also be because our model isn’t able to detect some of the important features in high class and so it is making the model perform poorly. We may need to look into it and improve our model in the future for the high class portion.   

 

### Analysis on category boundaries

In [None]:
test_data = pd.read_csv("/gdrive/MyDrive/info_159_Project/ap4/ap_data/23/test.txt", sep = "\t", header = None)
test_data

Unnamed: 0,0,1,2
0,565,low,Thanks! DF
1,94,low,Beavy and I changed the following deals for July and August. We took all the August activity to 0. This is how these deals should be balanced Deal 318562 with 318686. And 318691 with 318682. I'll look at the economics in the morning.
2,186,low,"Sean and Diana do not recognize this deal. They tried to check with the brokers, but couldn't get a hold of anyone. I'll let you know first thing in the morning whether or not this is good and what the deal number is. Kate"
3,599,low,---------------------- Forwarded by Eric Bass/HOU/ECT on 02/21/2001 08:13 AM --------------------------- Please join me and the Global Accounting leadership team in congratulating the following individuals on their promotions to: To Senior Tax Analyst Emily Allwardt (International Tax) Leon Branom (EBS Tax) Shanna Husser (EES Tax) there is my sweety pea Shilpa Mane (Corporate/London Tax) Todd Richards (Corp Tax) Michelle Thompson (Corp Tax)
4,564,medium,"Hi, Elise. Sorry to bother you, but when Kali was unwrapping her presents last night, there were two without cards or tags (probably lost in transit!!). We narrowed it down to Laura and Sofi. So that we can get the thank you notes correct, did Laura give Kali a Skipper doll or a bath set in a blue bag? Thanks for your help. Susan P.S.- We ended up selling 131 boxes at the booth on Saturday."
...,...,...,...
195,783,low,"Lindy, You are correct they are physical plant receipt points. The Pecos Diamond plant is the straddle plant located on the Atoka Lateral. Thanks DS ----------------------------------- I don't think any of these are straddle plants. (The plant we discussed previously, I think, was the Peco Diamond which is a straddle plant.) Linda, am I correct on these 3 below? Sorry it took me so long to respond. Maybe you already have answers to this. -----Original Message----- From: Powers, Ken Sent: Wednesday, May 16, 2001 1:41 PM To: Donoho, Lindy Subject: Plant question I'm afraid I asked you this question before, but I can't find where I wrote down your answer. Sorry. Anyway, are any of these plants straddle plants, or are they ""real"" physical receipt points. These are the plants we're receiving gas from in May. 10703 GPM Artesia 60151 Amoco Abo 1190 Sid Richardson Keystone Ken Powers (402) 398-7065"
196,822,medium,"Thanks for the attached deal information, Shouan. Looking at the data I feel confident that, once we are past the first day of the month, the gas daily options curveshift is valuing correctly. However, I am still concerned with the valuation from 10/31. Is there anyway to see same breakdown on 11/2 and 11/3 for 10/31's position. If you could send me that information in the same format I think we may have an answer for the changed deal value we saw fall out of my trader's book on 11/1. Susan"
197,868,medium,"Tom, The system does not take my approval of this request."
198,583,low,--------------------- Forwarded by Darron C Giron/HOU/ECT on 12/04/2000 04:13 PM --------------------------- -----Original Message----- - inspir.jpg


In [None]:
np.where((np.array(orig_testY) == np.array(new_pred)) == False) # index of the uncorrect text

(array([  2,   4,   6,  19,  26,  30,  35,  36,  38,  42,  52,  57,  58,
         62,  69,  86,  92,  97,  98, 104, 112, 115, 116, 117, 118, 119,
        129, 141, 143, 158, 163, 187, 195, 196, 197]),)

In [None]:
test_data.iloc[112][2]

'How many times do I have to say EMAIL IT? ---------------------- Forwarded by Kay Mann/Corp/Enron on 02/09/2001 01:57 PM --------------------------- Kay: Just to clarify. The ""blacklined changes"" in the fax I sent to you come directly from the document that you sent to us yesterday. We will let you know as soon as we hear anything further. Thanks, Karen Karen S. Way Piper Marbury Rudnick & Wolfe 203 N. LaSalle Chicago, Illinois 60601 email: karen.way@piperrudnick.com (ph) 312-368-2152 (fax) 312-630-6347 > -----Original Message----- > From: Way, Karen S. - CHI > Sent: Friday, February 09, 2001 11:45 AM > To: \'kay.mann@enron.com\' > Cc: Shindler, Donald A. - CHI; Townsend, Christopher J. - CHI > Subject: Titan-Schaffer Option > > Kay: > > This is to confirm that I have faxed to you at 713-646-3491 the following > documents: i) a copy of the cover fax-letter to Mr. Al Freehill, attorney > for the Schaffers; and 2) the blacklined changes to the Schaffer-Titan > Option for Mr. Freehill\

In [None]:
np.array(new_pred)[195], np.array(orig_testY)[195]

('medium', 'low')

In order to see how well our model performs, we got the index of the texts that the model predicted wrong and tried to look into the specific texts. We noticed that the model was able to detect work-related and profession-related text by counting the frequency of the work-related words. However, sometimes the work-related email can be informational based and no questions are asked in the email and there were no need to respond, but the high frequency of these words may cause the model think this is a high urgency email. We listed a few bullet points mention about this work-related features in both high and medium categories in the guideline. However, now we felt that we want to determine the urgency of the email, so we changed to whether or not the email is asking questions. Because of this, we may probably need to remove work-related features in our guideline and make whether or not the text contains question to have higher weight.

Another problem is forwarded emails. Some of the emails contain both a response and forwarded emails. In the guideline, we mentioned that when we try to determine the urgency of an email that contains forwarded emails, we should prioritize the urgency of the response email. If there is no response email, then we should look at the most recent forwarded email. We use “---------Forwarded” to separate the response from the forwarded email. A human can recognize this. However, our model doesn’t have the ability to differentiate between the response email and the forwarded email. It takes all the text as input and trains on the whole text. So sometimes when the response is low urgency but the forwarded email is high urgency, our model will mislabel it as a high urgency email. We may need to improve our model to help it separate these emails, or we can clean our original email text so that it contains only the text we want the model to train on and remove all the redundant text.



### Analysis on Biases Model

In our dataset, there are some texts that include rhetorical questions (e.g. "How many times do I have to say EMAIL IT?"). The human labels in testY mark them all as low because they are not actually asking a question, but the model predicted them as medium. This leads to bias in our model, because the model cannot tell the difference between rhetorical and non-rhetorical questions, so it will assign such texts the incorrect label. Also, text that is not in English can cause bias in our model. Our model cannot detect whether the text is in English, so such text is harder for it to categorize.

### Analysis on Key Features

In [None]:
cl.printWeights(n=25)

1	2.244	?
1	1.145	question_
1	0.827	contract
1	0.774	tomorrow
1	0.759	password
1	0.759	please
1	0.674	week
1	0.585	deal
1	0.582	after
1	0.563	need
1	0.556	05:47
1	0.553	soon
1	0.553	respond
1	0.531	3:00
1	0.529	information
1	0.520	could
1	0.520	thanks
1	0.515	lc
1	0.511	sometime
1	0.487	do
1	0.487	why
1	0.485	agreement
1	0.482	@
1	0.481	copy
1	0.478	me

0	-0.702	!
0	-0.602	think
0	-0.592	he
0	-0.551	:
0	-0.519	enron
0	-0.508	all
0	-0.498	.
0	-0.488	will
0	-0.458	guys
0	-0.456	go
0	-0.452	scott
0	-0.449	&
0	-0.445	at
0	-0.441	lunch
0	-0.440	are
0	-0.436	that
0	-0.432	file
0	-0.430	had
0	-0.426	pl
0	-0.421	phone
0	-0.418	am
0	-0.417	's
0	-0.407	here
0	-0.395	no
0	-0.394	they




*   According to the weights of the features, we can tell that the most important features leading to high urgency are "?", "question", "contract", "please", "tomorrow", and "password". These features can generally be categorized as time-sensitive, work-related, question, and professional tone, which relate to the high urgency points in our guideline. However, there are also features with relatively high weights, such as "05:47", "3:00", "lc", and "@", that are not very informative when we judge as humans. This scenario might be due to the relatively small training data we have, so the model picked up some coincidences in our data. There are also common words like "do" and "me" that appear at the top of the feature list. This means we should remove some stop words in our model.
*   We didn't remove punctuation because we believed it indicates whether the sentence is a question or not, and that is an important point in judging how urgently an email needs a response. However, we should consider adding some regularization to our features, so that "?" is not the single most important estimator of urgency in our model.
*   The features that have the smallest weights further exposed our model's reliance on stop words. Some of the features, such as "phone" and "guys", do make sense, as they relate to "request to call" and "casual/unprofessional tone". The others are all stop words or other words that we didn't consider strong indicators of low urgency when labeling manually. The lack of some of the features that we consider important, such as "call", "fyi", and "haha", might be the reason behind our difficulty in improving the model accuracy.
*   Surprisingly, none of the features that appear in the top-most or bottom-most list are bigram features. We think this is because there is too much noise in our model, so the really important bigram phrases are lost.
*   Only one of the features that we defined based on the annotation guideline appears to be in the key feature list. We think the possible reasons are too many noises/bad features in model, and not enough accurate domain knowledge included in those features.  






### Analysis On Common Model Mistakes

In [None]:
def load_ordinal_data_new(filename, ordering):
    """Load a tab-separated ordinal dataset, also returning each row's id.

    Every non-blank line of ``filename`` is split on tabs and its first
    three columns are read as id, label and text.  Blank lines (for example
    a trailing empty line at the end of the file) are skipped instead of
    raising ``IndexError``.

    Args:
        filename: Path of a UTF-8 text file with one example per line.
        ordering: Sequence of the possible labels; a label's position in
            this sequence is the rank used to build the binary targets.

    Returns:
        A tuple ``(X, Y, orig_Y, ids)`` where

        * ``X`` is the list of text columns, exactly as read (a trailing
          newline on the line is therefore kept),
        * ``Y`` holds one list per entry of ``ordering``; ``Y[i][j]`` is 1
          when the rank of example ``j`` is greater than ``i``, else 0,
        * ``orig_Y`` is the list of whitespace-stripped label strings,
        * ``ids`` is the list of id columns (as strings).

    Raises:
        ValueError: If a label does not occur in ``ordering``.
        IndexError: If a non-blank line has fewer than three columns.
    """
    X = []
    Y = [[] for _ in ordering]
    orig_Y = []
    ids = []  # named `ids` so the builtin `id` is not shadowed

    with open(filename, encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                # Nothing to parse on a blank line.
                continue

            cols = line.split("\t")
            example_id = cols[0]
            label = cols[1].strip()
            text = cols[2]
            rank = ordering.index(label)

            X.append(text)
            ids.append(example_id)
            orig_Y.append(label)
            # One binary target per threshold: "is the label ranked above i?"
            for threshold, targets in enumerate(Y):
                targets.append(1 if rank > threshold else 0)

    return X, Y, orig_Y, ids

In [None]:
def analyze(classifier):
    """Display the dev-set mistakes the model was most confident about.

    Runs ``classifier.log_reg`` on ``classifier.devX``, collects every
    example whose prediction differs from ``classifier.devY[0]``, ranks
    those mistakes by the probability the model gave to its (wrong)
    prediction, and displays the top 20 as a DataFrame.

    Besides ``classifier``, this relies on names defined elsewhere in the
    notebook: ``devFile`` and ``ordinal_values`` (used to re-read the dev
    file for ids and raw text), ``load_ordinal_data_new`` and the notebook
    ``display`` function.

    Args:
        classifier: Object exposing ``log_reg`` (with ``predict``,
            ``predict_proba`` and ``classes_``), ``devX``, ``devY`` and
            ``orig_devY``.

    Returns:
        None.  The table is shown through ``display``.
    """
    model = classifier.log_reg
    probs = model.predict_proba(classifier.devX)
    predicts = model.predict(classifier.devX)

    # Column of `probs` that holds the probability of each class label.
    class_column = {label: col for col, label in enumerate(model.classes_)}

    # NOTE(review): predictions are compared against devY[0] only, i.e. the
    # first binary target list -- confirm that this is what log_reg was
    # trained on.
    mistakes = {}
    for i, (row, predicted) in enumerate(zip(probs, predicts)):
        if predicted != classifier.devY[0][i]:
            mistakes[i] = row[class_column[predicted]]

    # Most confident mistakes first.
    ranked = sorted(mistakes.items(), key=operator.itemgetter(1), reverse=True)

    frame = []
    if ranked:
        # Read the dev file once (not once per mistake) to recover the id
        # and raw text of each example; rows are matched by position.
        devX, _, _, devids = load_ordinal_data_new(devFile, ordinal_values)
        for k, confidence in ranked:
            frame.append(
                [devids[k], confidence, classifier.orig_devY[k], predicts[k], devX[k]]
            )

    df = pd.DataFrame(
        frame,
        columns=["id", "P(predicted class confidence)", "Human label", "Prediction", "Text"],
    )

    # Kept from the original: this changes the global pandas option and stays
    # in effect after the function returns; the context manager below only
    # overrides it while the table is being displayed.
    pd.set_option('display.max_colwidth', None)

    with option_context('display.max_colwidth', 400):
        display(df.head(n=20))

In [None]:
# Show the most confident dev-set mistakes for `cl`
# (presumably the classifier built in an earlier cell -- confirm).
analyze(cl)

Unnamed: 0,id,P(predicted class confidence),Human label,Prediction,Text
0,9,0.998674,high,0,"""How do we get the right focus on support for Calgary? Mark Taylor wanted to hold up transferring confirm responsibility to Calgary, so we still need to provide excellent service to them. While I can appreciate that Diane is out, that should actually be a red flag to us to pay even closer attention to the Calgary stuff. Every screw up looks big to them, and they can't understand how mistakes c..."
1,303,0.997984,medium,0,"""---------------------- Forwarded by Drew Fossum/ET&S/Enron on 04/03/2001 04:47 PM --------------------------- This is to confirm your attendance for the Friday, April 20, 2001, Executive Forum to be hosted by The Office of the Chairman. The Forum will begin at 3:00 p.m. and ends at 4:30 p.m in the Enron Building 50M. If you have any additional questions, please feel free to give me a call. Th..."
2,469,0.99254,high,0,"""These two deals, 249262 & 565833 are changed for February out. I will tell the risk team that all new deals at NGPL/Nipsco should be at NIPS/NGPL - 9260. Is this correct? PL""\n"
3,137,0.992022,medium,0,don't know yet probably thurs. are you still in tonight - i think probably 2 rows\n
4,39,0.991358,medium,0,"""Steve filled in the detail for me on the LG&E issue mentioned in the Law bullets this morning. A valve was left open between an LG&E facility and Northern for an approximate 3 month period (I'm not sure how recent). LG&E's gas loss data indicates that the amount of gas that inadvertently got into Northern was approx. 180,000 MMBtu, with a value of about $500,000. Rockey and Steve have talked ..."
5,407,0.986219,high,0,Any word. I've been waiting for a while on this issue. Kevin\n
6,802,0.98544,high,0,"""---------------------- Forwarded by Vince J Kaminski/HOU/ECT on 04/05/2000 04:58 PM --------------------------- ---------------------- Forwarded by Vince J Kaminski/HOU/ECT on 03/13/2000 02:03 PM --------------------------- Hi Vince: I haven't talked to you in a while and wanted to touch base with you on a couple of issues. 1. Is there someone at Enron who would be interested in speaking to o..."
7,627,0.9837,medium,0,I received your riddle from Amy Yueh and my answer is NOTHING! Let me know if I am right. Andy\n
8,828,0.982605,medium,0,"""Lynn -- FYI, here is my draft of the fuel filing for TW. This is very much a work in progress and we're having a meeting tomorrow to discuss logistics. We've invited Darrell and Richard to the meeting; if there is anyone else you think we should include just let me know.""\n"
9,579,0.981455,medium,0,Professor Ronn: I received an e-mail from Meg Brooks in the admissions office stating that I would be hearing from them this week. I hope things work out and I can be contributing to and learning from the program next year. Thanks again for all your help. Hope to hear from you soon. Ben\n



*   First of all, we noticed that among the 20 most confidently wrong predictions, only one is wrongly predicted as medium, while the other 19 are wrongly predicted as low and none are wrongly predicted as high. This observation makes sense considering the unbalanced class sizes in our dataset.
*   Another common characteristic of these wrongly classified emails is that they include forwarded emails. Forwarded emails are a tricky part of our labeling task, and we established a guideline defining when we should judge based on the forwarded email and when we should decide based on the original email. However, this domain knowledge was not incorporated into our model through feature engineering. As a result, our model makes a systematic error: instead of looking only at the text before or after the forward division line, it analyzes the entire email as a whole, which affects its judgement. 
*   Additionally, the messages all contain many of the stop words that the model wrongly treats as important features indicating low urgency. It is hard to discover any other features without setting aside the stop words in the messages. 
*   Among these messages, there are actually many words that were picked up by our features. However, because those features did not make it into the top feature list, they are associated with small coefficients, which causes the model to fail to recognize them and to classify the email correctly. 
*   Overall, our model has decent accuracy on low-urgency emails: because of the unbalanced class sizes, the model may be biased toward that class, and the training set contains more information about low-urgency data points. However, it is less accurate on high- and medium-urgency emails because of insufficient training data and unclear/complicated judgement guidelines during manual annotation. 
*   Moving forward, the model could benefit from removing stop words and irrelevant punctuation, better and more clearly annotated data, backpropagation, model ensembling, better features with more precise domain knowledge, and, most importantly, a more balanced dataset. 



