In [None]:
import pandas as pd

In [None]:
from google.colab import drive
# Uncomment the next line to unmount a previously mounted drive before remounting.
#drive.flush_and_unmount()
# Mount Google Drive at /gdrive; every data path in this notebook lives under it.
drive.mount('/gdrive')


Mounted at /gdrive


In [None]:
# Folder that holds train.txt / dev.txt / test.txt for the pandas previews below.
# NOTE(review): the cell that calls run() spells this folder "info 159 Project"
# (spaces instead of underscores) — confirm which spelling matches the Drive folder.
filepath = "/gdrive/MyDrive/info_159_Project/ap4"

In [None]:
# Preview the dev split. The file is tab-separated with no header row, so pandas
# numbers the columns 0, 1, 2 (read elsewhere in this notebook as id, label, text).
dev = pd.read_csv(filepath + "/dev.txt", sep = "\t", header=None)
dev

Unnamed: 0,0,1,2
0,802,high,---------------------- Forwarded by Vince J Ka...
1,793,low,We need to get something for Ina for Admin Ass...
2,859,medium,Where have you been? What type of camera was it??
3,612,low,We do not keep inventory schedules on any of t...
4,47,medium,"hey jason, how is the new job? shanna and I wo..."
...,...,...,...
195,86,medium,"Yeah, the question is waht kind of good partie..."
196,73,low,---------------------- Forwarded by Vince J Ka...
197,690,medium,Here is the latest contact list. I added a cou...
198,34,medium,We want to have an internal conference call at...


In [None]:
# Preview the test split (tab-separated, no header row; columns are numbered 0, 1, 2).
test = pd.read_csv(filepath + "/test.txt", sep = "\t", header=None)
test

Unnamed: 0,0,1,2
0,565,low,Thanks! DF
1,94,low,Beavy and I changed the following deals for Ju...
2,186,low,Sean and Diana do not recognize this deal. The...
3,599,low,---------------------- Forwarded by Eric Bass/...
4,564,medium,"Hi, Elise. Sorry to bother you, but when Kali ..."
...,...,...,...
195,783,low,"Lindy, You are correct they are physical plant..."
196,822,medium,"Thanks for the attached deal information, Shou..."
197,868,medium,"Tom, The system does not take my approval of t..."
198,583,low,--------------------- Forwarded by Darron C Gi...


In [None]:
# Preview the training split (tab-separated, no header row; columns are numbered 0, 1, 2).
train = pd.read_csv(filepath +  "/train.txt", sep = "\t", header=None)
train

Unnamed: 0,0,1,2
0,853,low,http://gasfundy.corp.enron.com/gas/framework/d...
1,837,low,---------------------- Forwarded by V Charles ...
2,281,medium,I just tried to call you. What is your parents...
3,576,low,---------------------- Forwarded by Vince J Ka...
4,209,low,---------------------- Forwarded by Vince J Ka...
...,...,...,...
595,246,medium,I just wanted to let you know that I got out U...
596,760,low,FYI Vince ---------------------- Forwarded by ...
597,998,medium,Here it is. Let me know if you have any questi...
598,609,low,Kim is taking care of that invoice. Thanks


In [None]:
from scipy import sparse
from sklearn import linear_model
from collections import Counter
import numpy as np
import operator
import nltk
import math
from scipy.stats import norm

In [None]:
!python -m nltk.downloader punkt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def load_ordinal_data(filename, ordering):
    """Read a tab-separated ordinal dataset and build binary threshold targets.

    Every non-blank line must contain at least three tab-separated columns:
    an id (ignored), a label, and the text. Blank lines are skipped.

    Args:
        filename: path of the UTF-8 encoded file to read.
        ordering: list of label strings ordered from smallest to largest.

    Returns:
        A tuple (X, Y, orig_Y):
          X      -- list of the raw text column of each row, exactly as read
                    (when it is the last column it keeps its trailing newline).
          Y      -- list of len(ordering) lists; Y[i][d] is 1 when row d's label
                    ranks strictly above ordering[i], otherwise 0. The final list
                    is therefore all zeros.
          orig_Y -- list of the (whitespace-stripped) label strings.

    Raises:
        ValueError: if a row's label is not in `ordering`.
        IndexError: if a non-blank row has fewer than three columns.
    """
    X = []
    Y = [[] for _ in ordering]
    orig_Y = []

    with open(filename, encoding="utf-8") as file:
        for line in file:
            # Hand-edited TSV files often end with an empty line; skip it
            # instead of crashing on the missing columns.
            if not line.strip():
                continue

            cols = line.split("\t")
            label = cols[1].strip()
            text = cols[2]

            rank = ordering.index(label)
            X.append(text)
            orig_Y.append(label)
            # One binary target per threshold: "is this label above ordering[i]?"
            for i, targets in enumerate(Y):
                targets.append(1 if rank > i else 0)

    return X, Y, orig_Y

In [None]:
class OrdinalClassifier:
    """Ordinal classifier made of one binary logistic regression per threshold.

    For K ordinal values, K-1 binary models are trained; model i is fit on
    trainY[i] (built by the data loader as "label ranks above ordinal_values[i]").
    At prediction time the K-1 positive-class probabilities are turned into K
    per-class scores and the arg-max class index is returned.
    """

    def __init__(self, ordinal_values, feature_method, trainX, trainY, devX, devY, testX, testY, orig_trainY, orig_devY, orig_testY):
        """Store labels and featurize the three splits.

        Args:
            ordinal_values: label strings ordered from smallest to largest.
            feature_method: callable mapping one text to a {feature: value} dict.
            trainX, devX, testX: lists of raw texts.
            trainY, devY, testY: per-threshold binary target lists.
            orig_trainY, orig_devY, orig_testY: original label strings.
        """
        self.ordinal_values=ordinal_values
        self.feature_vocab = {}
        self.feature_method = feature_method
        # A feature must occur in at least this many training documents to be kept.
        self.min_feature_count=2
        self.log_regs = [None]* (len(self.ordinal_values)-1)

        self.trainY=trainY
        self.devY=devY
        self.testY=testY

        self.orig_trainY=orig_trainY
        self.orig_devY=orig_devY
        self.orig_testY=orig_testY

        # The training split must be processed first: it builds feature_vocab,
        # which the dev/test splits then reuse.
        self.trainX = self.process(trainX, training=True)
        self.devX = self.process(devX, training=False)
        self.testX = self.process(testX, training=False)

    def featurize(self, data):
        """Apply feature_method to every text and return the list of feature dicts."""
        return [self.feature_method(text) for text in data]

    def process(self, X_data, training = False):
        """Featurize texts and return them as a sparse (documents x features) matrix.

        When `training` is true the feature vocabulary is (re)built from this
        data, keeping only features seen in at least min_feature_count documents.
        Features missing from the vocabulary are dropped.
        """
        data = self.featurize(X_data)

        if training:
            # Document frequency: each feature counts once per document it occurs in.
            feature_doc_count = Counter()
            for feats in data:
                for feat in feats:
                    feature_doc_count[feat] += 1

            fid = 0
            for feat in feature_doc_count:
                if feature_doc_count[feat] >= self.min_feature_count:
                    self.feature_vocab[feat] = fid
                    fid += 1

        F = len(self.feature_vocab)
        D = len(data)
        X = sparse.dok_matrix((D, F))
        for idx, feats in enumerate(data):
            for feat in feats:
                if feat in self.feature_vocab:
                    X[idx, self.feature_vocab[feat]] = feats[feat]

        return X

    def train(self):
        """Fit one logistic regression per threshold, choosing C by dev accuracy.

        For each threshold the first C (in the order 0.1, 1, 10, 100) that
        reaches the highest dev accuracy is kept in self.log_regs.
        """
        for idx in range(len(self.ordinal_values) - 1):
            best_dev_accuracy = None
            best_model = None
            for C in [0.1, 1, 10, 100]:
                log_reg = linear_model.LogisticRegression(C = C, max_iter=1000)
                log_reg.fit(self.trainX, self.trainY[idx])
                development_accuracy = log_reg.score(self.devX, self.devY[idx])
                # Always keep the first model so a dev accuracy of 0 cannot leave
                # the slot as None (which would crash later in predict_proba).
                if best_model is None or development_accuracy > best_dev_accuracy:
                    best_dev_accuracy = development_accuracy
                    best_model = log_reg

            self.log_regs[idx] = best_model

    def _ordinal_probabilities(self):
        """Return a (num_classes, num_test_docs) array of per-class scores.

        With p_k the positive-class probability of threshold model k:
          score[0]   = 1 - p_0
          score[k]   = p_{k-1} - p_k      for 0 < k < K-1
          score[K-1] = p_{K-2}
        """
        # Row k: column 1 of predict_proba for threshold model k, one entry per test doc.
        exceed = np.array([
            model.predict_proba(self.testX)[:, 1] for model in self.log_regs
        ])
        n_docs = exceed.shape[1]
        upper = np.vstack([np.ones((1, n_docs)), exceed])
        lower = np.vstack([exceed, np.zeros((1, n_docs))])
        return upper - lower

    def test(self):
        """Return the accuracy of the arg-max ordinal prediction on the test split."""
        predicted = np.argmax(self._ordinal_probabilities(), axis=0)
        cor = 0
        for data_point, prediction in enumerate(predicted):
            if prediction == self.ordinal_values.index(self.orig_testY[data_point]):
                cor += 1
        return cor / len(predicted)

    def prediction(self):
        """Predict the test split.

        Returns:
            (counts, predictions): a Counter of predicted class indices and the
            list of predicted class indices (positions in ordinal_values), in
            test-document order.
        """
        # argmax returns the index of the highest-scoring class for each document.
        predictions = list(np.argmax(self._ordinal_probabilities(), axis=0))
        counts = Counter(predictions)
        return counts, predictions
      


In [None]:
def binary_bow_featurize(text):
    """Binary bag-of-words: map every lowercased nltk token of `text` to 1."""
    return {token.lower(): 1 for token in nltk.word_tokenize(text)}

In [None]:
def feature1(text):
  """Binary bigram features: each pair of adjacent lowercased tokens maps to 1.

  The feature name is the two tokens joined by a single space.
  """
  tokens = [token.lower() for token in nltk.word_tokenize(text)]
  return {first + " " + second: 1 for first, second in zip(tokens, tokens[1:])}

In [None]:
feature1("this is a test")

{'a test': 1, 'is a': 1, 'this is': 1}

In [None]:
# Hand-picked cue words for each priority label. feature2 and feature3 test
# lowercased tokens for membership in these module-level lists.
high = ["asap", "password", "suggestion", "think", "userid", "can"]
medium = ["looking", "send", "reply", "forward", "let"]
low = ["call", "no", "haha", "fyi", "note", "pls", "thanks", "link", "www"]

In [None]:
def feature2(text):
  """Count how many tokens of `text` appear in each priority cue-word list.

  Tokens are lowercased and looked up in the module-level lists `high`,
  `medium` and `low`.

  Returns:
    A dict with any of the keys "high", "medium", "low", each mapped to the
    number of matching tokens. Labels with no matching token are absent.
  """
  feats = {}
  for word in nltk.word_tokenize(text):
    word = word.lower()
    # Each list is tested independently, so a word present in several lists
    # is counted once for every list that contains it.
    for label, cue_words in (("high", high), ("medium", medium), ("low", low)):
      if word in cue_words:
        feats[label] = feats.get(label, 0) + 1
  return feats

In [None]:
# NOTE(review): `high` and `medium` repeat the values assigned in the earlier
# cue-word cell. `low` is commented out here, so if that earlier cell was run
# its `low` list is still the one in effect.
high = ["asap", "password", "suggestion", "think", "userid", "can"]
medium = ["looking", "send", "reply", "forward", "let"]
#low = ["call", "no", "haha", "fyi", "note", "pls", "thanks", "link", "www"]

In [None]:
def feature3(text):
  """Binary cue-word features: flag which priority buckets the tokens fall in.

  Each lowercased token sets "high" if it is in the module-level `high` list,
  "medium" if it is in `medium`, and "low" only when it is in neither list.

  Returns:
    A dict with any of the keys "high", "medium", "low", each mapped to 1.
  """
  feats = {}
  for word in nltk.word_tokenize(text):
    # Lowercase so matching agrees with feature2 and the lowercase cue lists.
    word = word.lower()
    in_high = word in high
    in_medium = word in medium
    if in_high:
      feats["high"] = 1
    if in_medium:
      feats["medium"] = 1
    # Fall back to "low" only for words that hit no cue list; previously the
    # else branch was attached to the medium check alone, so high cue words
    # were flagged as "low" too.
    if not (in_high or in_medium):
      feats["low"] = 1
  # The dict must be returned: callers merge it with dict.update().
  return feats

In [None]:
def generate_N_grams(text,ngram=1):
  """Return the space-joined n-grams of `text`, tokenized by splitting on single spaces.

  Consecutive spaces produce empty tokens. An empty list is returned when
  `ngram` is below 1 or larger than the number of tokens.
  """
  if ngram < 1:
    return []
  tokens = text.split(" ")
  window_count = len(tokens) - ngram + 1
  return [' '.join(tokens[start:start + ngram]) for start in range(window_count)]

In [None]:
def time_featurize(text):
    """Flag time-sensitive wording.

    Always emits 'bias_term'. Adds 'timesensitive' when a lowercased nltk token
    is in the unigram cue list or a whitespace-split bigram is in the bigram
    cue list.
    """
    timesensitive_uni = ["asap","now","deadline","now","urgent"]
    timesensitive_bi = ["done by","have by","send by","right now"]

    lowered = text.lower()
    has_unigram = any(token in timesensitive_uni for token in nltk.word_tokenize(lowered))
    has_bigram = any(pair in timesensitive_bi for pair in generate_N_grams(lowered, 2))

    feats = {'bias_term': 1}
    if has_unigram or has_bigram:
        feats['timesensitive'] = 1
    return feats

In [None]:
def tech_featurize(text):
    """Return {'techissue': 1} when the text contains a tech-issue cue, else {}.

    Unigram cues are matched against lowercased nltk tokens; bigram cues against
    whitespace-split bigrams of the lowercased text.
    """
    tech_uni = ["password","login","system","urgent"]
    tech_bi = ["log in","user id","forgot password","system update","doesn't work"]

    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    bigrams = generate_N_grams(lowered, 2)

    matched = any(token in tech_uni for token in tokens) or any(pair in tech_bi for pair in bigrams)
    return {'techissue': 1} if matched else {}

In [None]:
def prof_featurize(text):
    """Return {'prof': 1} when the text contains a professional-context cue, else {}.

    Unigram cues are matched against lowercased nltk tokens; the bigram cue
    against whitespace-split bigrams of the lowercased text.
    """
    prof_uni = ["case","deal","client","trade","contract"]
    prof_bi = ["could you"]

    lowered = text.lower()
    unigram_hits = [token for token in nltk.word_tokenize(lowered) if token in prof_uni]
    bigram_hits = [pair for pair in generate_N_grams(lowered, 2) if pair in prof_bi]

    feats = {}
    if unigram_hits or bigram_hits:
        feats['prof'] = 1
    return feats

In [None]:
def question_featurize(text):
    """Return {'question': 1} when the text contains a question cue, else {}.

    Unigram cues (including "?") are matched against lowercased nltk tokens;
    bigram cues against whitespace-split bigrams of the lowercased text.
    """
    ques_uni = ["?","help","favor","what","when","know","how","why","where"]
    ques_bi = ["could you","do you","what is","can you","is that"]

    lowered = text.lower()
    tokens = set(nltk.word_tokenize(lowered))
    bigrams = set(generate_N_grams(lowered, 2))

    if tokens.isdisjoint(ques_uni) and bigrams.isdisjoint(ques_bi):
        return {}
    return {'question': 1}

In [None]:
def opinions_featurize(text):
    """Return {'opinion': 1} when the text asks for an opinion or suggestion, else {}.

    Cue phrases are matched against whitespace-split n-grams of the lowercased
    text. The n-gram size used for each phrase is derived from the phrase's own
    word count, so a phrase can never be filed under the wrong size (the
    two-word "thoughts on" used to sit in the trigram list and never matched).
    """
    op_phrases = [
        "have suggestions", "your opinion", "your thoughts", "your suggestion",
        "offer suggestion", "thoughts on",
        "have any suggestions", "what's your thought", "what you think",
    ]

    lowered = text.lower()
    feats = {}
    for size in sorted({len(phrase.split(" ")) for phrase in op_phrases}):
        if any(gram in op_phrases for gram in generate_N_grams(lowered, size)):
            feats['opinion'] = 1
            break
    return feats

In [None]:
def action_featurize(text):
    """Return {'action': 1} when a lowercased nltk token is a mail-action cue word, else {}."""
    action_uni = ["send","receive","copy","respond","reply"]
    tokens = nltk.word_tokenize(text.lower())
    return {'action': 1} if any(token in action_uni for token in tokens) else {}

In [None]:
def casual_featurize(text):
    """Return {'casual': 1} when the text contains a casual/slang cue, else {}.

    Unigram cues are matched against lowercased nltk tokens; bigram cues against
    whitespace-split bigrams of the lowercased text.
    """
    casual_uni = ["haha","lol","lmao","wtf","wth","hell","fuck","damn","sorta"]
    casual_bi = ["sort of","what's up"]

    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    bigrams = generate_N_grams(lowered, 2)

    for cue in casual_uni:
        if cue in tokens:
            return {'casual': 1}
    for cue in casual_bi:
        if cue in bigrams:
            return {'casual': 1}
    return {}

In [None]:
def personal_featurize(text):
    """Return {'personal': 1} when the text contains a personal/social cue, else {}.

    Unigram cues are matched against both the nltk tokens and the
    whitespace-split tokens of the lowercased text; bigram cues against
    whitespace-split bigrams.
    """
    personal_uni = ["reunion","how's"]
    personal_bi = ["hang out","how's life","catch up","get together"]

    lowered = text.lower()
    # nltk's word tokenizer splits clitics ("how's" -> "how", "'s"), so the
    # "how's" cue can only match a whitespace token; check both token sets.
    unigrams = set(nltk.word_tokenize(lowered))
    unigrams.update(generate_N_grams(lowered, 1))
    bigrams = generate_N_grams(lowered, 2)

    feats = {}
    if any(cue in unigrams for cue in personal_uni) or any(cue in bigrams for cue in personal_bi):
        feats['personal'] = 1
    return feats

In [None]:
def low_featurize(text):
    """Return {'low': 1} when the text contains a low-priority cue, else {}.

    Unigram cues are matched against lowercased nltk tokens; bigram cues against
    whitespace-split bigrams of the lowercased text.
    """
    low_uni = ["print","text","fyi","file","resend"]
    low_bi = ["call me","good job","my comment"]

    lowered = text.lower()
    unigram_match = len([token for token in nltk.word_tokenize(lowered) if token in low_uni]) > 0
    bigram_match = len([pair for pair in generate_N_grams(lowered, 2) if pair in low_bi]) > 0

    if not (unigram_match or bigram_match):
        return {}
    return {'low': 1}

In [None]:
def medium_featurize(text):
    """Return {'med': 1} when the text contains a medium-priority closing phrase, else {}.

    Cue phrases are matched against whitespace-split n-grams of the lowercased
    text. The n-gram size for each phrase comes from the phrase's own word
    count; the last phrase has 10 words, so looking for it among 11-grams (as
    the code used to) could never succeed.
    """
    med_phrases = [
        "unless anyone has other opinions",
        "looking forward to your response",
        "let me know if there's any question",
        "looking forward to hearing back from you",
        "if you have any other question, reply to this email",
    ]

    lowered = text.lower()
    feats = {}
    for size in sorted({len(phrase.split(" ")) for phrase in med_phrases}):
        if any(gram in med_phrases for gram in generate_N_grams(lowered, size)):
            feats['med'] = 1
            break
    return feats

In [None]:
# def feature3(text):
#     feats = {}
#     feats['bias_term']=1
#     words = nltk.word_tokenize(text.lower())
#     # timesensitive = ["asap","now","finish by","done by","send by","deadline","right now"]
#     trigrams = nltk.trigrams(words)
#     for trigram in trigrams:
#       feats[] = 1
            
#     return feats

In [None]:
# trigrams = nltk.trigrams()

In [None]:
# def feature3(text):
#   return None

In [None]:
def combiner_function(text):
  """Merge the feature dicts of the active featurizers into a single dict.

  Keys are feature names and values are feature values. Featurizers are
  applied in order, so on a key collision the later featurizer's value wins.
  NOTE(review): bag-of-words keys are raw tokens, so a text containing e.g. the
  word "low" shares its key with feature2's "low" count — confirm this is intended.
  """
  # Featurizers defined in this notebook that can be listed in `active`:
  # binary_bow_featurize,feature1,feature2,time_featurize,tech_featurize,prof_featurize,question_featurize,opinions_featurize,action_featurize,casual_featurize,personal_featurize,low_featurize,medium_featurize
  active = (binary_bow_featurize, feature2, question_featurize)

  all_feats = {}
  for featurizer in active:
    extracted = featurizer(text)
    all_feats.update(extracted)
  return all_feats

In [None]:
def confidence_intervals(accuracy, n, significance_level):
    """Normal-approximation confidence interval for an accuracy estimate.

    Args:
        accuracy: observed accuracy as a fraction.
        n: number of evaluated examples.
        significance_level: used as the two-sided confidence level
            (e.g. .95 puts 2.5% of the mass in each tail).

    Returns:
        (lower, upper) bounds of the interval.
    """
    tail_mass = (1-significance_level)/2
    z_alpha = -1*norm.ppf(tail_mass)
    half_width = math.sqrt((accuracy*(1-accuracy))/n)*z_alpha
    return accuracy-half_width, accuracy+half_width

In [None]:
# def run(trainingFile, devFile, testFile, ordinal_values):


#     trainX, trainY, orig_trainY=load_ordinal_data(trainingFile, ordinal_values)
#     devX, devY, orig_devY=load_ordinal_data(devFile, ordinal_values)
#     testX, testY, orig_testY=load_ordinal_data(testFile, ordinal_values)
    
#     simple_classifier = OrdinalClassifier(ordinal_values, combiner_function, trainX, trainY, devX, devY, testX, testY, orig_trainY, orig_devY, orig_testY)
#     simple_classifier.train()
#     accuracy=simple_classifier.test()

#     lower, upper=confidence_intervals(accuracy, len(devY[0]), .95)
#     print("Test accuracy for best dev model: %.3f, 95%% CIs: [%.3f %.3f]\n" % (accuracy, lower, upper))

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def run(trainingFile, devFile, testFile, ordinal_values):
    """Train the ordinal classifier, print test accuracy with a 95% CI, and print label diagnostics.

    Args:
        trainingFile, devFile, testFile: paths of the tab-separated data files
            read by load_ordinal_data.
        ordinal_values: label strings ordered from smallest to largest.

    Returns:
        None; all results are printed.
    """


    trainX, trainY, orig_trainY=load_ordinal_data(trainingFile, ordinal_values)
    devX, devY, orig_devY=load_ordinal_data(devFile, ordinal_values)
    testX, testY, orig_testY=load_ordinal_data(testFile, ordinal_values)
    #print(len(orig_testY))
    # Featurization of all three splits happens inside the constructor.
    simple_classifier = OrdinalClassifier(ordinal_values, combiner_function, trainX, trainY, devX, devY, testX, testY, orig_trainY, orig_devY, orig_testY)
    simple_classifier.train()
    accuracy=simple_classifier.test()

    # testY[0] has one entry per test document, so its length is the test-set size.
    lower, upper=confidence_intervals(accuracy, len(testY[0]), .95)
    print("Test accuracy for best dev model: %.3f, 95%% CIs: [%.3f %.3f]\n" % (accuracy, lower, upper))

    #print(orig_testY)

    #confusion matrix

    # pred_counts: Counter of predicted class indices; pred_list: predicted index per test document.
    pred_counts, pred_list = simple_classifier.prediction()

    # NOTE(review): `df` is not used anywhere below in this function.
    df = pd.DataFrame(orig_testY)
    
    print(pred_counts)
    print(orig_testY)
    # Debug output: compare the element types of the gold labels and the predictions.
    print(type(orig_testY[0]), type(pred_list[0]))


    # Disabled confusion-matrix plot. NOTE(review): orig_testY holds label strings
    # while pred_list holds class indices, so the labels would need mapping to
    # indices before this can be re-enabled — confirm.
    # cm = confusion_matrix(orig_testY, pred_list)
    # print("Confusion Matrix\n")
    # cm_df = pd.DataFrame(cm, index = [0, 1, 2], columns = [0, 1, 2])
    
    # #plot
    # plt.figure(figsize = (5, 4))
    # sns.heatmap(cm_df, annot = True)
    # plt.title('Confusion Matrix')
    # plt.ylabel('Actual')
    # plt.xlabel("prediction")
    # plt.show()

  


    # Gold label distribution of the test split, printed next to the class index
    # each label corresponds to in ["low", "medium", "high"].
    print('0 for orig_testY:', orig_testY.count("low"))
    print('1 for orig_testY:', orig_testY.count("medium"))
    print('2 for orig_testY:', orig_testY.count("high"))
    # print(pred_counts)
    # return pred_list

In [None]:
# a = np.array(["l", "m", "h"])
# type(a)

In [None]:
# NOTE(review): `gid` is not referenced anywhere else in the visible notebook.
gid=23
# NOTE(review): these paths use the folder name "info 159 Project" (with spaces),
# whereas the `filepath` variable used for the pandas previews uses
# "info_159_Project" — confirm which one matches the Drive folder.
trainingFile =  "/gdrive/MyDrive/info 159 Project/ap4/train.txt"
devFile = "/gdrive/MyDrive/info 159 Project/ap4/dev.txt"
testFile = "/gdrive/MyDrive/info 159 Project/ap4/test.txt"
    
# ordinal values must be in order *as strings* from smallest to largest, e.g.:
# ordinal_values=["G", "PG", "PG-13", "R"]

ordinal_values=["low", "medium", "high"]

# Trains the classifier and prints the accuracy and label diagnostics.
run(trainingFile, devFile, testFile, ordinal_values)

Test accuracy for best dev model: 0.825, 95% CIs: [0.772 0.878]

Counter({0: 152, 1: 34, 2: 14})
['low', 'low', 'low', 'low', 'medium', 'low', 'high', 'medium', 'low', 'low', 'medium', 'low', 'low', 'low', 'high', 'low', 'low', 'low', 'low', 'high', 'low', 'low', 'low', 'low', 'low', 'medium', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'medium', 'high', 'low', 'low', 'low', 'low', 'low', 'medium', 'low', 'low', 'low', 'medium', 'low', 'low', 'low', 'low', 'low', 'high', 'low', 'low', 'low', 'low', 'low', 'high', 'low', 'medium', 'low', 'high', 'medium', 'medium', 'low', 'low', 'low', 'medium', 'high', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'medium', 'low', 'low', 'low', 'medium', 'low', 'low', 'low', 'medium', 'medium', 'medium', 'low', 'low', 'low', 'low', 'low', 'high', 'low', 'low', 'medium', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'medium', 'low', 'high', 'low', 'low', 'low'

In [None]:
orig_trainY = ["low", "high", "medium", "low"]

In [None]:
# Scratch check: replace each label string in orig_trainY, in place, with an
# integer code (low -> 0, medium -> 1, any other value -> 2).
# Re-running this cell on the already-converted list maps every entry to 2.
for i in range(len(orig_trainY)):
      if orig_trainY[i] == "low":
        orig_trainY[i] = 0
      elif orig_trainY[i] == "medium":
        orig_trainY[i] = 1
      else:
        orig_trainY[i] = 2

In [None]:
type(orig_trainY[0])

int

In our dataset, some texts contain rhetorical questions (e.g., "How many times do I have to say EMAIL IT?"). The true test labels mark all of them as low because they are not actually asking a question, but the model predicted medium. This introduces bias into our model: it cannot tell rhetorical questions from genuine ones, so it assigns them to the wrong label. Text that is not in English can also bias the model, because the model cannot tell whether a text is English or not, which makes such texts harder to categorize.

I think our dataset is very unbalanced: the training dataset has a total of 200 examples, but over 50% of them are labeled low, and only a small percentage are labeled medium or high. Because there are so many low examples, they add noise to the dataset, which causes many texts to be mislabeled as low. I think oversampling is a good strategy for our dataset because it contains too many low examples, which might lead to overfitting when we run our model. With oversampling, we can augment the minority classes (medium and high, in our case) so that they have more examples.

Moreover, we can use class weights to address the imbalance by setting a weight for each of our three categories. We can apply a smaller weight to the low category because it has the most examples, and larger weights to the medium and high categories, so that the classes are balanced during training.