<a href="https://colab.research.google.com/github/sanosenx86/si-s-e-fivefivetwotwo/blob/lab2/5522_lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Part 1: A Simple Bayes Net: Naive Bayes

In [0]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [0]:
TweetUrl='https://github.com/aasiaeet/cse5522data/raw/master/db3_final_clean.csv'
tweet_dataframe=pd.read_csv(TweetUrl)

In [0]:
display(tweet_dataframe.shape)
tweet_dataframe.head()

(3697, 3)

Unnamed: 0,weight,tweet,label
0,1.0,it is very cold out want it to be warmer,-1
1,0.7698,dammmmmmm its pretty cold this morning burr lol,-1
2,0.6146,why does halsey have to be so far away think m...,-1
3,0.9356,dammit stop being so cold so can work out,-1
4,1.0,its too freakin cold,-1


In [0]:
# wordDict maps words to id
# X is the document-word matrix holding the presence/absence of words in each tweet
wordDict = {}
idCounter = 0
for i in range(tweet_dataframe.shape[0]):
  allWords = tweet_dataframe.iloc[i,1].split(" ")
  for word in allWords:
    if word not in wordDict:
      wordDict[word] = idCounter
      idCounter += 1
X = np.zeros((tweet_dataframe.shape[0], idCounter),dtype='float')

In [0]:
for i in range(tweet_dataframe.shape[0]):
  allWords = tweet_dataframe.iloc[i,1].split(" ")
  for word in allWords:
    X[i, wordDict[word]]  = 1

In [0]:
y = np.array(tweet_dataframe.iloc[:,2])
y[0:5]

array([-1, -1, -1, -1, -1])

In [0]:
w = np.array(tweet_dataframe.iloc[:,0])
w[0:5]

array([1.    , 0.7698, 0.6146, 0.9356, 1.    ])

In [0]:
class NaiveBayesClassfier:
  def __init__(self):
    self.logPriorNegative, self.logPriorPositive = 0.0, 0.0
    self.logProbWordAbsentGivenNegative, self.logProbWordAbsentGivenPositive, self.logProbWordPresentGivenNegative, self.logProbWordPresentGivenPositive = np.array([]), np.array([]), np.array([]), np.array([])
    self.is_fit=False
  # compute three distributions (four variables):
  def _compute_distros(self,x,y):
    # probWordGivenPositive: P(word|Sentiment = +ive)
    probWordGivenPositive=np.sum(x[y>=0,:],axis=0) #Sum each word (column) to count how many times each word shows up (in positive examples)
    probWordGivenPositive=probWordGivenPositive/np.sum(y>=0) #Divide by total number of (positive) examples to give distribution

    # probWordGivenNegative: P(word|Sentiment = -ive)
    probWordGivenNegative=np.sum(x[y<0,:],axis=0)
    probWordGivenNegative=probWordGivenNegative/np.sum(y<0)

    # priorPositive: P(Sentiment = +ive)
    priorPositive = np.sum(y>=0)/y.shape[0] #Number of positive examples vs. all examples
    # priorNegative: P(Sentiment = -ive)
    priorNegative = 1 - priorPositive
    #  (note these last two form one distribution)

    return probWordGivenPositive, probWordGivenNegative, priorPositive, priorNegative

  # compute the following:
  # logProbWordPresentGivenPositive
  # logProbWordAbsentGivenPositive
  # logProbWordPresentGivenNegative
  # logProbWordAbsentGivenNegative
  # logPriorPositive
  # logPriorNegative
  def _compute_logdistros(self,distros, min_prob):
      #Assume missing words are simply very rare
      #So, assign minimum probability to very small elements (e.g. 0 elements)
      distros=np.where(distros>=min_prob,distros,min_prob)
      #Also need to consider minimum probability for "not" distribution
      distros=np.where(distros<=(1-min_prob),distros,1-min_prob)
      return np.log(distros), np.log(1-distros)

  def train(self, x, y):
    min_prob = 1/y.shape[0] #Assume very rare words only appeared once
    probWordGivenPositive, probWordGivenNegative, priorPositive, priorNegative = self._compute_distros(x, y)
    self.logProbWordPresentGivenPositive, self.logProbWordAbsentGivenPositive = self._compute_logdistros(probWordGivenPositive,min_prob)
    self.logProbWordPresentGivenNegative, self.logProbWordAbsentGivenNegative = self._compute_logdistros(probWordGivenNegative,min_prob)
    self.logPriorPositive, self.logPriorNegative = self._compute_logdistros(priorPositive,min_prob)
    # Did this work, or did you get an error?  (Read below.)
    # display(self.logProbWordPresentGivenPositive[0:5])
    # display(self.logProbWordAbsentGivenPositive[0:5])
    # display(self.logProbWordPresentGivenNegative[0:5])
    # display(self.logProbWordAbsentGivenNegative[0:5])
    # display(self.logPriorPositive, self.logPriorNegative)
    self.is_fit = True

  def classifyNBWithAbsent(self, words):
    # fill in function definition here
    if self.is_fit:
      pPos = np.dot(1-words, self.logProbWordAbsentGivenPositive) + np.dot(words, self.logProbWordPresentGivenPositive) + self.logPriorPositive
      pNeg = np.dot(1-words, self.logProbWordAbsentGivenNegative) + np.dot(words, self.logProbWordPresentGivenNegative) + self.logPriorNegative
      pred = pPos-pNeg
      return np.sign(pred), np.abs(pred)
    raise RuntimeError("Error: model has not been trained.")

  def classifyNBWithoutAbsent(self, words):
    # fill in function definition here
    if self.is_fit:
      pPos = np.dot(words, self.logProbWordPresentGivenPositive) + self.logPriorPositive
      pNeg = np.dot(words, self.logProbWordPresentGivenNegative) + self.logPriorNegative
      pred = pPos-pNeg
      return np.sign(pred), np.abs(pred)
    raise RuntimeError("Error: model has not been trained.")


In [0]:
def accuracy(pred, std):
  return np.sum(np.equal(pred, std))/float(len(std))

In [0]:
xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size = 0.2, random_state=42)
display(xTrain.shape, xTest.shape, yTrain.shape, yTest.shape)
model = NaiveBayesClassfier()
model.train(xTrain, yTrain)
print(accuracy((model.classifyNBWithAbsent(xTest))[0], yTest),accuracy((model.classifyNBWithoutAbsent(xTest))[0], yTest))

(2957, 5989)

(740, 5989)

(2957,)

(740,)

0.8162162162162162 0.827027027027027


In [0]:
accAbs, accNoAbs=np.array([]), np.array([])
for i in range(10):
  xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size = 0.2)
  model.train(xTrain, yTrain)
  accAbs = np.append(accAbs, accuracy((model.classifyNBWithAbsent(xTest))[0], yTest))
  accNoAbs = np.append(accNoAbs, accuracy((model.classifyNBWithoutAbsent(xTest))[0], yTest))

print("Mean accuracy of model with absent words are {0}, the standard deviation is {1}.".format(np.average(accAbs), np.std(accAbs)))
print("Mean accuracy of model without absent words are {0}, the standard deviation is {1}.".format(np.average(accNoAbs), np.std(accNoAbs)))

Mean accuracy of model with absent words are 0.8289189189189189, the standard deviation is 0.01078036252712386.
Mean accuracy of model without absent words are 0.829864864864865, the standard deviation is 0.011694460614226987.


The difference between two models is not consistent and the effect size is fairly small, so I cannot tell which is better in general. However, after several trials, on the same train and test sets, the model without considering the absent word usually have a better performance than the model counting the absent words. (difference is usually on ~0.0005 level.)

#Part 2: Include weight factor in Naive Bayes model.

In [0]:
class WeightedNaiveBayesClassifier(NaiveBayesClassfier):
  def __init__(self):
    super(WeightedNaiveBayesClassifier, self).__init__()
  
  def train(self, weight, x, y):
    weighted_x=np.multiply(x.T, weight.T).T
    super(WeightedNaiveBayesClassifier, self).train(weighted_x, y)

In [0]:
wTrain, wTest, xTrain, xTest, yTrain, yTest = train_test_split(w, X, y, test_size = 0.2, random_state=42)
model = NaiveBayesClassfier()
model.train(xTrain, yTrain)
model_w = WeightedNaiveBayesClassifier()
model_w.train(wTrain, xTrain, yTrain)
print(accuracy((model.classifyNBWithAbsent(xTest))[0], yTest),accuracy((model.classifyNBWithoutAbsent(xTest))[0], yTest))
print(accuracy((model_w.classifyNBWithAbsent(xTest))[0], yTest),accuracy((model_w.classifyNBWithoutAbsent(xTest))[0], yTest))

0.8162162162162162 0.827027027027027
0.8135135135135135 0.8054054054054054


In [0]:
accAbs, accNoAbs, accwAbs, accwNoAbs=np.array([]), np.array([]), np.array([]), np.array([])
model = NaiveBayesClassfier()
model_w = WeightedNaiveBayesClassifier()
for i in range(10):
  xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size = 0.2)
  model.train(xTrain, yTrain)
  accAbs = np.append(accAbs, accuracy((model.classifyNBWithAbsent(xTest))[0], yTest))
  accNoAbs = np.append(accNoAbs, accuracy((model.classifyNBWithoutAbsent(xTest))[0], yTest))
  model_w.train(wTrain, xTrain, yTrain)
  accwAbs = np.append(accwAbs, accuracy((model_w.classifyNBWithAbsent(xTest))[0], yTest))
  accwNoAbs = np.append(accwNoAbs, accuracy((model_w.classifyNBWithoutAbsent(xTest))[0], yTest))

print("Mean accuracy of unweighted model with absent words = {0}, without absent words = {1}".format(np.mean(accAbs), np.mean(accNoAbs)))
print("Mean accuracy of weighted model with absent words = {0}, without absent words = {1}".format(np.mean(accwAbs), np.mean(accwNoAbs)))

Mean accuracy of unweighted model with absent words = 0.8314864864864864, without absent words = 0.8318918918918919
Mean accuracy of weighted model with absent words = 0.8268918918918919, without absent words = 0.8297297297297297


I simply multipled the weight of each sample with the count in the sample as the new samples. So, a sample with a higher weight will be counted more than those have a smaller weight, so the purpose of weight, measure the importance of each sample, is implemented. However, in my trial, generally speaking, the model performance, both considering the absent words or ignore them, decreased after I applied the weight. My method may not successfully incorperated the weight with corresponding samples. But, the difference between these models are very small (~0.004), I cannot tell if the weight really matters.