<a href="https://colab.research.google.com/github/FukudaBQ/Game2/blob/master/NLPTry.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setting up


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
TweetUrl = 'https://github.com/aasiaeet/cse5522data/raw/master/db3_final_clean.csv'
tweet_dataframe = pd.read_csv(TweetUrl)

# Split the text in column 1 of every row on single spaces, once, so that the
# vocabulary pass and the matrix-filling pass share the same tokens.
tweetTokens = [tweet_dataframe.iloc[row, 1].split(" ")
               for row in range(tweet_dataframe.shape[0])]

# wordDict maps each distinct word to an integer id, assigned in order of
# first appearance (the id is the word's column in X).
wordDict = {}
for tokens in tweetTokens:
  for token in tokens:
    # len(wordDict) is evaluated before insertion, so it is the next free id
    wordDict.setdefault(token, len(wordDict))
idCounter = len(wordDict)

# X is the document-word matrix: X[i, j] == 1 when word j occurs in tweet i,
# 0 otherwise (presence/absence, not counts).
X = np.zeros((tweet_dataframe.shape[0], idCounter), dtype='float')
for row, tokens in enumerate(tweetTokens):
  for token in tokens:
    X[row, wordDict[token]] = 1

# y is taken from column 2 (presumably the sentiment label of each tweet).
y = np.array(tweet_dataframe.iloc[:, 2])

# Empirical class frequencies over the whole data set; labels <= 0 are
# counted as negative here.
numNeg = np.sum(y <= 0)
numPos = len(y) - numNeg
probNeg = numNeg / (numNeg + numPos)
probPos = 1 - probNeg

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing.
# Note: random_state=0 fixes the random seed so this split is the same on
# every run. The repeated experiments further down must NOT fix the seed.
xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.2, random_state=0)
display(xTrain.shape, xTest.shape, yTrain.shape, yTest.shape)


(2957, 5989)

(740, 5989)

(2957,)

(740,)

Comparing the effect of incorporating the probabilities of words that are absent from a tweet, $P\left(x^i_j = 0 \mid +\right)$, vs. ignoring the absent words and only calculating values for words present in the tweet.

In [None]:
def compute_distros(x,y):
  """Estimate the Naive Bayes distributions from a labelled document-word matrix.

  Rows of ``x`` whose label satisfies ``y >= 0`` are treated as positive
  examples, rows with ``y < 0`` as negative examples.

  Args:
    x: 2-D array with one row per example and one column per word.
    y: 1-D array of labels, one per row of ``x``.

  Returns:
    A tuple ``(probWordGivenPositive, probWordGivenNegative, priorPositive,
    priorNegative)``: the per-word column sums within each class divided by
    that class's example count, followed by the fraction of positive
    examples and its complement.
  """
  positive_rows = y >= 0
  negative_rows = y < 0
  positive_count = np.sum(positive_rows)
  negative_count = np.sum(negative_rows)

  # P(word | +): column sums over the positive rows, normalised by how many
  # positive rows there are
  probWordGivenPositive = x[positive_rows, :].sum(axis=0) / positive_count

  # P(word | -): same computation over the negative rows
  probWordGivenNegative = x[negative_rows, :].sum(axis=0) / negative_count

  # P(+) and P(-) together form a single distribution over the two classes
  priorPositive = positive_count / y.shape[0]
  priorNegative = 1 - priorPositive

  return probWordGivenPositive, probWordGivenNegative, priorPositive, priorNegative

# Estimate the per-class word probabilities and the class priors from the
# training split only (xTrain / yTrain); the test split is not used here.
probWordGivenPositive, probWordGivenNegative, priorPositive, priorNegative = compute_distros(xTrain,yTrain)

In [None]:
# compute the following:
# logProbWordPresentGivenPositive
# logProbWordAbsentGivenPositive
# logProbWordPresentGivenNegative
# logProbWordAbsentGivenNegative
# logPriorPositive
# logPriorNegative
def compute_logdistros(distros, min_prob):
  """Return ``(log p, log(1 - p))`` after clamping ``p`` away from 0 and 1.

  Args:
    distros: probability, or array of probabilities, of a word being present
      (or of a class, when used for the prior).
    min_prob: smallest probability allowed for either outcome; values are
      forced into ``[min_prob, 1 - min_prob]`` before taking logs.

  Returns:
    A pair: the log of the clamped probabilities and the log of their
    complements.
  """
  lower_bound = min_prob
  upper_bound = 1 - min_prob

  # Words never seen in a class are assumed to be merely very rare, so
  # anything below the floor (e.g. an exact 0) is raised to the floor.
  raised = np.where(np.greater_equal(distros, lower_bound), distros, lower_bound)
  # The complement ("word absent") needs the same protection, so values are
  # also capped from above.
  clamped = np.where(np.less_equal(raised, upper_bound), raised, upper_bound)

  # Note: an alternative is to set the log for missing words to 0, which is
  # equivalent to ignoring the word (logP == 0 is the same as P == 1).
  log_present = np.log(clamped)
  log_absent = np.log(1 - clamped)
  return log_present, log_absent

# Floor for all probabilities: assume very rare words appeared in exactly one
# training tweet.
min_prob = 1 / len(yTrain)

# Log-priors first, then the per-word log-probabilities for each class; every
# call returns the (log p, log(1 - p)) pair.
logPriorPositive, logPriorNegative = compute_logdistros(priorPositive, min_prob)
(logProbWordPresentGivenNegative,
 logProbWordAbsentGivenNegative) = compute_logdistros(probWordGivenNegative, min_prob)
(logProbWordPresentGivenPositive,
 logProbWordAbsentGivenPositive) = compute_logdistros(probWordGivenPositive, min_prob)


In the classifyNB function, the variable `result` is the confidence of our prediction, expressed as the log of the ratio of posteriors: $\log \dfrac{P(\text{predicted label} \mid x^{(i)})}{P(\text{the other label} \mid x^{(i)})}$

In [None]:
# Note: you can also change the function definition if you wish to encapsulate all six log probs
# as one model; just make sure to follow through below

def classifyNB(words,logProbWordPresentGivenPositive, logProbWordAbsentGivenPositive, 
               logProbWordPresentGivenNegative, logProbWordAbsentGivenNegative, 
               logPriorPositive, logPriorNegative, ignore0=False):
  """Classify one tweet with the Naive Bayes rule.

  Args:
    words: binary vector of the tweet (1 where the word is present).
    logProbWordPresentGivenPositive: log P(x_j = 1 | +)
    logProbWordAbsentGivenPositive: log P(x_j = 0 | +)
    logProbWordPresentGivenNegative: log P(x_j = 1 | -)
    logProbWordAbsentGivenNegative: log P(x_j = 0 | -)
    logPriorPositive: log P(+)
    logPriorNegative: log P(-)
    ignore0: when True, words absent from the tweet contribute nothing;
      when False (default), their "absent" log-probabilities are added too.

  Returns:
    ``(label, confidence)``: label is 1 or -1 (a tie goes to 1) and
    confidence is the non-negative difference between the winning and the
    losing class score.
  """
  # Score of each class from its prior plus the words that are present
  positive_score = logPriorPositive + (words * logProbWordPresentGivenPositive).sum()
  negative_score = logPriorNegative + (words * logProbWordPresentGivenNegative).sum()

  if not ignore0:
    # Absent words are NOT ignored here: add log P(x_j = 0 | class) for them
    missing = 1 - words
    positive_score += (missing * logProbWordAbsentGivenPositive).sum()
    negative_score += (missing * logProbWordAbsentGivenNegative).sum()

  log_odds = positive_score - negative_score
  if log_odds < 0:
    # Negative class wins; report the magnitude as the confidence
    return -1, -log_odds
  return 1, log_odds

In [None]:
def testNB(xTest, yTest, 
           logProbWordPresentGivenPositive, logProbWordAbsentGivenPositive, 
           logProbWordPresentGivenNegative, logProbWordAbsentGivenNegative, 
           logPriorPositive, logPriorNegative, ignore0=False):
  """Classify every row of xTest and report the accuracy.

  Args:
    xTest: test data features, one row per example.
    yTest: true labels of the test data.
    logProbWordPresentGivenPositive: log P(x_j = 1 | +)
    logProbWordAbsentGivenPositive: log P(x_j = 0 | +)
    logProbWordPresentGivenNegative: log P(x_j = 1 | -)
    logProbWordAbsentGivenNegative: log P(x_j = 0 | -)
    logPriorPositive: log P(+)
    logPriorNegative: log P(-)
    ignore0: forwarded to classifyNB.

  Returns:
    The accuracy, i.e. the fraction of rows whose predicted label equals the
    true label (this is also printed).
  """
  total = yTest.shape[0]
  # Only the predicted label (element 0 of classifyNB's result) matters here;
  # the confidence is discarded.
  hits = sum(
    1
    for row in range(total)
    if classifyNB(xTest[row, ],
                  logProbWordPresentGivenPositive, logProbWordAbsentGivenPositive,
                  logProbWordPresentGivenNegative, logProbWordAbsentGivenNegative,
                  logPriorPositive, logPriorNegative,
                  ignore0=ignore0)[0] == yTest[row]
  )
  accuracy = hits/total
  print("The Naive Bayes Model's accuracy is : ", accuracy)
  return accuracy

# Evaluate the model fitted above on the held-out test split. ignore0=True
# is forwarded to classifyNB, so only words present in a tweet are scored.
testNB(xTest, yTest, 
       logProbWordPresentGivenPositive, logProbWordAbsentGivenPositive, 
       logProbWordPresentGivenNegative, logProbWordAbsentGivenNegative, 
       logPriorPositive, logPriorNegative,ignore0=True)

The Naive Bayes Model's accuracy is :  0.8283783783783784


0.8283783783783784

In [None]:
def repeat(count,X,y,ignore0=False):
  """Train and evaluate the classifier on ``count`` random 80/20 splits.

  Each trial draws a fresh, unseeded split of (X, y), fits the distributions
  on the training part and measures accuracy on the test part via testNB.
  The mean and standard deviation of the accuracies are printed; nothing is
  returned.

  Args:
    count: number of random splits to evaluate.
    X: document-word matrix.
    y: labels, one per row of X.
    ignore0: forwarded to testNB.
  """
  def run_trial():
    # No random_state here: every trial must see a different split
    x_fit, x_eval, y_fit, y_eval = train_test_split(X, y, test_size = 0.2)
    word_pos, word_neg, prior_pos, _prior_neg = compute_distros(x_fit, y_fit)
    # Probability floor: one occurrence among the training examples
    floor = 1/y_fit.shape[0]
    log_present_pos, log_absent_pos = compute_logdistros(word_pos, floor)
    log_present_neg, log_absent_neg = compute_logdistros(word_neg, floor)
    log_prior_pos, log_prior_neg = compute_logdistros(prior_pos, floor)
    return testNB(x_eval, y_eval,
                  log_present_pos, log_absent_pos,
                  log_present_neg, log_absent_neg,
                  log_prior_pos, log_prior_neg, ignore0=ignore0)

  scores = np.array([run_trial() for _ in range(count)])
  print("Average accuracy is :",scores.mean(),"\nAverage standard deviation is: ",scores.std())
# Run the 10-split experiment twice: once scoring absent words as well, once
# scoring only the words that are present in each tweet.
for banner, skip_absent in (
    ("The results of incorporating the probabilities of words that are absent:", False),
    ("The results of ingoring absent words: ", True),
):
  print(banner)
  repeat(10, X, y, ignore0=skip_absent)
    

The results of incorporating the probabilities of words that are absent:
The Naive Bayes Model's accuracy is :  0.8027027027027027
The Naive Bayes Model's accuracy is :  0.85
The Naive Bayes Model's accuracy is :  0.8337837837837838
The Naive Bayes Model's accuracy is :  0.8243243243243243
The Naive Bayes Model's accuracy is :  0.8364864864864865
The Naive Bayes Model's accuracy is :  0.8418918918918918
The Naive Bayes Model's accuracy is :  0.8351351351351352
The Naive Bayes Model's accuracy is :  0.8243243243243243
The Naive Bayes Model's accuracy is :  0.8189189189189189
The Naive Bayes Model's accuracy is :  0.831081081081081
Average accuracy is : 0.829864864864865 
Average standard deviation is:  0.012480079086112273
The results of ingoring absent words: 
The Naive Bayes Model's accuracy is :  0.8216216216216217
The Naive Bayes Model's accuracy is :  0.822972972972973
The Naive Bayes Model's accuracy is :  0.8351351351351352
The Naive Bayes Model's accuracy is :  0.827027027027027

Comparing the results of counting absent words with those of ignoring them, we find that the accuracies of the two approaches do not differ much. The average accuracy when counting absent words is slightly higher than when ignoring them here, but sometimes the latter exceeds the former. As a result, whether or not we ignore the absence of a word does not significantly impact the result.

2. Adding weight to compare the result

In [None]:
def repeat(count,X,y,weights,ignore0=False):
  """Weighted variant of the repeated random-split experiment.

  For each of ``count`` trials the rows of X, y and weights are split
  together (80/20, unseeded). The training weights are passed to
  compute_distros; the test weights are not used. The mean and standard
  deviation of the test accuracies are printed; nothing is returned.

  Args:
    count: number of random splits to evaluate.
    X: document-word matrix.
    y: labels, one per row of X.
    weights: per-example weights, one per row of X.
    ignore0: forwarded to testNB.
  """
  scores = []
  for _ in range(count):
    pieces = train_test_split(X, y, weights, test_size = 0.2)
    x_fit, x_eval, y_fit, y_eval, w_fit, _w_eval = pieces

    word_pos, word_neg, prior_pos, _prior_neg = compute_distros(x_fit, y_fit, weights=w_fit)

    # Probability floor: one occurrence among the training examples
    floor = 1/y_fit.shape[0]
    pos_logs = compute_logdistros(word_pos, floor)
    neg_logs = compute_logdistros(word_neg, floor)
    prior_logs = compute_logdistros(prior_pos, floor)

    scores.append(
      testNB(x_eval, y_eval,
             pos_logs[0], pos_logs[1],
             neg_logs[0], neg_logs[1],
             prior_logs[0], prior_logs[1], ignore0=ignore0)
    )

  scores = np.array(scores)
  print("Average accuracy is :",np.mean(scores),"\nAverage standard deviation is: ",np.std(scores))
    

In [None]:
# Get the per-tweet weights: column 0 of the data frame, copied into a NumPy
# array so it can be split alongside X and y.
# NOTE(review): presumably a labelling-confidence score - confirm against the data set.
weights = np.array(tweet_dataframe.iloc[:,0])

def compute_distros(x,y,weights=None):
  """Estimate weighted Naive Bayes distributions from a labelled document-word matrix.

  Rows with ``y >= 0`` are positive examples and rows with ``y < 0`` are
  negative examples. Each row contributes to its class in proportion to its
  weight, and each class is normalised by the total weight of its rows.

  Class membership is decided from the raw labels and the normalisers from
  the weights alone. Deriving both from ``y * weights`` (as was done before)
  only works for labels that are exactly +1/-1: a label of 0 or of any other
  magnitude makes numerator and denominator disagree and can push the
  "probabilities" outside [0, 1].

  Args:
    x: 2-D array with one row per example and one column per word.
    y: 1-D array of labels, one per row of ``x``.
    weights: optional 1-D array of non-negative per-example weights. When
      omitted every example has weight 1, which gives the plain unweighted
      estimate, so callers written for the two-argument version keep working.

  Returns:
    ``(probWordGivenPositive, probWordGivenNegative, priorPositive,
    priorNegative)``.
  """
  y = np.asarray(y)
  if weights is None:
    weights = np.ones(y.shape[0], dtype=float)
  else:
    weights = np.asarray(weights, dtype=float)

  positive_rows = y >= 0
  negative_rows = y < 0

  # Scale every example's word indicators by that example's weight
  weighted_x = np.multiply(x, weights.reshape((-1, 1)))

  # Total weight carried by each class
  positive_mass = np.sum(weights[positive_rows])
  negative_mass = np.sum(weights[negative_rows])

  # P(word | class): weighted word mass divided by the class's total weight
  probWordGivenPositive = np.sum(weighted_x[positive_rows, :], axis=0) / positive_mass
  probWordGivenNegative = np.sum(weighted_x[negative_rows, :], axis=0) / negative_mass

  # P(+) is the positive share of the total weight; P(-) is its complement
  priorPositive = positive_mass / (positive_mass + negative_mass)
  priorNegative = 1 - priorPositive

  return probWordGivenPositive, probWordGivenNegative, priorPositive, priorNegative

# Run the weighted experiment on 10 random splits (absent words are scored,
# since ignore0 keeps its default of False).
repeat(10,X,y,weights=weights)

The Naive Bayes Model's accuracy is :  0.8445945945945946
The Naive Bayes Model's accuracy is :  0.7986486486486486
The Naive Bayes Model's accuracy is :  0.8189189189189189
The Naive Bayes Model's accuracy is :  0.831081081081081
The Naive Bayes Model's accuracy is :  0.8243243243243243
The Naive Bayes Model's accuracy is :  0.8364864864864865
The Naive Bayes Model's accuracy is :  0.8378378378378378
The Naive Bayes Model's accuracy is :  0.8405405405405405
The Naive Bayes Model's accuracy is :  0.831081081081081
The Naive Bayes Model's accuracy is :  0.8108108108108109
Average accuracy is : 0.8274324324324324 
Average standard deviation is:  0.013675381900487399


Comparing this attempt (using the weights when calculating the probabilities) with the previous ones, the accuracy does not change much.
This might be because the presence or absence of certain words has a more direct impact on the classification.

Find top sticky pairs (not finished; doesn't count)

In [None]:
print(wordDict)

# Count how often each ordered pair of adjacent words occurs, keyed by the
# pair of word ids from wordDict.
pairs = dict()
rank = []
for row in range(tweet_dataframe.shape[0]):
  ids = [wordDict[token] for token in tweet_dataframe.iloc[row,1].split(" ")]
  # zip(ids, ids[1:]) walks the tweet as (word, next word) pairs
  for pair in zip(ids, ids[1:]):
    pairs[pair] = pairs.get(pair, 0) + 1

# Collect every count (for a later ranking) and print the pairs seen more
# than 200 times.
for pair, occurrences in pairs.items():
  rank.append(occurrences)
  if occurrences > 200:
    print(pair, occurrences)

# Number of tweets that contain the word with id 10
print(np.sum(X[:,10]))


  

(56, 9) 240
(13, 56) 252
(42, 56) 327
(144, 42) 203
(56, 1) 250
1.0
