<a href="https://colab.research.google.com/github/AmanPriyanshu/Natural-Language-Processing/blob/master/NaiveBayesForClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Downloading Dataset:

In [1]:
!mkdir -p data
!wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/sentiment-analysis-is-bad/data/sentiment140-subset.csv.zip -P data
!unzip -n -d data data/sentiment140-subset.csv.zip

File ‘data/sentiment140-subset.csv.zip’ already there; not retrieving.

Archive:  data/sentiment140-subset.csv.zip


## IMPORTS:

In [2]:
import nltk
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
from tqdm import tqdm
import string
import tensorflow as tf

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Only importing 30,000 values since this is more of practice and demo for refrence

In [3]:
df = pd.read_csv("data/sentiment140-subset.csv", nrows=30000)
print(df.head())

   polarity                                               text
0         0                      @kconsidder You never tweet  
1         0                 Sick today  coding from the couch.
2         1  @ChargerJenn Thx for answering so quick,I was ...
3         1  Wii fit says I've lost 10 pounds since last ti...
4         0  @MrKinetik Not a thing!!!  I don't really have...


### Let's count positives and negatives

In [4]:
df.polarity.value_counts()

1    15064
0    14936
Name: polarity, dtype: int64

In [5]:
df = df.values
print(df)

[[0 '@kconsidder You never tweet  ']
 [0 'Sick today  coding from the couch.']
 [1
  '@ChargerJenn Thx for answering so quick,I was afraid I was gonna crash twitter with all the spamming I did 2 RR..sorry bout that ']
 ...
 [1
  '@phnompenhpost thanks for the follow! u guys do a great job in reporting news about Cambodia...makes me proud to be cambodian ']
 [0
  "@coliwilso crapï¿½ I really wanted to make it for @minmï¿½ but I'm feeling way too tired after the whole weekend "]
 [1
  'follow friday- @theclassiccrime @jeremycamp @chris_daughtry &amp; @dannygokey ']]


In [6]:
polarity = df.T[0].flatten()
tweets = df.T[1].flatten()

In [7]:
tweets

array(['@kconsidder You never tweet  ',
       'Sick today  coding from the couch.',
       '@ChargerJenn Thx for answering so quick,I was afraid I was gonna crash twitter with all the spamming I did 2 RR..sorry bout that ',
       ...,
       '@phnompenhpost thanks for the follow! u guys do a great job in reporting news about Cambodia...makes me proud to be cambodian ',
       "@coliwilso crapï¿½ I really wanted to make it for @minmï¿½ but I'm feeling way too tired after the whole weekend ",
       'follow friday- @theclassiccrime @jeremycamp @chris_daughtry &amp; @dannygokey '],
      dtype=object)

## PREPROCESSING:

In [8]:
def stopwords_punctuation(arr):
  new_arr = []
  diction = {}
  for p in string.punctuation:
    diction.update({p:' '})
  for s in tqdm(arr):
    s = s.translate(str.maketrans(diction))
    new_arr.append(' '.join([i for i in s.split() if i not in stopwords.words('english')]))
  new_arr = np.array(new_arr)
  return new_arr

In [9]:
def stemming_lowercase(arr):
  porter = PorterStemmer()
  stemmed_arr = []
  for s in tqdm(arr):
    s = s.lower()
    stemmed_arr.append(' '.join([porter.stem(word) for word in s.split()]))
  stemmed_arr = np.array(stemmed_arr)
  return stemmed_arr

In [10]:
tweets = stopwords_punctuation(tweets)
tweets = stemming_lowercase(tweets)

100%|██████████| 30000/30000 [00:46<00:00, 650.61it/s]
100%|██████████| 30000/30000 [00:04<00:00, 6260.32it/s]


## TRAINING:

In [11]:
def word_freq(polarity_arr, str_arr):
  word_counts = {}
  total = [0, 0]
  logprior = [0, 0]
  for p, s in tqdm(zip(polarity_arr, str_arr), total=len(str_arr)):
    logprior[p] += 1
    s = s.split()
    for w in s:
      if w not in list(word_counts.keys()):
        word_counts.update({w:[1, 1]})
        total[0] += 1
        total[1] += 1
      word_counts[w][p] += 1
      total[p] += 1
  total = np.array(total)
  for w,f in word_counts.items():
    word_counts[w] = np.array(f)/total
  lambda_words = {} 
  for w,f in word_counts.items():
    lambda_words.update({w: np.log(f[1]/f[0])})
  logprior = np.array(logprior)
  logprior = logprior[1]/logprior[0]
  return lambda_words, logprior

In [12]:
lambda_words, logprior = word_freq(polarity[:int(0.5*len(tweets))], tweets[:int(0.5*len(tweets))])

100%|██████████| 15000/15000 [00:20<00:00, 719.69it/s]


## TESTING:

In [13]:
def testing(arr, lambda_words, logprior):
  y_pred = []
  for s in tqdm(arr):
    s = s.split()
    s = list(set(s))
    l = logprior
    for w in s:
      try:
        l += lambda_words[w]
      except:
        pass
    if l>0:
      y_pred.append(1)
    else:
      y_pred.append(0)
  y_pred = np.array(y_pred)
  return y_pred

In [14]:
y_pred = testing(tweets, lambda_words, logprior)
accuracy = 1 - np.mean(np.abs(y_pred - polarity))
print('\n\nAccuracy',accuracy)

100%|██████████| 30000/30000 [00:00<00:00, 156481.50it/s]



Accuracy 0.7925



