# Pos Neg Tagger

## Imports

In [1]:
import re
import spacy
import pandas as pd
import numpy as np
import json
import random as rand

nlp = spacy.load("en_core_web_md")   # Load language model

## Stage 1 - Data Cleaning

In [2]:
pos_list, neg_list = None, None

with open('Datasets/Raw/rt-polarity.pos', 'r') as pos_reader:
    pos_list = pos_reader.readlines()
    
with open('Datasets/Raw/rt-polarity.neg', 'r') as neg_reader:
    neg_list = neg_reader.readlines()

### Create DataFrames

In [3]:
pos_df = pd.DataFrame(data = {'raw_sentence': pos_list, 'tag': ['positive' for _ in pos_list]})
neg_df = pd.DataFrame(data = {'raw_sentence': neg_list, 'tag': ['negative' for _ in neg_list]})

# Combine pos_df & neg_df
comments_df = pos_df.append(neg_df)
comments_df

Unnamed: 0,raw_sentence,tag
0,the rock is destined to be the 21st century's ...,positive
1,"the gorgeously elaborate continuation of "" the...",positive
2,effective but too-tepid biopic\n,positive
3,if you sometimes like to go to the movies to h...,positive
4,"emerges as something rare , an issue movie tha...",positive
...,...,...
5326,a terrible movie that some people will neverth...,negative
5327,there are many definitions of 'time waster' bu...,negative
5328,"as it stands , crocodile hunter has the hurrie...",negative
5329,the thing looks like a made-for-home-video qui...,negative


### Clean Data

In [4]:
def clean_sentence(text):

    # Remove leading whitespaces, then convert to Spacy Doc object
    doc = nlp(text.strip())
    
    # Remove punctuations & lemmatize
    stopwords = nlp.Defaults.stop_words
    text = ' '.join([token.lemma_ for token in doc if not token.is_punct])
    
    # Manual mapping
    text = text.replace('n\'t', 'not')

    # Remove other punctuations
    return re.sub('\'s', '', text)

# Clean sentences
comments_df['cleaned_sentence'] = comments_df['raw_sentence'].apply(clean_sentence)

In [5]:
comments_df.to_csv('Datasets/Processed/comments_df.csv', index = False)

## Stage 2 - Unigrams & Bigrams

In [6]:
comments_df = pd.read_csv('Datasets/Processed/comments_df.csv')
comments_df

Unnamed: 0,raw_sentence,tag,cleaned_sentence
0,the rock is destined to be the 21st century's ...,positive,the rock be destine to be the 21st century ne...
1,"the gorgeously elaborate continuation of "" the...",positive,the gorgeously elaborate continuation of the l...
2,effective but too-tepid biopic\n,positive,effective but too tepid biopic
3,if you sometimes like to go to the movies to h...,positive,if you sometimes like to go to the movie to ha...
4,"emerges as something rare , an issue movie tha...",positive,emerge as something rare an issue movie that b...
...,...,...,...
10657,a terrible movie that some people will neverth...,negative,a terrible movie that some people will neverth...
10658,there are many definitions of 'time waster' bu...,negative,there be many definition of time waster but th...
10659,"as it stands , crocodile hunter has the hurrie...",negative,as it stand crocodile hunter have the hurry ba...
10660,the thing looks like a made-for-home-video qui...,negative,the thing look like a make for home video quickie


### Word Counts & Bigrams

In [7]:
pos_unigrams, neg_unigrams, pos_bigrams, neg_bigrams = [], [], [], []

for index, row in comments_df.iterrows():
    
    comment = row['cleaned_sentence']
    tag = row['tag']
    
    if pd.isnull(comment):
        break
    
    # Unigrams
    if tag == 'positive':
        pos_unigrams += comment.split(' ')
    else:
        neg_unigrams += comment.split(' ')
    
    # bigrams
    if tag == 'positive':
        pos_bigrams += [' '.join(bigram) for bigram in zip(comment.split(" ")[:-1], comment.split(" ")[1:])]
    else:
        neg_bigrams += [' '.join(bigram) for bigram in zip(comment.split(" ")[:-1], comment.split(" ")[1:])]

In [8]:
# Get count and store as dictionary
pos_unigrams_dict = pd.Series(pos_unigrams)[10:-10].value_counts().to_dict()
neg_unigrams_dict = pd.Series(neg_unigrams)[10:-10].value_counts().to_dict()

pos_bigrams_dict = pd.Series(pos_bigrams)[10:-10].value_counts().to_dict()
neg_bigrams_dict = pd.Series(neg_bigrams)[10:-10].value_counts().to_dict()

# Save JSONs
with open('Datasets/Processed/vocab.json', 'w') as write:
    output = {
        'pos_unigrams_dict':pos_unigrams_dict,
        'neg_unigrams_dict': neg_unigrams_dict,
        'pos_bigrams_dict': pos_bigrams_dict,
        'neg_bigrams_dict': neg_bigrams_dict
    }
    
    json.dump(output, write)

## Stage 3 - Test & Debug

### Calculate probabilities

In [9]:
# Load JSONs
with open('Datasets/Processed/vocab.json', 'r') as read:
    vocab = json.load(read)

pos_unigrams, neg_unigrams, pos_bigrams, neg_bigrams = [vocab[key] for key in vocab.keys()]

In [10]:
def unigram_prob(word, unigrams):
    ''' Calculates probability of the unigram '''
    
    # Just in case
    if word not in unigrams:
        return 0
        
    word_count = unigrams[word]
    unigram_size = len(unigrams)
    return word_count / unigram_size * 10

In [11]:
def bigram_prob(first_word, second_word, alpha, beta, gamma, coefficient, unigrams, bigrams):
    ''' Calculates probability of the bigram '''
    
    bigram = ' '.join([first_word, second_word])
    
    # Alpha prob
    if bigram not in bigrams:
        alpha_prob = 0
    else:
        bigram_count = bigrams[bigram]
        alpha_prob = alpha * (bigram_count / unigrams[first_word]) * 10
    
    # Beta prob
    beta_prob = beta * unigram_prob(second_word, unigrams)
    
    # coefficient
    coef = gamma * coefficient * 10
    
    return alpha_prob + beta_prob + coef

In [12]:
def sentence_prob(sentence, alpha, beta, gamma, coefficient, unigrams, bigrams):
    ''' Calculates probability of the sentence '''
    
    # Clean & get sentence tokens
    cleaned_sentence = clean_sentence(sentence)
    words = cleaned_sentence.split(' ')
    
    # Calculate the first unigram prob
    probability = unigram_prob(words[0], unigrams)
    
    bigrams = [bigram for bigram in zip(cleaned_sentence.split(" ")[:-1], cleaned_sentence.split(" ")[1:])]
    for bigram in bigrams:
        probability *= bigram_prob(bigram[0], bigram[1], alpha, beta, gamma, coefficient, unigrams, bigrams)
        
    return round(probability, 10)

In [13]:
def get_sentiment(sentence, alpha, beta, gamma, coefficient, pos_unigrams, neg_unigrams, pos_bigrams, neg_bigrams):
    ''' Retusn sentiment of the sentence '''
    
    pos_prob = sentence_prob(sentence, alpha, beta, gamma, coefficient, pos_unigrams, pos_bigrams)
    neg_prob = sentence_prob(sentence, alpha, beta, gamma, coefficient, neg_unigrams, neg_bigrams)
    
    sentiment = 'POSITIVE' if (pos_prob > neg_prob) else 'NEGATIVE'
    
    return sentiment, pos_prob, neg_prob

In [14]:
while True:
    
    comment = input()
    
    if comment == '!q':
        break
    
    sentiment = get_sentiment(comment, 0.4, 0.3, 0.3, 0.5, pos_unigrams, neg_unigrams, pos_bigrams, neg_bigrams)
    print(sentiment)

 !q


In [22]:
test_case = [
    'loved the movie',                                        # POSITIVE
    'Terrible movie, totally hate it.',                       # NEGATIVE
    'What a waste of time watching this terible movie',       # NEGATIVE    
    'It was fun watching the movie. I was amazing for kids',  # POSITIVE
    'Good movie I really like it',                            # POSITIVE
    'Terrible plot, come one man',                            # NEGATIVE
    'good actors, well played.',                              # POSITIVE    
]

for test in test_case:
    sentiment, pos_prob, neg_prob = get_sentiment(test, 0.5, 0.3, 0.2, 0.29, pos_unigrams, neg_unigrams, pos_bigrams, neg_bigrams)
    print('+' if sentiment == 'POSITIVE' else '-', f'{sentiment} | POS: {pos_prob}\t NEG: {neg_prob}')

+ POSITIVE | POS: 0.264763386	 NEG: 0.1385596368
- NEGATIVE | POS: 0.0020942766	 NEG: 0.004196811
+ POSITIVE | POS: 0.0484180103	 NEG: 0.0391477328
- NEGATIVE | POS: 0.4519219794	 NEG: 0.5773533558
+ POSITIVE | POS: 0.0782496687	 NEG: 0.0629112068
- NEGATIVE | POS: 0.0011839071	 NEG: 0.0022096209
+ POSITIVE | POS: 0.0865648545	 NEG: 0.0557123813
+ POSITIVE | POS: 0.153699358	 NEG: 0.1493484698


## Finding best parameters

In [92]:
# tune_test_case = [
#     'I like the movie',
# ]

# while True:
    
#     r = [rand.random() for i in range(3)]
#     s = sum(r)
#     alpha, beta, gamma = [ i/s for i in r]
    
#     coeffitient = rand.random()
    
#     sentiment, pos_prob, neg_prob = get_sentiment(tune_test_case[0], alpha, beta, gamma, coeffitient, pos_unigrams, neg_unigrams, pos_bigrams, neg_bigrams)
    
#     if sentiment == 'POSITIVE':
#         print('Best Parametes:', alpha, beta, gamma, coeffitient)
#         break