# Sentiment Analysis

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 15, 5

import re
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import math
from operator import itemgetter

import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

from utils import *
from path import path

In [None]:
# Load the train/test CSV files from the project data folder.
# index_col=-1 presumably selects the last CSV column as the index - TODO confirm.
# NOTE(review): the '\\' separator makes these paths Windows-specific.
train_data = pd.read_csv(path + 'data_updated\\train.csv', index_col=-1)
test_data = pd.read_csv(path + 'data_updated\\test.csv', index_col=-1)

In [None]:
# Work on a copy so the loaded training frame is left untouched.
train = train_data.copy()

In [None]:
# Positive sentiments - 1
# Negative sentiments - 0

# Take a look at the data

In [None]:
# Display the working copy of the training data.
train

# Lyrics

In [None]:
# Independent copy of the raw lyrics column, displayed for inspection.
lyrics = train['lyrics'].copy()
lyrics

In [None]:
# Missing values: rows whose lyrics are NaN.
train[train['lyrics'].isna()]

In [None]:
# Already done in "project"

# Duplicates: rows whose lyrics repeat those of an earlier row.
train[train['lyrics'].duplicated()]

In [None]:
# Already done in "project"

# Rows duplicated on (year, lyrics) and on (views, lyrics).
# The trailing numbers are the author's notes (presumably row counts).
display(train[train.duplicated(['year', 'lyrics'])]) #85
display(train[train.duplicated(['views', 'lyrics'])]) #1

In [None]:
# Example case noted by the author:
# same year,
# same title,
# same lyrics,
# different artist

train[train['title'] == "Honeysuckle Rose"]

In [None]:
# Duplicate checks on different column subsets; the trailing numbers are the
# author's notes (presumably row counts).
# train[train.duplicated(['title', 'artist', 'features', 'lyrics'])] #0
# train[train.duplicated(['title', 'artist', 'lyrics'])] #0
display(train[train.duplicated(['title', 'lyrics'])]) #24 --- covers
display(train[train.duplicated(['artist', 'lyrics'])]) #111 --- labeled versions ("acoustic", "remix", "extended")

# Strange Values

In [None]:
# List the distinct values of the year column to spot strange entries.
train['year'].unique()

In [None]:
# Rows whose year is below 1000.
train[train['year'] < 1000]

# Text Preprocessing

In [None]:
# Text cleaning
    # Handle email addresses
    # Remove HTML tags
    # Word normalization
    # split into sentences

# Feature Extraction - Encode Text into Numbers
    # Vectorization
        # Freq vectors
        # One hot
    # BOW -  calculate the frequency of words for each document
        # 1. set of all words found in the document set
        # 2. Count how many times each word appears for each document
    # TF-IDF

In [None]:
def sub_remove_2(x):
    """Strip e-mail addresses, mentions, URLs and HTML tags from a string.

    Differences from ``sub_remove``:
    - the step that stripped emojis (``[^0-9A-Za-z]``) was dropped;
    - e-mail addresses and HTML tags are removed as well.

    Args:
        x: Text to clean (``str``).

    Returns:
        The cleaned string.
    """
    # Remove e-mail addresses FIRST: the "@mention" pattern below would
    # otherwise eat the "@domain" part and leave "user.com" behind, so the
    # address would never be recognised. The TLD class is [A-Za-z] (the old
    # "[A-Z|a-z]" also accepted a literal "|").
    x = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', x)

    # Remove noise: @mentions, scheme://urls and "rt" at the start of a line.
    # NOTE(review): the lazy "http.+?" only consumes "http" plus one character.
    x = re.sub(r"(@[A-Za-z0-9]+)|(\w+:\/\/\S+)|^rt|http.+?", "", x, flags=re.MULTILINE)

    # Replace newline and tab characters with spaces
    x = re.sub(r'[\t\n]', ' ', x)

    # Remove html tags
    x = re.sub(r'<.*?>', '', x)

    # Remove isolated consonants.
    # NOTE(review): the class is "anything but a lowercase vowel", so an
    # isolated digit, or a single punctuation character between two word
    # characters, is replaced by a space too.
    x = re.sub(r'\b([^aeiou])\b', ' ', x)

    # Remove whitespace that is not followed by a word character
    # (e.g. a space before punctuation, trailing or doubled spaces)
    x = re.sub(r'(\s)(?!\w)', '', x)

    return x

In [None]:
def sentiment_preprocesser(data, text_column, target=None):
    """Clean a text column and, when ``target`` is given, tokenize it.

    Stopwords are deliberately kept.

    Args:
        data: Container indexed by column name whose ``text_column`` entry
            supports ``.copy()`` and ``.apply`` (presumably a DataFrame).
        text_column: Name of the column holding the raw text.
        target: When not ``None``, every cleaned text is split into tokens
            (words and emoji runs). The value itself is not used otherwise.

    Returns:
        The cleaned column; its entries are token lists when ``target`` is
        given, otherwise the output of the cleaning steps.
    """
    cleaned = data[text_column].copy()

    # Cleaning steps, applied element-wise in this exact order.
    cleaning_steps = (
        lambda text: text.lower(),
        expand_contractions,
        sub_remove_2,
        sub_spaces,
    )
    for step in cleaning_steps:
        cleaned = cleaned.apply(step)

    if target is None:
        return cleaned

    emoji_ranges = (
        "\U0001F600-\U0001F64F",  # Emoticons
        "\U0001F300-\U0001F5FF",  # Symbols & pictographs
        "\U0001F680-\U0001F6FF",  # Transport & map symbols
        "\U0001F700-\U0001F77F",  # Alchemical symbols
        "\U0001F780-\U0001F7FF",  # Geometric shapes
        "\U0001F800-\U0001F8FF",  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF",  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F",  # Chess symbols
        "\U0001FA70-\U0001FAFF",  # Symbols and Pictographs Extended-A
    )
    emoji_pattern = re.compile("[" + "".join(emoji_ranges) + "]+", flags=re.UNICODE)

    # Emojis are kept as tokens next to ordinary words.
    tokenizer = RegexpTokenizer(r'\w+|' + emoji_pattern.pattern)

    # NOTE: a later stage (POS-mapped WordNet lemmatization, stopword removal,
    # re-joining the tokens and attaching the target column) used to follow
    # here and is currently disabled.
    return cleaned.apply(tokenizer.tokenize)

In [None]:
# Passing 'tag' as target makes sentiment_preprocesser tokenize the cleaned lyrics.
lyrics_preproc  = sentiment_preprocesser(train, 'lyrics', 'tag')

In [None]:
# Flatten the per-lyric token lists into one list of tokens.
words = [token for token_list in lyrics_preproc for token in token_list]
print(words)

# Keep each distinct token once so it is only tagged once.
words_unique = list(set(words))
print(words_unique)

# Part-of-speech tag every distinct token.
words_tagged = pos_tag(words_unique)
print(words_tagged)

# Map each token to the result of get_wordnet_pos for its tag.
words_pos_map = {token: get_wordnet_pos(tag) for token, tag in words_tagged}
print(words_pos_map)

# Feature Extraction

In [None]:
# Bag-of-Words (BoW) Model
#     create a dictionary of all the words used in the corpus
#     convert each document to a vector that represents words available in the documents
#     identify importance of words:
#         Count Vector Model

#         Term Frequency Vector Model

#         Term Frequency-Inverse Document Frequency(TF-IDF) Model

# creating count vectors for the dataset

# Displaying Document Vectors

# Removing Low-Frequency Words

# Removing Stop Words

# Distribution of words Across Different sentiment

# Algorithms 

In [None]:
# Algorithms
#     Rule-Based 
#         Based on a set of manually crafted rules
#         can't learn or adapt beyond what they were initially programmed for
#         can't easily change them or add new rules
#         can't think for themselves or make decisions outside of those rules

#         1. Construct explicit rules and patterns
#         2. use lexicons - dictionary-based systems that rely on lists of words or phrases with associated sentiment scores
#             VADER
#             TEXTBLOB
#             AFINN Lexicon
#             SentiWordNet
#             Bing Liu’s lexicon


#     Automatic
#         ML algorithms (SVM, NN, NB, …)
#     Hybrid
#         Rule-Based + Automatic

In [None]:
##### EMOJIS
# VADER performs very well with emojis, slang and acronyms in sentences

import re

# Character class covering the Unicode ranges treated as emojis
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F700-\U0001F77F"
    "\U0001F780-\U0001F7FF"
    "\U0001F800-\U0001F8FF"
    "\U0001F900-\U0001F9FF"
    "\U0001FA00-\U0001FA6F"
    "\U0001FA70-\U0001FAFF"
    "]",
    flags=re.UNICODE,
)

# Print the emojis found in every lyric that contains at least one.
for i in lyrics:
    if emoji_pattern.search(i) is None:
        continue
    # print(i)
    print(emoji_pattern.findall(i))

##### TODO: delete the emojis not related to emotions and keep the others???

In [None]:
### tag
# Independent copy of the 'tag' column.
tag = train['tag'].copy()

# functions = [lambda x: x.lower(), 
#                 expand_contractions, 
#                 sub_remove, 
#                 sub_spaces]

# for function in functions:
#     text_data = text_data.apply(function)

# if target is not None:
#     regexp = RegexpTokenizer('\w+')
#     text_data = text_data.apply(regexp.tokenize)

#     words = [word for tokens in text_data for word in tokens]
#     words_unique = list(set(words))
#     words_tagged = pos_tag(words_unique)
#     words_pos_map = {word: get_wordnet_pos(pos_tag) for word, pos_tag in words_tagged}

#     # lemmatizer = WordNetLemmatizer()
#     # text_data = [
#     #     [lemmatizer.lemmatize(word, pos=words_pos_map.get(word)) for word in sentence]
#     #     for sentence in text_data
#     #     ]
    
#     # stopwords = nltk.corpus.stopwords.words('english')

#     additional_functions = [lemmatize_with_mapping,
#                 #   lambda x: [item for item in x if item not in stopwords],
#                     lambda x: ' '.join(x)
#                     ]

#     for additional_function in additional_functions:
#         text_data = text_data.apply(additional_function, args=(words_pos_map,) 
#                                     if additional_function == lemmatize_with_mapping else ())

# # if target is not None:
#     text_data = pd.DataFrame(text_data, columns=[text_column])
#     text_data[target] = data[target]

In [None]:
# lyrics_train = pd.read_csv(r'C:\Users\bruna\Desktop\data_updated\lyrics_train.csv', index_col=-1)

In [None]:
# NOTE(review): the only visible assignment of lyrics_train is the commented-out
# read_csv in the previous cell; unless `utils` exports it, this raises NameError.
lyrics_train

In [None]:
# Simplest task -> positive or negative
# More complex -> rank the attitude of this text from 1 to 5
# Advanced -> detect the target, source, or complex attitude type

In [None]:
# ### VADER
# {'compound': 0.6588, 'neg': 0.0, 'neu': 0.406, 'pos': 0.594}
# [-1] (Extremely Negative)
# [1] (Extremely Positive)
# [0] Neutral or Neither

In [None]:
# TEXTBLOB
# `Sentiment(polarity=1.0, subjectivity=0.75)`

# polarity → measures the sentiment or emotional tone of the text

# - ranges between [-1, 1]
#     - -1 indicates a highly negative sentiment
#     - 0 indicates a neutral sentiment
#     - 1 indicates a highly positive sentiment
    

# subjectivity → measures how objective or subjective the text is

# - ranges between [0, 1]
#     - 0 indicates a highly objective piece of text → fact-based content
#     - 1 indicates a highly subjective (opinionated) piece of text → personal opinions, emotions, judgements

In [None]:
# Challenges:
#     phrase with negation
#     negation, inverted word order - Disliking horror movies is not uncommon
#     The adverb sometimes modifies the sentiment - e.g. "sometimes"
#     sarcasm
#     negative term used in a positive way
#     difficult to categorize

#     Objective / Subjective
#     Context and Polarity

In [None]:
# Context-Dependent Errors:
#     Sarcasm
#     Polarity
#     Polysemy
#     Emojis -  emojis sometimes cannot be classified accurately and thus are removed from many analysis
#         (If those are removed from text, one ends up with a noncomprehensive analysis)
#     gender stereotypes