# Preprocessing and Polarity Analysis

This notebook preprocesses a corpus of film reviews and applies dictionary-based polarity analysis to them.

In [1]:
import numpy as np
import pandas as pd
import random
import re
import nltk
from tabulate import tabulate
from bs4 import BeautifulSoup as bs

### Extracting the corpus

In [2]:
def extract_lines(corpus, first_id = 2, last_id = 4380):
    """Load review texts and their ranks from the corpus directory.

    For every id in ``first_id..last_id`` (inclusive) two files are read:

    * ``<corpus><id>.xml`` -- the ``rank`` attribute of its ``<review>``
      element is converted to ``int`` and used as the label;
    * ``<corpus><id>.review.pos`` -- the second whitespace-separated field
      of every line is collected and the fields are joined with single
      spaces to form the review text (presumably this is the lemma column
      of the POS-tagged file -- TODO confirm against the corpus docs).

    A review is only added when BOTH files could be read, so the two
    returned lists are always index-aligned.

    Parameters
    ----------
    corpus : str
        Path prefix that the id and the file extension are appended to
        (normally a directory path ending with a separator).
    first_id, last_id : int, optional
        Inclusive range of review ids to try. The defaults reproduce the
        range that was previously hard-coded.

    Returns
    -------
    tuple(list[str], list[int])
        ``(X, y)``: the review texts and their ranks.
    """
    X = list()
    y = list()
    for i in range(first_id, last_id + 1):
        prefix = corpus + str(i)
        try:
            with open(prefix + '.xml', 'r', encoding = 'latin-1') as rfile:
                bs_content = bs(rfile.read(), "lxml")
            with open(prefix + '.review.pos', 'r', encoding = 'latin-1') as rfile:
                pos_lines = rfile.readlines()
        except IOError:
            # Some ids in the range have no files on disk; skip the id
            # entirely so that X and y never get out of step.
            continue

        rank = int(bs_content.find("review").get("rank"))

        # Blank lines (and any line without a second field) carry no token.
        fields_per_line = (line.split() for line in pos_lines)
        review_tokens = [fields[1] for fields in fields_per_line if len(fields) > 1]

        X.append(' '.join(review_tokens))
        y.append(rank)

    return X, y

In [3]:
def tokenize_lines_by_words(lines):
    """Lower-case every text in ``lines`` and split it into word tokens.

    Returns a list with one token list (as produced by
    ``nltk.word_tokenize``) per input text, in the same order.
    """
    return [nltk.word_tokenize(text.lower()) for text in lines]

In [4]:
def clean_alphabetic_text_lines(lines):
    """Strip disallowed characters from every token of every line.

    ``lines`` is an iterable of token lists. Each token is rebuilt from the
    characters that match the pattern below (lower-case ASCII letters, the
    accented Spanish letters listed in it, and the literal ``+`` and ``$``).
    Tokens that end up empty are dropped. Returns a new list of token lists.

    NOTE(review): ``+`` and ``$`` sit inside the character class, so they
    are kept as literal characters. If ``[...]+$`` was the intent, the
    pattern needs changing -- confirm before touching it.
    """
    def keep_allowed(word):
        # Test one character at a time so that disallowed characters are
        # removed from inside a word instead of rejecting the whole word.
        return ''.join(ch for ch in word if re.match(r'^[a-záéíóúñü+$]', ch))

    cleaned_lines = []
    for words in lines:
        stripped = (keep_allowed(word) for word in words)
        cleaned_lines.append([tok for tok in stripped if tok != ''])

    return cleaned_lines

In [5]:
def remove_stop_words(lines):
    """Remove Spanish stop words and re-join each line into a string.

    Parameters
    ----------
    lines : iterable of list[str]
        Tokenized texts.

    Returns
    -------
    list[str]
        One space-separated string per input line, containing only the
        tokens that are not in NLTK's Spanish stop-word list.
    """
    # A set gives O(1) membership tests; the NLTK list would be scanned
    # linearly for every single token.
    stopwords = set(nltk.corpus.stopwords.words('spanish'))
    return [
        ' '.join(word for word in line if word not in stopwords)
        for line in lines
    ]

In [6]:
def get_X_y(lines):
    """Split every line into its features and its trailing tag.

    Parameters
    ----------
    lines : iterable of sequences
        Each element holds the feature tokens followed by the tag as its
        last item.

    Returns
    -------
    list
        ``[X, y]`` where ``X[i]`` is a list with every item of ``lines[i]``
        except the last, and ``y[i]`` is that last item.

    Raises
    ------
    IndexError
        If a line is empty (there is no tag to extract).
    """
    X = list()
    y = list()
    for line in lines:
        # Read the tag first so an empty line still raises IndexError.
        tag = line[-1]
        # Slice instead of pop(): the caller's lists are left untouched.
        X.append(list(line[:-1]))
        y.append(tag)
    return [X, y]

In [7]:
def transform_tag(y):
    """Encode tags as a NumPy array: 1 for ``'spam'``, 0 for anything else."""
    return np.array([1 if tag == 'spam' else 0 for tag in y])

In [8]:
# Load the review texts (X) and their integer ranks (y) from the corpus directory.
X, y = extract_lines('./../corpusCriticasCine/corpusCriticasCine/')

In [9]:
# Number of review texts that were loaded.
len(X)

3878

In [10]:
# Number of rank labels that were loaded; should match len(X).
len(y)

3878

In [11]:
# Lower-case every review and split it into word tokens.
tokenized_X = tokenize_lines_by_words(X)

In [12]:
# Strip the characters rejected by clean_alphabetic_text_lines; tokens left empty are dropped.
new_X = clean_alphabetic_text_lines(tokenized_X)

In [13]:
# Drop Spanish stop words; each review becomes one space-separated string again.
clean_X = remove_stop_words(new_X)

In [14]:
# Pair every cleaned review with its rank so both can be shuffled together.
data = list(zip(clean_X, y))

In [15]:
# Shuffle with a dedicated, fixed-seed generator so that Restart & Run All
# always produces the same ordering (and leaves the global RNG untouched).
random.Random(42).shuffle(data)

In [16]:
# Unzip the shuffled pairs; clean_X and y are rebound as parallel tuples.
clean_X, y = zip(*data)

In [17]:
# Display the cleaned, shuffled reviews.
clean_X

('eliroth apadrinar nuevamente personaje importante caso inefable quentintarantino insistir llamar atención crear polémica decualquierforma posible atraer espectador visionar obra haber ofrecer práctico soler ser acercar gente renombre llamar atención intentar sembrar pánico ambos caso dar persona eliroth debut cinematográfico cabinfever amparar figura davidlynch ser punto favor observar nombre genio lynch implicar alguno forma proyecto lanzar visionar aunque simplemente ser mero curiosidad ser gran baza cierto ser eliroth necesitar poderoso campaña publicitario acompañamiento film pasar desapercibido quedar absoluto olvido producir controvertir director crear revuelo relativo terrible polémica temático película pregonar continente ser duda forma efectivo lograr interesar propósito ser joven director ser fan grotesco seguidor acérrimo culto cercano día hoy denominar friki perseguidor fama periódico revista mundial querer hacerver mundo cine mundo u forma atraer captar fan alimentar ego

In [18]:
# Display the shuffled rank labels.
y

(2,
 4,
 1,
 4,
 4,
 2,
 3,
 2,
 5,
 5,
 2,
 3,
 4,
 4,
 5,
 2,
 2,
 3,
 3,
 5,
 2,
 2,
 4,
 4,
 2,
 4,
 5,
 4,
 3,
 3,
 5,
 2,
 3,
 4,
 1,
 2,
 3,
 4,
 1,
 2,
 3,
 4,
 4,
 3,
 4,
 5,
 2,
 3,
 1,
 3,
 3,
 3,
 3,
 2,
 5,
 1,
 1,
 3,
 3,
 3,
 1,
 2,
 2,
 4,
 2,
 2,
 1,
 2,
 3,
 3,
 3,
 1,
 4,
 4,
 5,
 1,
 2,
 4,
 3,
 3,
 4,
 5,
 4,
 1,
 3,
 3,
 1,
 2,
 2,
 1,
 3,
 2,
 4,
 2,
 2,
 4,
 3,
 4,
 3,
 5,
 3,
 3,
 1,
 5,
 5,
 5,
 5,
 3,
 4,
 4,
 4,
 2,
 3,
 2,
 3,
 4,
 2,
 4,
 4,
 2,
 5,
 2,
 4,
 2,
 3,
 4,
 5,
 4,
 1,
 2,
 2,
 4,
 1,
 1,
 1,
 4,
 5,
 3,
 2,
 2,
 3,
 3,
 3,
 1,
 5,
 4,
 5,
 2,
 4,
 2,
 2,
 3,
 2,
 3,
 4,
 5,
 1,
 1,
 3,
 5,
 4,
 4,
 2,
 4,
 5,
 3,
 5,
 3,
 3,
 3,
 2,
 3,
 2,
 3,
 4,
 4,
 4,
 4,
 5,
 5,
 3,
 3,
 4,
 1,
 5,
 2,
 3,
 4,
 1,
 3,
 2,
 3,
 3,
 1,
 2,
 2,
 4,
 3,
 2,
 3,
 2,
 2,
 3,
 3,
 2,
 4,
 1,
 1,
 2,
 4,
 3,
 2,
 2,
 2,
 5,
 2,
 1,
 1,
 2,
 3,
 4,
 3,
 4,
 3,
 1,
 2,
 4,
 3,
 2,
 4,
 2,
 3,
 4,
 5,
 4,
 3,
 2,
 2,
 3,
 4,
 4,
 3,
 3,
 2,
 5,
 3,
 5,
 4,
 3,
 4,


In [19]:
# Distinct rank values present in the corpus.
set(y)

{1, 2, 3, 4, 5}

### Extracting dictionary 1

In [45]:
# Build the first sentiment lexicon: lemma text -> float polarity, taken
# from the `pol` attribute of every <lemma> element in the Senticon XML file.
with open('./../Sentiment_dictionaries/Senticon_es/senticon.es.xml', 'r', encoding = 'utf-8') as rfile:
    senticon_soup = bs(rfile.read(), "lxml")

dict1 = {
    # [1:-1] drops the first and last character of the element text
    # (presumably padding whitespace around the lemma -- TODO confirm).
    lemma_tag.get_text()[1:-1]: float(lemma_tag.get('pol'))
    for lemma_tag in senticon_soup.find_all("lemma")
}

### Extracting dictionary 2

In [47]:
# Build the second sentiment lexicon: first field of each line -> third
# field of that line (compared against 'pos' by the analysis further down).
dict2 = {}
with open('./../Sentiment_dictionaries/Spanish_sentiment_lexicon/fullStrengthLexicon.txt', 'r', encoding = 'utf-8') as rfile:
    for raw_line in rfile:
        # Bare newline lines separate nothing useful; skip them.
        if raw_line == '\n':
            continue
        fields = raw_line.split()
        dict2[fields[0]] = fields[2]

### Grouping text by classes

In [55]:
# Group the cleaned reviews by rank: dict_class_reviews[k] lists the reviews
# of class k. Any rank other than 1-4 is filed under class 5.
dict_class_reviews = {rank: [] for rank in range(1, 6)}
for review_text, rank in zip(clean_X, y):
    target_class = rank if rank in (1, 2, 3, 4) else 5
    dict_class_reviews[target_class].append(review_text)

### Analysis with dictionary 1

In [74]:
# For every class (1-5): sum the dictionary-1 polarities of the words of each
# review, then average those per-review sums over the reviews of the class.
# Words missing from dict1 contribute nothing.
# Raises ZeroDivisionError if a class has no reviews.
avg_pols_rev = list()
for rank in range(1, 6):
    class_reviews = dict_class_reviews[rank]
    review_totals = [
        # Explicit membership test instead of a bare `except:`, which would
        # also have hidden unrelated errors.
        sum(dict1[word] for word in review.split() if word in dict1)
        for review in class_reviews
    ]
    avg_pols_rev.append(sum(review_totals) / len(class_reviews))

In [75]:
# Average summed polarity per class, in class order 1..5.
avg_pols_rev

[4.100672364672364,
 5.632869989165757,
 7.114497206703916,
 7.785140449438212,
 8.381260303687636]

In [78]:
# Table rows for dictionary 1: [class number, average polarity].
res_dict1 = [[rank, avg] for rank, avg in enumerate(avg_pols_rev, start = 1)]

In [79]:
# Render the per-class average polarity as a boxed text table.
print(tabulate(res_dict1, headers = ['Classes', 'Average Polarity'],
                   tablefmt = 'fancy_grid', numalign = 'center'))

╒═══════════╤════════════════════╕
│  Classes  │  Average Polarity  │
╞═══════════╪════════════════════╡
│     1     │      4.10067       │
├───────────┼────────────────────┤
│     2     │      5.63287       │
├───────────┼────────────────────┤
│     3     │       7.1145       │
├───────────┼────────────────────┤
│     4     │      7.78514       │
├───────────┼────────────────────┤
│     5     │      8.38126       │
╘═══════════╧════════════════════╛


### Analysis with dictionary 2

In [69]:
# For every class (1-5): label each review 'pos', 'neutral' or 'neg' by
# comparing how many of its words dictionary 2 marks as 'pos' versus anything
# else, then store the share of each label as a percentage string.
# Raises ZeroDivisionError if a class has no reviews.
percentages = list()

for rank in range(1, 6):
    pols_rev = list()
    for review in dict_class_reviews[rank]:
        pos_counter = 0
        neg_counter = 0
        for word in review.split():
            # Words missing from the lexicon carry no polarity. An explicit
            # membership test replaces the former bare `except:`.
            if word not in dict2:
                continue
            if dict2[word] == 'pos':
                pos_counter += 1
            else:
                neg_counter += 1

        if pos_counter > neg_counter:
            pols_rev.append('pos')
        elif pos_counter == neg_counter:
            # Ties, including reviews with no lexicon words at all.
            pols_rev.append('neutral')
        else:
            pols_rev.append('neg')

    n_reviews = len(pols_rev)
    percentages.append([
        str(round(100 * pols_rev.count(label) / n_reviews, 2)) + '%'
        for label in ('pos', 'neutral', 'neg')
    ])

In [70]:
# Per class (1..5): [% positive, % neutral, % negative] reviews.
percentages

[['40.46%', '8.55%', '51.0%'],
 ['56.77%', '7.15%', '36.08%'],
 ['65.2%', '7.74%', '27.06%'],
 ['70.34%', '4.83%', '24.83%'],
 ['67.03%', '5.64%', '27.33%']]

In [80]:
# Table rows for dictionary 2: [class number, % positive, % neutral, % negative].
res_dict2 = [[rank, row[0], row[1], row[2]] for rank, row in enumerate(percentages, start = 1)]

In [81]:
# Render the per-class polarity percentages as a boxed text table.
print(tabulate(res_dict2, headers = ['Classes', '% Positive Polarity', '% Neutral Polarity', '% Negative Polarity'],
                   tablefmt = 'fancy_grid', numalign = 'center'))

╒═══════════╤═══════════════════════╤══════════════════════╤═══════════════════════╕
│  Classes  │ % Positive Polarity   │ % Neutral Polarity   │ % Negative Polarity   │
╞═══════════╪═══════════════════════╪══════════════════════╪═══════════════════════╡
│     1     │ 40.46%                │ 8.55%                │ 51.0%                 │
├───────────┼───────────────────────┼──────────────────────┼───────────────────────┤
│     2     │ 56.77%                │ 7.15%                │ 36.08%                │
├───────────┼───────────────────────┼──────────────────────┼───────────────────────┤
│     3     │ 65.2%                 │ 7.74%                │ 27.06%                │
├───────────┼───────────────────────┼──────────────────────┼───────────────────────┤
│     4     │ 70.34%                │ 4.83%                │ 24.83%                │
├───────────┼───────────────────────┼──────────────────────┼───────────────────────┤
│     5     │ 67.03%                │ 5.64%                │ 27.3