# Preprocessing, Classification and Clustering

On this notebook is developed the preprocessing and the application of classification and clustering algorithms to film reviews.

In [1]:
import numpy as np
import random
import re
import nltk
from tabulate import tabulate
from bs4 import BeautifulSoup as bs

In [2]:
def extract_lines(corpus):
    X = list()
    y = list()
    for i in range(2, 4381):
        try:
            fname = corpus + str(i) + '.xml'
            with open(fname, 'r', encoding = 'latin-1') as rfile:
                content = rfile.readlines()
                content = "".join(content)
                bs_content = bs(content, "lxml")
                review = bs_content.find("review")
                rank = review.get("rank")
                y.append(int(rank))
            
            fname = corpus + str(i) + '.review.pos'
            with open(fname, 'r', encoding = 'latin-1') as rfile:
                content = rfile.readlines()
                review_pos_aux = list()
                              
                for line in content:
                    if line != '\n':
                        line_ls = line.split()
                        review_pos_aux.append(line_ls[1])
                X.append(' '.join(review_pos_aux))
            
        except IOError:
            #print("Could not read file:", fname)
            pass
    
    return X, y

In [3]:
def tokenize_lines_by_words(lines):
    new_lines = list()
    for line in lines:
        new_line = line.lower()
        new_lines.append(nltk.word_tokenize(new_line))
    
    return new_lines

In [4]:
def clean_alphabetic_text_lines(lines):
    new_lines = list()
    for line in lines:
        new_line = list()
        for word in line:
            token = list()
            for c in word:
                #[a-záéíóúñü+$]
                if re.match(r'^[a-záéíóúñü+$]', c):
                    token.append(c)
            token = ''.join(token)
            if token != '':
                new_line.append(token)
        new_lines.append(new_line)
        
    return new_lines

In [5]:
def remove_stop_words(lines):
    stopwords = nltk.corpus.stopwords.words('spanish')
    clean_lines = list()
    for line in lines:
        clean_line = list()
        for word in line:
            if word not in stopwords:
                clean_line.append(word)
        clean_lines.append(' '.join(clean_line))
    
    return clean_lines

In [6]:
def get_X_y(lines):
    X = list()
    y = list()
    for line in lines:
        n = len(line)
        tag = line.pop(n - 1)
        corpus = line
        X.append(corpus)
        y.append(tag)
    return [X, y]

In [7]:
def transform_tag(y):
    new_y = list()
    for i in y:
        if i == 'spam':
            new_i = 1
        else:
            new_i = 0
        new_y.append(new_i)
    return np.array(new_y)

In [8]:
X, y = extract_lines('./../corpusCriticasCine/corpusCriticasCine/')

In [9]:
len(X)

3878

In [10]:
len(y)

3878

In [11]:
tokenized_X = tokenize_lines_by_words(X)

In [12]:
new_X = clean_alphabetic_text_lines(tokenized_X)

In [13]:
clean_X = remove_stop_words(new_X)

In [14]:
data = list(zip(clean_X, y))

In [15]:
random.shuffle(data)

In [16]:
clean_X, y = zip(*data)

In [17]:
clean_X

('tener zarigüellas moda último película haber ver animación aparecer simpático animal vida ver libertad cierto familia raro patriarca tortugo tortuga hijo mofeta buscar amor mapache tratar ser adoptar ardilla espídica subfamilia zarigüellas puercoespín juguetón compartir tarea familiar cierto malo leche humano querer exterminar verdad fin película tratar extraño familia animal levantar hibernación descubrir encontrar rodear lujoso urbanización haber frondoso bosque haber quedar comida dejar aconsejar mapache obtener alimento humano peligro conllevar patriarca familia tortuga ver claro plan hacer tambalear unidadfamiliar película definición gustar si además ser bueno pues gustar caso encontrar película unidad familia diversidad igualdad progreso convivencia humano resto ser tierra trabajo equipo incluso amor ser saber película tener atufar moralina lado mayor poder decir ver valor inculcar disfrutar película animación infantil trama ser bastante pueril ser adecuar dadoque orientar niño

In [18]:
y

(4,
 3,
 2,
 4,
 4,
 2,
 4,
 3,
 5,
 2,
 3,
 2,
 3,
 1,
 4,
 4,
 4,
 4,
 1,
 3,
 3,
 4,
 4,
 2,
 2,
 2,
 3,
 2,
 3,
 2,
 3,
 2,
 3,
 4,
 1,
 4,
 2,
 2,
 2,
 1,
 4,
 4,
 1,
 2,
 5,
 2,
 5,
 5,
 4,
 4,
 3,
 1,
 3,
 2,
 1,
 4,
 2,
 3,
 4,
 4,
 3,
 2,
 2,
 2,
 3,
 4,
 4,
 3,
 3,
 3,
 4,
 3,
 3,
 2,
 2,
 2,
 1,
 2,
 4,
 3,
 5,
 3,
 5,
 5,
 2,
 4,
 2,
 3,
 4,
 2,
 3,
 2,
 4,
 3,
 5,
 4,
 5,
 2,
 3,
 3,
 4,
 3,
 4,
 3,
 5,
 3,
 3,
 3,
 4,
 4,
 2,
 5,
 2,
 3,
 4,
 3,
 3,
 3,
 4,
 2,
 2,
 3,
 4,
 1,
 3,
 3,
 3,
 4,
 3,
 2,
 4,
 4,
 3,
 5,
 3,
 4,
 1,
 5,
 3,
 1,
 4,
 2,
 3,
 1,
 2,
 3,
 3,
 2,
 5,
 3,
 3,
 3,
 5,
 2,
 3,
 4,
 4,
 4,
 3,
 4,
 3,
 4,
 3,
 2,
 5,
 5,
 1,
 3,
 3,
 4,
 4,
 2,
 4,
 5,
 1,
 3,
 3,
 2,
 2,
 3,
 3,
 2,
 3,
 4,
 2,
 2,
 4,
 4,
 4,
 1,
 4,
 2,
 4,
 2,
 3,
 5,
 5,
 4,
 3,
 4,
 3,
 3,
 1,
 3,
 2,
 4,
 3,
 2,
 4,
 3,
 2,
 1,
 1,
 3,
 2,
 3,
 3,
 4,
 2,
 2,
 3,
 3,
 3,
 3,
 4,
 5,
 1,
 3,
 3,
 1,
 2,
 3,
 1,
 4,
 1,
 2,
 5,
 4,
 3,
 3,
 3,
 4,
 5,
 3,
 5,
 4,
 1,
 1,
 2,
 3,


In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [20]:
count_vect = CountVectorizer()

In [21]:
X_counts = count_vect.fit_transform(np.array(clean_X))

In [22]:
y = np.array(y)

In [23]:
X_counts.shape

(3878, 44980)

In [24]:
len(y)

3878

In [25]:
tfidf_transformer = TfidfTransformer()

In [26]:
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size = 0.2, random_state = 14)

In [28]:
classifier = LogisticRegression()

In [29]:
classifier.fit(X_train, y_train)

LogisticRegression()

In [30]:
y_pred = classifier.predict(X_test)

In [31]:
classifier.score(X_test, y_test)

0.42010309278350516

In [32]:
print(metrics.confusion_matrix(y_test, y_pred))

[[  3  37  16   2   0]
 [  0  76 104  12   0]
 [  0  21 181  34   0]
 [  0  11 125  61   3]
 [  0   7  48  30   5]]


In [33]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       1.00      0.05      0.10        58
           2       0.50      0.40      0.44       192
           3       0.38      0.77      0.51       236
           4       0.44      0.30      0.36       200
           5       0.62      0.06      0.10        90

    accuracy                           0.42       776
   macro avg       0.59      0.32      0.30       776
weighted avg       0.50      0.42      0.38       776

