# Preprocessing, Classification and Clustering

On this notebook is developed the preprocessing and the application of classification and clustering algorithms to film reviews.

In [1]:
import numpy as np
import random
import re
import nltk
from tabulate import tabulate
from bs4 import BeautifulSoup as bs

In [2]:
def extract_lines(corpus):
    X = list()
    y = list()
    for i in range(2, 4381):
        try:
            fname = corpus + str(i) + '.xml'
            with open(fname, 'r', encoding = 'latin-1') as rfile:
                content = rfile.readlines()
                content = "".join(content)
                bs_content = bs(content, "lxml")
                review = bs_content.find("review")
                rank = review.get("rank")
                y.append(int(rank))
            
            fname = corpus + str(i) + '.review.pos'
            with open(fname, 'r', encoding = 'latin-1') as rfile:
                content = rfile.readlines()
                review_pos_aux = list()
                              
                for line in content:
                    if line != '\n':
                        line_ls = line.split()
                        review_pos_aux.append(line_ls[1])
                X.append(' '.join(review_pos_aux))
            
        except IOError:
            #print("Could not read file:", fname)
            pass
    
    return X, y

In [3]:
def tokenize_lines_by_words(lines):
    new_lines = list()
    for line in lines:
        new_line = line.lower()
        new_lines.append(nltk.word_tokenize(new_line))
    
    return new_lines

In [4]:
def clean_alphabetic_text_lines(lines):
    new_lines = list()
    for line in lines:
        new_line = list()
        for word in line:
            token = list()
            for c in word:
                #[a-záéíóúñü+$]
                if re.match(r'^[a-záéíóúñü+$]', c):
                    token.append(c)
            token = ''.join(token)
            if token != '':
                new_line.append(token)
        new_lines.append(new_line)
        
    return new_lines

In [5]:
def remove_stop_words(lines):
    stopwords = nltk.corpus.stopwords.words('spanish')
    clean_lines = list()
    for line in lines:
        clean_line = list()
        for word in line:
            if word not in stopwords:
                clean_line.append(word)
        clean_lines.append(' '.join(clean_line))
    
    return clean_lines

In [6]:
def get_X_y(lines):
    X = list()
    y = list()
    for line in lines:
        n = len(line)
        tag = line.pop(n - 1)
        corpus = line
        X.append(corpus)
        y.append(tag)
    return [X, y]

In [7]:
def transform_tag(y):
    new_y = list()
    for i in y:
        if i == 'spam':
            new_i = 1
        else:
            new_i = 0
        new_y.append(new_i)
    return np.array(new_y)

In [8]:
X, y = extract_lines('./../corpusCriticasCine/corpusCriticasCine/')

In [9]:
len(X)

3878

In [10]:
len(y)

3878

In [11]:
tokenized_X = tokenize_lines_by_words(X)

In [12]:
new_X = clean_alphabetic_text_lines(tokenized_X)

In [13]:
clean_X = remove_stop_words(new_X)

In [14]:
data = list(zip(clean_X, y))

In [15]:
random.shuffle(data)

In [16]:
clean_X, y = zip(*data)

In [17]:
clean_X

('haber determinar actor director guionista tener asociar mentalmente tipo cine programa parecer inconcebible poder hacer alejar aquel encasillar luispiedrahita rodrigosopeña siempre haber mover humor ser colaborador principal programa cómico elclubdelacomedia x aquí elhormiguero embargo incursión forma conjunto cine dejardelado comedia lanzar género totalmente oponer cine suspense habitacióndefermat ser película atraer interesante planteamiento inicial matemático encerrar habitación salida ir encoger pocoapoco suponer tipo suspense distinto soler ofrecer cine embargo minuto descubrir principal fallo film ser exceptuar santimillán haber crear alejosauras elenaballesteros incluo lluíshomar ser brillante matemático defecto nosólo hallar elección actor interpretación sinoque deber personaje mal construir peor desarrollado atmósfera lograr aportar inquietud agobio justo cinta transmitir sensación ahogo atrapar completo espectador ay atmósfera sólo mantener medio hora mismo durar originalid

In [18]:
y

(2,
 4,
 3,
 2,
 4,
 1,
 2,
 2,
 3,
 3,
 2,
 1,
 4,
 1,
 4,
 2,
 3,
 3,
 3,
 3,
 2,
 4,
 5,
 4,
 4,
 3,
 4,
 3,
 3,
 1,
 1,
 4,
 2,
 4,
 1,
 5,
 2,
 1,
 2,
 5,
 4,
 4,
 3,
 5,
 3,
 1,
 2,
 4,
 2,
 1,
 2,
 4,
 3,
 3,
 5,
 1,
 3,
 3,
 5,
 2,
 4,
 2,
 3,
 3,
 2,
 2,
 3,
 2,
 2,
 3,
 4,
 2,
 5,
 4,
 3,
 4,
 4,
 3,
 3,
 3,
 2,
 4,
 3,
 5,
 4,
 3,
 2,
 4,
 3,
 4,
 3,
 4,
 2,
 2,
 1,
 2,
 1,
 3,
 3,
 3,
 3,
 3,
 2,
 4,
 1,
 1,
 4,
 3,
 4,
 3,
 2,
 4,
 4,
 4,
 4,
 1,
 3,
 1,
 3,
 2,
 1,
 3,
 3,
 3,
 2,
 5,
 3,
 4,
 3,
 3,
 1,
 2,
 5,
 1,
 3,
 3,
 3,
 4,
 3,
 3,
 2,
 4,
 4,
 1,
 4,
 2,
 2,
 2,
 2,
 3,
 2,
 1,
 4,
 3,
 4,
 2,
 4,
 3,
 4,
 4,
 3,
 4,
 3,
 4,
 3,
 2,
 1,
 2,
 3,
 3,
 1,
 1,
 2,
 5,
 3,
 3,
 2,
 2,
 4,
 4,
 4,
 4,
 3,
 2,
 2,
 4,
 4,
 3,
 3,
 1,
 1,
 4,
 3,
 3,
 3,
 3,
 4,
 5,
 2,
 3,
 5,
 4,
 2,
 4,
 2,
 3,
 3,
 3,
 4,
 4,
 4,
 2,
 4,
 4,
 3,
 2,
 5,
 2,
 3,
 4,
 5,
 3,
 5,
 3,
 4,
 1,
 2,
 3,
 4,
 3,
 4,
 3,
 5,
 5,
 3,
 3,
 2,
 3,
 3,
 1,
 1,
 3,
 4,
 2,
 1,
 3,
 3,
 3,
 2,
 4,


In [19]:
set(y)

{1, 2, 3, 4, 5}

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans
from sklearn import metrics

In [21]:
count_vect = CountVectorizer()

In [22]:
X_counts = count_vect.fit_transform(np.array(clean_X))

In [23]:
y = np.array(y)

In [24]:
X_counts.shape

(3878, 44980)

In [25]:
len(y)

3878

In [26]:
tfidf_transformer = TfidfTransformer()

In [27]:
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size = 0.2, random_state = 14)

# Logistic Regression

### Prediction of training data

In [29]:
classifier = LogisticRegression()

In [30]:
classifier.fit(X_train, y_train)

LogisticRegression()

In [31]:
y_pred = classifier.predict(X_train)

In [32]:
classifier.score(X_train, y_train)

0.8971631205673759

In [33]:
print(metrics.confusion_matrix(y_train, y_pred))

[[151  76  47   4   0]
 [  0 724  24   2   0]
 [  0   6 989   3   0]
 [  0   4  50 641   0]
 [  0   8  63  32 278]]


In [34]:
print(metrics.classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           1       1.00      0.54      0.70       278
           2       0.89      0.97      0.92       750
           3       0.84      0.99      0.91       998
           4       0.94      0.92      0.93       695
           5       1.00      0.73      0.84       381

    accuracy                           0.90      3102
   macro avg       0.93      0.83      0.86      3102
weighted avg       0.91      0.90      0.89      3102



### Prediction of test data

In [35]:
classifier = LogisticRegression()

In [36]:
classifier.fit(X_train, y_train)

LogisticRegression()

In [37]:
y_pred = classifier.predict(X_test)

In [38]:
classifier.score(X_test, y_test)

0.4213917525773196

In [39]:
print(metrics.confusion_matrix(y_test, y_pred))

[[  3  52  16   1   1]
 [  0  65 100   8   0]
 [  0  34 192  29   0]
 [  0  11 122  60   2]
 [  0   6  28  39   7]]


In [40]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       1.00      0.04      0.08        73
           2       0.39      0.38      0.38       173
           3       0.42      0.75      0.54       255
           4       0.44      0.31      0.36       195
           5       0.70      0.09      0.16        80

    accuracy                           0.42       776
   macro avg       0.59      0.31      0.30       776
weighted avg       0.50      0.42      0.38       776



## Multinomial NB

### Prediction of training data

In [41]:
classifier = MultinomialNB()

In [42]:
classifier.fit(X_train, y_train)

MultinomialNB()

In [43]:
y_pred = classifier.predict(X_train)

In [44]:
classifier.score(X_train, y_train)

0.33268858800773693

In [45]:
print(metrics.confusion_matrix(y_train, y_pred))

[[  1   0 277   0   0]
 [  0  29 721   0   0]
 [  0   0 998   0   0]
 [  0   0 691   4   0]
 [  0   0 381   0   0]]


In [46]:
print(metrics.classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           1       1.00      0.00      0.01       278
           2       1.00      0.04      0.07       750
           3       0.33      1.00      0.49       998
           4       1.00      0.01      0.01       695
           5       0.00      0.00      0.00       381

    accuracy                           0.33      3102
   macro avg       0.67      0.21      0.12      3102
weighted avg       0.66      0.33      0.18      3102



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Prediction of test data

In [47]:
classifier = MultinomialNB()

In [48]:
classifier.fit(X_train, y_train)

MultinomialNB()

In [49]:
y_pred = classifier.predict(X_test)

In [50]:
classifier.score(X_test, y_test)

0.32989690721649484

In [51]:
print(metrics.confusion_matrix(y_test, y_pred))

[[  0   0  73   0   0]
 [  0   1 172   0   0]
 [  0   0 255   0   0]
 [  0   0 195   0   0]
 [  0   0  80   0   0]]


In [52]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        73
           2       1.00      0.01      0.01       173
           3       0.33      1.00      0.50       255
           4       0.00      0.00      0.00       195
           5       0.00      0.00      0.00        80

    accuracy                           0.33       776
   macro avg       0.27      0.20      0.10       776
weighted avg       0.33      0.33      0.17       776



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## K-means

In [53]:
classifier = KMeans(n_clusters = 5)

In [54]:
classifier.fit(X_tfidf)

KMeans(n_clusters=5)

In [55]:
y_pred = classifier.predict(X_test)

In [56]:
y_pred

array([4, 1, 3, 2, 4, 2, 1, 2, 0, 2, 3, 0, 1, 3, 3, 3, 3, 0, 2, 2, 3, 3,
       3, 2, 3, 3, 2, 3, 3, 3, 1, 3, 3, 3, 3, 1, 2, 1, 3, 3, 2, 2, 1, 2,
       2, 1, 1, 2, 2, 1, 3, 3, 4, 2, 1, 3, 2, 2, 3, 0, 2, 3, 3, 1, 3, 2,
       0, 3, 3, 0, 3, 1, 2, 2, 2, 3, 2, 0, 3, 1, 1, 1, 3, 3, 1, 1, 3, 4,
       3, 3, 3, 3, 3, 4, 3, 2, 1, 2, 3, 2, 2, 1, 2, 1, 1, 3, 3, 0, 2, 3,
       3, 3, 2, 4, 1, 2, 3, 1, 1, 3, 2, 0, 1, 2, 2, 3, 4, 2, 2, 3, 2, 3,
       3, 3, 3, 3, 3, 3, 2, 1, 1, 2, 0, 3, 0, 0, 2, 3, 1, 3, 2, 4, 3, 1,
       3, 3, 1, 2, 3, 0, 1, 1, 2, 2, 1, 3, 2, 4, 3, 1, 2, 3, 3, 3, 1, 0,
       3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 2, 2, 3, 0, 2, 3, 0, 4, 3, 2,
       2, 3, 3, 2, 2, 1, 0, 3, 3, 0, 2, 0, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3,
       2, 3, 3, 3, 3, 2, 2, 3, 0, 0, 3, 2, 2, 2, 1, 2, 1, 2, 2, 2, 4, 3,
       3, 2, 0, 3, 2, 1, 1, 3, 2, 2, 1, 3, 1, 2, 3, 3, 1, 2, 2, 2, 3, 3,
       2, 3, 2, 3, 1, 2, 2, 2, 1, 0, 3, 3, 0, 4, 3, 4, 1, 0, 3, 1, 2, 3,
       2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 1, 3, 2,

In [57]:
classifier.score(X_test)

-715.8488977882561

In [58]:
print(metrics.confusion_matrix(y_test, y_pred))

[[  0   0   0   0   0   0]
 [  7  19  26  15   6   0]
 [ 10  32  58  66   7   0]
 [ 17  49  68 110  11   0]
 [ 11  30  51  92  11   0]
 [  7   9  29  30   5   0]]


In [59]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.14      0.26      0.18        73
           2       0.25      0.34      0.29       173
           3       0.35      0.43      0.39       255
           4       0.28      0.06      0.09       195
           5       0.00      0.00      0.00        80

    accuracy                           0.26       776
   macro avg       0.17      0.18      0.16       776
weighted avg       0.25      0.26      0.23       776



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
