# Preprocessing, Classification and Clustering

On this notebook is developed the preprocessing and the application of classification and clustering algorithms to film reviews.

In [1]:
import numpy as np
import pandas as pd
import random
import re
import nltk
from tabulate import tabulate
from bs4 import BeautifulSoup as bs

In [2]:
def extract_lines(corpus):
    X = list()
    y = list()
    for i in range(2, 4381):
        try:
            fname = corpus + str(i) + '.xml'
            with open(fname, 'r', encoding = 'latin-1') as rfile:
                content = rfile.readlines()
                content = "".join(content)
                bs_content = bs(content, "lxml")
                review = bs_content.find("review")
                rank = review.get("rank")
                y.append(int(rank))
            
            fname = corpus + str(i) + '.review.pos'
            with open(fname, 'r', encoding = 'latin-1') as rfile:
                content = rfile.readlines()
                review_pos_aux = list()
                              
                for line in content:
                    if line != '\n':
                        line_ls = line.split()
                        review_pos_aux.append(line_ls[1])
                X.append(' '.join(review_pos_aux))
            
        except IOError:
            #print("Could not read file:", fname)
            pass
    
    return X, y

In [3]:
def tokenize_lines_by_words(lines):
    new_lines = list()
    for line in lines:
        new_line = line.lower()
        new_lines.append(nltk.word_tokenize(new_line))
    
    return new_lines

In [4]:
def clean_alphabetic_text_lines(lines):
    new_lines = list()
    for line in lines:
        new_line = list()
        for word in line:
            token = list()
            for c in word:
                #[a-záéíóúñü+$]
                if re.match(r'^[a-záéíóúñü+$]', c):
                    token.append(c)
            token = ''.join(token)
            if token != '':
                new_line.append(token)
        new_lines.append(new_line)
        
    return new_lines

In [5]:
def remove_stop_words(lines):
    stopwords = nltk.corpus.stopwords.words('spanish')
    clean_lines = list()
    for line in lines:
        clean_line = list()
        for word in line:
            if word not in stopwords:
                clean_line.append(word)
        clean_lines.append(' '.join(clean_line))
    
    return clean_lines

In [6]:
def join_sentence(lines):
    clean_lines = list()
    for line in lines:
        clean_line = list()
        for word in line:
            clean_line.append(word)
        clean_lines.append(' '.join(clean_line))
    
    return clean_lines

In [7]:
def get_X_y(lines):
    X = list()
    y = list()
    for line in lines:
        n = len(line)
        tag = line.pop(n - 1)
        corpus = line
        X.append(corpus)
        y.append(tag)
    return [X, y]

In [8]:
def transform_tag(y):
    new_y = list()
    for i in y:
        if i == 'spam':
            new_i = 1
        else:
            new_i = 0
        new_y.append(new_i)
    return np.array(new_y)

In [9]:
X, y = extract_lines('./../corpusCriticasCine/corpusCriticasCine/')

In [10]:
len(X)

3878

In [11]:
len(y)

3878

In [12]:
tokenized_X = tokenize_lines_by_words(X)

In [13]:
new_X = clean_alphabetic_text_lines(tokenized_X)

In [14]:
clean_X = remove_stop_words(new_X)
#clean_X = new_X

In [15]:
data = list(zip(clean_X, y))

In [16]:
random.shuffle(data)

In [17]:
clean_X, y = zip(*data)

In [18]:
set(y)

{1, 2, 3, 4, 5}

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt

In [20]:
count_vect = CountVectorizer()

In [21]:
X_counts = count_vect.fit_transform(np.array(clean_X))

In [22]:
y = np.array(y)

In [23]:
X_counts.shape

(3878, 44980)

In [24]:
len(y)

3878

In [25]:
tfidf_transformer = TfidfTransformer()

In [26]:
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size = 0.1)

# Linear SVC

In [28]:
from sklearn.svm import LinearSVC

In [29]:
classifier = LinearSVC()
classifier.fit(X_train, y_train)
LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

LinearSVC(C=1)

In [30]:
y_pred = classifier.predict(X_test)

In [31]:
classifier.score(X_test, y_test)

0.4845360824742268

In [32]:
print(metrics.confusion_matrix(y_test, y_pred))

[[ 9 18  7  0  0]
 [ 6 40 27  5  2]
 [ 2 23 82 36  3]
 [ 0  8 26 42 13]
 [ 0  4  7 13 15]]


In [33]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.53      0.26      0.35        34
           2       0.43      0.50      0.46        80
           3       0.55      0.56      0.56       146
           4       0.44      0.47      0.45        89
           5       0.45      0.38      0.42        39

    accuracy                           0.48       388
   macro avg       0.48      0.44      0.45       388
weighted avg       0.49      0.48      0.48       388



# Logistic Regression

In [34]:
from sklearn.linear_model import LogisticRegression

In [35]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression()

In [36]:
y_pred = classifier.predict(X_test)

In [37]:
classifier.score(X_test, y_test)

0.4639175257731959

In [38]:
print(metrics.confusion_matrix(y_test, y_pred))

[[  3  21  10   0   0]
 [  0  35  42   3   0]
 [  0  19 103  23   1]
 [  0   3  42  39   5]
 [  0   2  16  21   0]]


In [39]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       1.00      0.09      0.16        34
           2       0.44      0.44      0.44        80
           3       0.48      0.71      0.57       146
           4       0.45      0.44      0.45        89
           5       0.00      0.00      0.00        39

    accuracy                           0.46       388
   macro avg       0.47      0.33      0.32       388
weighted avg       0.46      0.46      0.42       388



# Multinomial NB

In [40]:
from sklearn.naive_bayes import MultinomialNB

In [41]:
classifier = MultinomialNB()

In [42]:
classifier.fit(X_train, y_train)

MultinomialNB()

In [43]:
y_pred = classifier.predict(X_test)

In [44]:
classifier.score(X_test, y_test)

0.37628865979381443

In [45]:
print(metrics.confusion_matrix(y_test, y_pred))

[[  0   0  34   0   0]
 [  0   0  80   0   0]
 [  0   0 146   0   0]
 [  0   0  89   0   0]
 [  0   0  39   0   0]]


In [46]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        34
           2       0.00      0.00      0.00        80
           3       0.38      1.00      0.55       146
           4       0.00      0.00      0.00        89
           5       0.00      0.00      0.00        39

    accuracy                           0.38       388
   macro avg       0.08      0.20      0.11       388
weighted avg       0.14      0.38      0.21       388



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SGDClassifier

In [47]:
from sklearn.linear_model import SGDClassifier

In [48]:
classifier = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=10, tol=None)

In [49]:
classifier.fit(X_train, y_train)

SGDClassifier(alpha=0.001, max_iter=10, random_state=42, tol=None)

In [50]:
y_pred = classifier.predict(X_test)

In [51]:
classifier.score(X_test, y_test)

0.46649484536082475

In [52]:
print(metrics.confusion_matrix(y_test, y_pred))

[[10 15  8  1  0]
 [ 7 40 27  4  2]
 [ 2 26 85 29  4]
 [ 1 11 34 29 14]
 [ 1  6  6  9 17]]


In [53]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.48      0.29      0.36        34
           2       0.41      0.50      0.45        80
           3       0.53      0.58      0.56       146
           4       0.40      0.33      0.36        89
           5       0.46      0.44      0.45        39

    accuracy                           0.47       388
   macro avg       0.46      0.43      0.44       388
weighted avg       0.46      0.47      0.46       388



## KNN

In [54]:
from sklearn.neighbors import KNeighborsClassifier

In [55]:
classifier = KNeighborsClassifier(n_neighbors = 5)

In [56]:
classifier.fit(X_train, y_train)

KNeighborsClassifier()

In [57]:
y_pred = classifier.predict(X_test)

In [58]:
classifier.score(X_test, y_test)

0.39690721649484534

In [59]:
print(metrics.confusion_matrix(y_test, y_pred))

[[ 8 17  5  2  2]
 [ 8 55 13  1  3]
 [ 9 59 54 21  3]
 [ 3 29 24 24  9]
 [ 3  8  8  7 13]]


In [60]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.26      0.24      0.25        34
           2       0.33      0.69      0.44        80
           3       0.52      0.37      0.43       146
           4       0.44      0.27      0.33        89
           5       0.43      0.33      0.38        39

    accuracy                           0.40       388
   macro avg       0.39      0.38      0.37       388
weighted avg       0.43      0.40      0.39       388

