Sebastian Ruiz Uvalle

6BV1

Ing. en Inteligencia Artificial

19 de abril de 2023

Este programa se encarga de la categorización de muestras de 5 libros diferentes obtenidos de Project Gutenberg (www.gutenberg.org), siendo el dataset de 2166 observaciones.

In [1]:
# Descarga de los titulos disponibles para descargar del Proyecto Gutenberg

import numpy as np
import nltk

nltk.download('gutenberg')
books_names=nltk.corpus.gutenberg.fileids()
books_names

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [2]:
# Selección de los libros que vamos a usar para la practica

books_idx=[1,3,5,7,9]
selected_books=[]
for idx in books_idx:
  selected_books.append(books_names[idx])
print(selected_books)

['austen-persuasion.txt', 'bible-kjv.txt', 'bryant-stories.txt', 'carroll-alice.txt', 'chesterton-brown.txt']


In [3]:
# Obtención del corpus de los libros seleccionados y su visualización

book_contents=[]
for book_name in selected_books:
    book_contents.append(nltk.corpus.gutenberg.raw(book_name))
book_contents[0][1:500]

'Persuasion by Jane Austen 1818]\n\n\nChapter 1\n\n\nSir Walter Elliot, of Kellynch Hall, in Somersetshire, was a man who,\nfor his own amusement, never took up any book but the Baronetage;\nthere he found occupation for an idle hour, and consolation in a\ndistressed one; there his faculties were roused into admiration and\nrespect, by contemplating the limited remnant of the earliest patents;\nthere any unwelcome sensations, arising from domestic affairs\nchanged naturally into pity and contempt as he turn'

In [4]:
# Descarga de las bibliotecas necesarias para poder remover palabras irrelevantes, sustituir palabras
# flexionadas por un termino general y sustituir terminos por su raiz gramatical
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [20]:
import re

def get_wordnet_pos(word):
    # Mapeamos la etiqueta POS al primer caracter lemmatize() acepte
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


def clean_text(text):
    # Definicion de patron de palabras para la mantencion de estas y pasado de texto a minusculas
    lemmatizer = WordNetLemmatizer()
    sub_pattern = r'[^A-Za-z]'
    split_pattern = r"\s+"
    stop_words = stopwords.words('english') + ['chapter','never','ever','couldnot','wouldnot','could','would','us',"i'm","you'd"]
    lower_book = text.lower()

    # Reemplazando todos los caracteres, excepto los que esten en los patrones definidos en sub_patten
    # a espacios, tokenizado de los documentos y lematizacion 
    filtered_book = re.sub(sub_pattern,' ',lower_book).lstrip().rstrip()
    filtered_book = word_tokenize(filtered_book)
    filtered_book = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in filtered_book if word not in stop_words]

    return filtered_book

In [21]:
# Obtencion del corpus de los 5 libros ya limpiados en cleaned_boos_contents
cleaned_books_contents=[]
for book in book_contents :
    cleaned_books_contents.append(clean_text(book))
cleaned_books_contents[0][1:30]

['jane',
 'austen',
 'sir',
 'walter',
 'elliot',
 'kellynch',
 'hall',
 'somersetshire',
 'man',
 'amusement',
 'take',
 'book',
 'baronetage',
 'found',
 'occupation',
 'idle',
 'hour',
 'consolation',
 'distressed',
 'one',
 'faculty',
 'rouse',
 'admiration',
 'respect',
 'contemplate',
 'limited',
 'remnant',
 'early',
 'patent']

In [22]:
# Vista de cuantas palabras tiene cada documento
for i in range(len(cleaned_books_contents)):
    size = len(cleaned_books_contents[i])
    print(size)

37186
372131
21357
11984
34882


In [23]:
# Creacion de 500 muestras aproximadamente aleatorias de cada libro donde cada muestra contiene 50 palabras
def book_samples(book,n_samples) :
    import random
    samples=[]
    start=0
    while start +n_samples < len(book)-1:
        temp1=""
        for j in range(start,start+n_samples):
            temp1+= book[j] + " "
        samples.append(temp1)
        start+=n_samples
    random_samples_index=random.sample(range(0,len(samples)), k=min(500,len(samples)))
    partitions=[]
    for idx in random_samples_index :
        partitions.append(samples[idx])
    return partitions

In [24]:
samples_of_books=[]
for cleaned_book in cleaned_books_contents :
    samples_of_books.append(book_samples(cleaned_book,50))
samples_of_books[0][0]

'leave much time see new place deduct seven hour nature country require go return consequently stay night expect back till next day dinner felt considerable amendment though met great house rather early breakfast hour set punctually much past noon two carriage mr musgrove coach contain four lady charles curricle drove '

In [29]:
import pandas as pd
data_frame =pd.DataFrame()
data_frame['Sample']=[item for sublist in samples_of_books for item in sublist]
target=[[selected_books[i]]*min(500,len(samples_of_books[i])) for i in range(len(selected_books)) ]
data_frame['Book Name']=[item for sublist in target for item in sublist]
data_frame['Book Name'].unique()

array(['austen-persuasion.txt', 'bible-kjv.txt', 'bryant-stories.txt',
       'carroll-alice.txt', 'chesterton-brown.txt'], dtype=object)

In [30]:
from sklearn.utils import shuffle

data_frame = shuffle(data_frame)
data_frame

Unnamed: 0,Sample,Book Name
323,give information captain wentworth visit stay ...,austen-persuasion.txt
1537,felt little nervous might end know say alice g...,carroll-alice.txt
1739,large fortune sure meant contempt arthur may s...,chesterton-brown.txt
1469,go executioner king queen talk rest quite sile...,carroll-alice.txt
1500,remark variation shall sit say day day say ali...,carroll-alice.txt
...,...,...
1755,dash first negro cry black flambeau fanshaw bo...,chesterton-brown.txt
1105,funny sight little jack rollaround lie trundle...,bryant-stories.txt
1876,saint science large dare cosmic theory adverti...,chesterton-brown.txt
2108,want interview champion interviewed nearly eve...,chesterton-brown.txt


In [31]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data_frame['Book Name'])
data_frame['Book Name'] = y

In [32]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(data_frame,test_size=0.2,random_state = 0)
X_train, X_test, y_train, y_test = train_test_split(data_frame['Sample'], data_frame['Book Name'], random_state = 0, test_size=0.2)

df_train

Unnamed: 0,Sample,Book Name
114,walk anne always thought style intercourse hig...,0
1745,duke give away hear time nearly lose estate pr...,4
1134,path row fir tree small none little little fir...,2
314,pride delicacy others small knowledge afterwar...,0
53,sensation comfort perhaps likely knew first ut...,0
...,...,...
1278,jackal friend brahmin stood roadside say brahm...,2
208,suddenly rouse little reverie come find recoll...,0
1838,rule throughout terrible tidiness unlike terri...,4
89,fate conclusion whole woman sensible captain w...,0


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_ngram(n_gram,X_train=X_train,X_test=X_test):
    vectorizer = TfidfVectorizer(ngram_range=(1,n_gram))
    x_train_vec = vectorizer.fit_transform(X_train)
    x_test_vec = vectorizer.transform(X_test)
    return x_train_vec,x_test_vec

X_train1g_cv, X_test1g_cv = tfidf_ngram(1,X_train=X_train,X_test=X_test)
X_train2g_cv, X_test2g_cv = tfidf_ngram(2,X_train=X_train,X_test=X_test)

print(X_train1g_cv)
print(X_test1g_cv)
print(X_train2g_cv)
print(X_test2g_cv)

  (0, 7329)	0.11360633767641247
  (0, 3025)	0.13140976764301973
  (0, 3584)	0.16499126869683506
  (0, 3107)	0.20813378033841853
  (0, 5705)	0.20813378033841853
  (0, 3412)	0.16781555737439977
  (0, 1578)	0.15795834543420023
  (0, 200)	0.11741198535147748
  (0, 6014)	0.16010396622575146
  (0, 3326)	0.07576426928060367
  (0, 3409)	0.1786844346461899
  (0, 1914)	0.1132150029942142
  (0, 5914)	0.11787021787136046
  (0, 3020)	0.1492350889539613
  (0, 6905)	0.16243638881610037
  (0, 1168)	0.1835717371172735
  (0, 7101)	0.10794199675149765
  (0, 5529)	0.1835717371172735
  (0, 7309)	0.15795834543420023
  (0, 2865)	0.1541223914250449
  (0, 5360)	0.08968256150522701
  (0, 3836)	0.11741198535147748
  (0, 3528)	0.10436256096443605
  (0, 3196)	0.12239030981034939
  (0, 7036)	0.11360633767641247
  :	:
  (1731, 7426)	0.09731536780593303
  (1731, 8531)	0.6367277744119468
  (1731, 280)	0.07711789346749164
  (1731, 8601)	0.0751966187963927
  (1731, 8540)	0.16640139635941123
  (1731, 4303)	0.095211755317

In [37]:
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

text_embedding = {
    'TF_IDF 1_gram':(X_train1g_cv,X_test1g_cv),
    'TF_IDF 2_gram':(X_train2g_cv,X_test2g_cv)
}

models = [
          LinearRegression(),
          BernoulliNB(),
          GaussianNB(),
          KNeighborsClassifier()
          ]

results_dict={'Model Name':[],'Embedding type':[],'Testing Accuracy':[],'Cross Validation':[]}

for model in models:
    for embedding_vector in text_embedding.keys():
        train = text_embedding[embedding_vector][0].toarray()
        test = text_embedding[embedding_vector][1].toarray()
        model.fit(train, y_train)

        results_dict['Model Name'].append(type(model).__name__)
        results_dict['Embedding type'].append(embedding_vector)

        test_acc = model.score(test, y_test)
        results_dict['Testing Accuracy'].append(test_acc)

        score = cross_val_score(model,test,y_test, scoring='r2')
        results_dict['Cross Validation'].append(score.mean())

results_df=pd.DataFrame(results_dict)

results_df

Unnamed: 0,Model Name,Embedding type,Testing Accuracy,Cross Validation
0,LinearRegression,TF_IDF 1_gram,0.837651,0.716048
1,LinearRegression,TF_IDF 2_gram,0.82697,0.691193
2,BernoulliNB,TF_IDF 1_gram,0.983871,-0.099754
3,BernoulliNB,TF_IDF 2_gram,0.824885,-1.488358
4,GaussianNB,TF_IDF 1_gram,0.940092,0.784628
5,GaussianNB,TF_IDF 2_gram,0.970046,0.821021
6,KNeighborsClassifier,TF_IDF 1_gram,0.9447,0.66144
7,KNeighborsClassifier,TF_IDF 2_gram,0.940092,0.723574


In [42]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
train_data = tokenizer.texts_to_sequences(X_train)
train_data = tokenizer.sequences_to_matrix(train_data, mode='binary')

tokenizer.fit_on_texts(X_test)
test_data = tokenizer.texts_to_sequences(X_test)
test_data = tokenizer.sequences_to_matrix(test_data, mode='binary')

train_data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [43]:
test_data

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 1.]])