# Text Mining

## Load necessary packages

In [1]:
from pathlib import Path
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from bs4 import BeautifulSoup
from unidecode import unidecode
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import string
import collections as ct
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import matplotlib.pyplot as plt
from nltk.stem import RSLPStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import gensim
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from math import ceil
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import recall_score

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Fetch the NLTK resources the notebook needs: the RSLP Portuguese stemmer
# and the stop-word lists read later through `stopwords.words(...)`.
# Without the 'stopwords' package a fresh environment fails in preprocessing.
import nltk

for resource in ('rslp', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\Mafalda\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

## Load corpora

In [4]:
def load_corpus(filename):
    """Return the whole content of a UTF-8 encoded text file as one string.

    :param filename: path of the file to read
    :return: str with the decoded file content
    """
    with Path(filename).open(encoding="utf8") as handle:
        return handle.read()

In [5]:
def load_corpora(filelist,author,base_dir="Corpora/train/"):
    """Load the text files of one author into a DataFrame.

    :param filelist: iterable of file paths relative to ``base_dir``
    :param author: label stored in the 'Author' column of every row
    :param base_dir: folder the entries of ``filelist`` are resolved against;
        the default is the training folder that used to be hard-coded

    :return: DataFrame with one row per file and the columns 'Text'
        (file content) and 'Author'
    """
    texts = [load_corpus(Path(base_dir) / file) for file in filelist]
    df = pd.DataFrame(texts, columns=['Text'])
    # Repeat the label once per row.
    df['Author'] = [author] * len(df)
    return df

In [6]:
# Training files by Almada Negreiros; every name lives in the author's folder.
almada = ['AlmadaNegreiros/' + name for name in (
    'pg22615.txt',
    'pg22730.txt',
    'pg22801.txt',
    'pg22802.txt',
    'pg22969.txt',
    'pg23133.txt',
    'pg23620.txt',
    'pg23879.txt',
    'pg23961.txt',
)]

corpora_almada = load_corpora(almada, 'Almada Negreiros')

In [7]:
# Training files by Camilo Castelo Branco; every name lives in the author's folder.
camilo = ['CamiloCasteloBranco/' + name for name in (
    '24691-0.txt',
    '34756-0.txt',
    'pg16425.txt',
    'pg17927.txt',
    'pg19375.txt',
    'pg21406.txt',
    'pg23203.txt',
    'pg23345.txt',
    'pg23346.txt',
    'pg24339.txt',
    'pg25844.txt',
    'pg26017.txt',
    'pg26103.txt',
    'pg26110.txt',
    'pg26988.txt',
    'pg27364.txt',
    'pg27541.txt',
    'pg28310.txt',
    'pg31694.txt',
    'pg33788.txt',
)]

corpora_camilo = load_corpora(camilo, 'Camilo Castelo Branco')

In [8]:
# Training files by Eca de Queiros; every name lives in the author's folder.
eca = ['EcaDeQueiros/' + name for name in (
    'pg18220.txt',
    'pg25641.txt',
    'pg27637.txt',
    'pg31347.txt',
    'pg40409.txt',
)]

corpora_eca = load_corpora(eca, 'Eca de Queiros')

In [9]:
# Training files by Jose Rodrigues dos Santos.  Each file name is the book
# title followed by the same " - <author>.txt" suffix.
rodrigues_santos = [
    'JoseRodriguesSantos/' + title + ' - Jose Rodrigues dos Santos.txt'
    for title in (
        'A Filha Do Capitao',
        'A Formula De Deus',
        'A Mao do Diabo',
        'A Vida Num Sopro',
        'Furia Divina',
        'O Anjo Branco',
        'O Setimo Selo',
        'O ultimo Segredo',
    )
]

corpora_rodrigues_santos = load_corpora(rodrigues_santos, 'Jose Rodrigues dos Santos')

In [10]:
# Training files by Jose Saramago.  Each file name is the book title
# followed by the same " - <author>.txt" suffix.
saramago = [
    'JoseSaramago/' + title + ' - Jose Saramago.txt'
    for title in (
        'A Caverna',
        'As Intermitencias da Morte',
        'Caim',
        'Claraboia',
        'Ensaio Sobre a Cegueira',
        'Historia Do Cerco De Lisboa',
        'Memorial Do Convento',
        'O Ano Da Morte De Ricardo Reis',
        'O Conto Da Ilha Desconhecida',
        'O Homem Duplicado',
        'Terra Do Pecado',
        'Viagem Do Elefante',
    )
]

corpora_saramago = load_corpora(saramago, 'Jose Saramago')

In [11]:
# Training files by Luisa Marques Silva; every name lives in the author's folder.
luisa = ['LuisaMarquesSilva/' + name for name in (
    'ABelaHistoria.txt',
    'acabouSe.txt',
    'Botão.txt',
    'controlz.txt',
    'emedo.txt',
    'Lisboa2050.txt',
    'passeioInferno.txt',
    'rapsodiasemdo.txt',
    'UltimaHistoria.txt',
)]

corpora_luisa = load_corpora(luisa, 'Luisa Marques Silva')

In [12]:
# Stack the six per-author frames into one training frame; rows are
# renumbered 0..n-1 so later cells can address them by position.
author_frames = [
    corpora_almada,
    corpora_camilo,
    corpora_eca,
    corpora_rodrigues_santos,
    corpora_saramago,
    corpora_luisa,
]
corpora = pd.concat(author_frames, ignore_index=True)
corpora.head()

Unnamed: 0,Text,Author
0,Title: A Scena do Odio\n\nAuthor: José de Alma...,Almada Negreiros
1,Title: O Jardim da Pierrette\n\nAuthor: José d...,Almada Negreiros
2,\n\nTitle: A Invenção do Dia Claro\n\nAuthor: ...,Almada Negreiros
3,\nTitle: Litoral\n A Amadeo de Souza Car...,Almada Negreiros
4,\n\n\nEXPOSIÇÃO\n\n+amadeo\nde souza\ncardoso+...,Almada Negreiros


## Preprocessing

In [13]:
def preprocessing(dataframe,accents=False,punctuation=False,lowercase=False,tags=False,stemming=False,stemmer='snowball'):
    """Clean every document in the 'Text' column of a DataFrame.

    Each optional step runs only when its flag is True, in this order:
    accent removal, punctuation removal, lowercasing, markup removal.  The
    text is then split on whitespace and, when ``stemming`` is True, stop
    words are dropped and the remaining words are stemmed.  The words are
    finally re-joined with single spaces.

    :param dataframe: DataFrame with a 'Text' column of strings
    :param accents: transliterate the text with ``unidecode``
    :param punctuation: replace every character outside a-z / A-Z by a space
    :param lowercase: convert the text to lowercase
    :param tags: strip markup with BeautifulSoup
    :param stemming: drop Portuguese stop words and stem the other words
    :param stemmer: 'snowball' or 'rslp'; only read when ``stemming`` is True

    :return: list with one cleaned string per row, in row order
    :raises ValueError: if ``stemming`` is True and ``stemmer`` is unknown
    """
    # Only the Portuguese list is loaded.  The second positional argument of
    # `words` is not a language (it is `ignore_lines_startswith`), so the
    # former "english" value never added English stop words.
    stop_words_pt = set(stopwords.words("portuguese"))

    # Build the stemmer once, not once per document, and reject unknown
    # names up front instead of failing later with a NameError.
    stemmer_pt = None
    if stemming:
        if stemmer == 'snowball':
            stemmer_pt = SnowballStemmer('portuguese')
        elif stemmer == 'rslp':
            stemmer_pt = RSLPStemmer()
        else:
            raise ValueError("stemmer must be 'snowball' or 'rslp', got %r" % (stemmer,))

    processed_corpus = []

    # Iterate over the values so the function also works when the frame's
    # index is not 0..n-1 (for example after filtering rows).
    for text in tqdm(dataframe['Text'], total=len(dataframe)):
        # Remove accents
        if accents:
            text = unidecode(text)

        # Remove punctuation (and every other non-letter character)
        if punctuation:
            text = re.sub('[^a-zA-Z]', ' ', text)

        # Convert to lowercase
        if lowercase:
            text = text.lower()

        # Remove tags; the parser is named explicitly so the result does not
        # depend on which optional parsers happen to be installed.
        if tags:
            text = BeautifulSoup(text, 'html.parser').get_text()

        # Convert to list from string
        words = text.split()

        # Stop-word removal and stemming
        if stemming:
            # NOTE(review): the stop-word test compares each word exactly as
            # written, so a capitalised stop word passes unless
            # lowercase=True — confirm this is intended.
            words = [stemmer_pt.stem(word) for word in words if word not in stop_words_pt]

        processed_corpus.append(" ".join(words))
    return processed_corpus

In [14]:
# Clean the training texts: stop words are dropped and the remaining words
# are stemmed with the RSLP stemmer; the accent, punctuation, lowercase and
# tag options keep their default of False.
cleaned_corpora = preprocessing(corpora,stemming=True,stemmer='rslp')

HBox(children=(IntProgress(value=0, max=63), HTML(value='')))




In [15]:
# Attach the cleaned texts to the frame, row for row.
corpora['Clean Text'] = cleaned_corpora

In [16]:
# Tokenise each cleaned text on whitespace.  One vectorised assignment is
# used instead of writing `corpora['Word List'][i] = ...` row by row: that
# form is a chained assignment, which pandas does not guarantee to write
# back to the frame.
corpora['Word List'] = corpora['Clean Text'].str.split()

HBox(children=(IntProgress(value=0, max=63), HTML(value='')))




In [17]:
# Inspect the frame with the new 'Clean Text' and 'Word List' columns.
corpora

Unnamed: 0,Text,Author,Clean Text,Word List
0,Title: A Scena do Odio\n\nAuthor: José de Alma...,Almada Negreiros,title: a scen odi author: josé alm negr releas...,"[title:, a, scen, odi, author:, josé, alm, neg..."
1,Title: O Jardim da Pierrette\n\nAuthor: José d...,Almada Negreiros,title: o jardim pierrett author: josé alm negr...,"[title:, o, jardim, pierrett, author:, josé, a..."
2,\n\nTitle: A Invenção do Dia Claro\n\nAuthor: ...,Almada Negreiros,title: a invenç dia clar author: josé alm negr...,"[title:, a, invenç, dia, clar, author:, josé, ..."
3,\nTitle: Litoral\n A Amadeo de Souza Car...,Almada Negreiros,title: litor a amade souz cardoz author: josé ...,"[title:, litor, a, amade, souz, cardoz, author..."
4,\n\n\nEXPOSIÇÃO\n\n+amadeo\nde souza\ncardoso+...,Almada Negreiros,expos +amade souz cardoso+ lig naval de lisbo ...,"[expos, +amade, souz, cardoso+, lig, naval, de..."
5,\n\n*JOSÉ DE ALMADA-NEGREIROS*\n\n\n*K4\n\no q...,Almada Negreiros,*josé de almada-negreiros* *k4 quadr azul* aca...,"[*josé, de, almada-negreiros*, *k4, quadr, azu..."
6,"\n\n*""ORPHEU""*\n\nREVISTA TRIMESTRAL DE LITERA...",Almada Negreiros,"*""orpheu""* revist trimestr de literat portug e...","[*""orpheu""*, revist, trimestr, de, literat, po..."
7,\n\n+a ENGOMADEIRA+\n\nNOVELA VULGAR LISBOETA\...,Almada Negreiros,+a engomadeira+ novel vulg lisboet +engomadeir...,"[+a, engomadeira+, novel, vulg, lisboet, +engo..."
8,\n+MANIFESTO+\n\n+ANTI-DANTAS+\n\nE\n\nPOR EXT...,Almada Negreiros,+manifesto+ +anti-dantas+ e por extens por jos...,"[+manifesto+, +anti-dantas+, e, por, extens, p..."
9,O vinho do Porto\n\nPROCESSO D'UMA BESTIALIDAD...,Camilo Castelo Branco,o vinh port process d'um bestial ingl expos a ...,"[o, vinh, port, process, d'um, bestial, ingl, ..."


In [18]:
# Split every document into consecutive chunks of at most `chunk_size`
# words; each chunk becomes one row labelled with the document's author.
# The rows are collected in a plain list and turned into a DataFrame once:
# growing a frame with `DataFrame.append` inside the loop copies it on every
# iteration, and that method was removed in pandas 2.0.
chunk_size = 500
chunk_rows = []

for author, words in tqdm(zip(corpora['Author'], corpora['Word List']), total=len(corpora)):
    for start in range(0, len(words), chunk_size):
        chunk = words[start:start + chunk_size]
        chunk_rows.append({
            'Author': author,
            '500 Word List': chunk,
            '500 Clean Text': " ".join(chunk),
        })

corpora_500 = pd.DataFrame(chunk_rows, columns=['Author','500 Word List','500 Clean Text'])

corpora_500

HBox(children=(IntProgress(value=0, max=63), HTML(value='')))




Unnamed: 0,Author,500 Word List,500 Clean Text
0,Almada Negreiros,"[title:, a, scen, odi, author:, josé, alm, neg...",title: a scen odi author: josé alm negr releas...
1,Almada Negreiros,"[branc, par, vict, torneios-lot, donzellas-glo...",branc par vict torneios-lot donzellas-glorias!...
2,Almada Negreiros,"[lua, enxovalh, vir, lavadeira!, larg, cidad, ...",lua enxovalh vir lavadeira! larg cidad foge! l...
3,Almada Negreiros,"[title:, o, jardim, pierrett, author:, josé, a...",title: o jardim pierrett author: josé alm negr...
4,Almada Negreiros,"[title:, a, invenç, dia, clar, author:, josé, ...",title: a invenç dia clar author: josé alm negr...
5,Almada Negreiros,"[compunh, palavras;, arranc, arvor, prens, ond...",compunh palavras; arranc arvor prens ond apert...
6,Almada Negreiros,"[est, hom, sós, raça--, phenicio!, cad, vint, ...",est hom sós raça-- phenicio! cad vint doi sign...
7,Almada Negreiros,"[ajuda-, ir, á, loucura., vae, tamb, pessoalme...","ajuda- ir á loucura. vae tamb pessoalmente, co..."
8,Almada Negreiros,"[cant, succed, ás, tangerin, rol, pró, mar:, t...",cant succed ás tangerin rol pró mar: tam tam-t...
9,Almada Negreiros,"[eu, ia, tão, contente!, ia, pens, ti, verb, s...",eu ia tão contente! ia pens ti verb sab verb g...


In [19]:
# From here on `corpora` refers to the chunk-level frame (one row per chunk
# of up to 500 words) instead of the document-level frame.  This binds a
# second name to the same object -- no copy is made -- so columns added
# through `corpora` also appear on `corpora_500`.
corpora = corpora_500

In [20]:
def _token_count(chunk_text):
    """Number of pieces obtained when the text is split on single spaces."""
    return len(str(chunk_text).split(" "))


# Store the size of every chunk next to its text.
word_count = corpora['500 Clean Text'].apply(_token_count)
corpora['word_count_clean'] = word_count

corpora

Unnamed: 0,Author,500 Word List,500 Clean Text,word_count_clean
0,Almada Negreiros,"[title:, a, scen, odi, author:, josé, alm, neg...",title: a scen odi author: josé alm negr releas...,500
1,Almada Negreiros,"[branc, par, vict, torneios-lot, donzellas-glo...",branc par vict torneios-lot donzellas-glorias!...,500
2,Almada Negreiros,"[lua, enxovalh, vir, lavadeira!, larg, cidad, ...",lua enxovalh vir lavadeira! larg cidad foge! l...,201
3,Almada Negreiros,"[title:, o, jardim, pierrett, author:, josé, a...",title: o jardim pierrett author: josé alm negr...,205
4,Almada Negreiros,"[title:, a, invenç, dia, clar, author:, josé, ...",title: a invenç dia clar author: josé alm negr...,500
5,Almada Negreiros,"[compunh, palavras;, arranc, arvor, prens, ond...",compunh palavras; arranc arvor prens ond apert...,500
6,Almada Negreiros,"[est, hom, sós, raça--, phenicio!, cad, vint, ...",est hom sós raça-- phenicio! cad vint doi sign...,500
7,Almada Negreiros,"[ajuda-, ir, á, loucura., vae, tamb, pessoalme...","ajuda- ir á loucura. vae tamb pessoalmente, co...",500
8,Almada Negreiros,"[cant, succed, ás, tangerin, rol, pró, mar:, t...",cant succed ás tangerin rol pró mar: tam tam-t...,500
9,Almada Negreiros,"[eu, ia, tão, contente!, ia, pens, ti, verb, s...",eu ia tão contente! ia pens ti verb sab verb g...,500


In [21]:
# Total number of words per author.  The numeric column is selected
# explicitly: summing every column relies on pandas silently dropping the
# text and list columns, which newer pandas versions no longer do.
corpora.groupby(['Author'])[['word_count_clean']].sum()

Unnamed: 0_level_0,word_count_clean
Author,Unnamed: 1_level_1
Almada Negreiros,29835
Camilo Castelo Branco,468084
Eca de Queiros,282941
Jose Rodrigues dos Santos,708762
Jose Saramago,586119
Luisa Marques Silva,26224


## Data exploration

In [22]:
# Summary statistics of the chunk sizes.
corpora['word_count_clean'].describe()

count    4237.000000
mean      496.097475
std        35.968519
min         5.000000
25%       500.000000
50%       500.000000
75%       500.000000
max       500.000000
Name: word_count_clean, dtype: float64

In [23]:
# Flatten every chunk into one long list of tokens.
all_words = [
    token
    for chunk_text in corpora['500 Clean Text']
    for token in chunk_text.split()
]

In [24]:
# Occurrences of every distinct token over the whole corpus, most frequent first.
freq = pd.Series(all_words).value_counts()

In [25]:
# The 20 most frequent tokens.
freq.head(20)

o       14669
tod     10142
a        9624
olh      7803
outr     7728
ser      7339
pod      7238
e        6698
est      6580
pass     6053
aind     5967
sab      5701
ell      5667
á        5523
não      5490
faz      5375
ond      5052
sobr     5020
cas      5016
hav      4978
dtype: int64

In [26]:
# Portuguese stop words from NLTK, kept both as a set and as a list.
# The words are used as they come (no accent stripping is applied).
stopWords = set(stopwords.words("portuguese"))
stop_words = list(stopWords)

In [27]:
# How many of the 20 most frequent tokens are Portuguese stop words?
count = sum(1 for word in freq.index[:20] if word in stop_words)
count

4

Only a few of the most common words are stop words.

In [28]:
def get_top_n_grams(corpus, top_k, n, max_features=2000):
    """
    Function that receives a list of documents (corpus) and extracts
        the top k most frequent n-grams for that corpus.
        
    :param corpus: list of texts
    :param top_k: int with the number of n-grams that we want to extract
    :param n: n gram type to be considered 
             (if n=1 extracts unigrams, if n=2 extracts bigrams, ...)
    :param max_features: size limit of the vocabulary built by the
             CountVectorizer (default 2000, the previously fixed value)
             
    :return: Returns a sorted dataframe in which the first column 
        ("Ngram") contains the extracted ngrams and the second column
        ("Freq") contains the respective counts; the frame is empty, with
        both columns present, when no n-gram is selected
    """
    vec = CountVectorizer(ngram_range=(n, n), max_features=max_features)

    # Learn the vocabulary and count in a single pass over the corpus.
    bag_of_words = vec.fit_transform(corpus)

    # 1 x vocabulary matrix holding the corpus-wide count of every n-gram.
    sum_words = bag_of_words.sum(axis=0)

    words_freq = [
        (word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()
    ]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)

    # Naming the columns in the constructor also works for an empty result,
    # where assigning `.columns` afterwards raises a length-mismatch error.
    return pd.DataFrame(words_freq[:top_k], columns=["Ngram", "Freq"])

In [29]:
def plot_frequencies(top_df):
    """
    Draw a bar chart of n-gram frequencies.

    :param top_df: dataframe with the columns "Ngram" and "Freq", as
        returned by "get_top_n_grams"; at most the first 30 rows are drawn
    """
    labels = top_df["Ngram"].head(30)
    heights = top_df["Freq"].head(30)
    positions = np.arange(len(labels))

    plt.bar(positions, heights, align='center', alpha=0.5)
    # One tick per bar, labelled with the n-gram and written vertically.
    plt.xticks(positions, labels, rotation=90)
    plt.title('Words')
    plt.ylabel('Frequencies')
    plt.show()

In [30]:
#for i in range(1,6):
#    top = get_top_n_grams(corpora["500 Clean Text"], top_k=20, n=i)
#    plot_frequencies(top)

From these graphics we can see that up to 2-grams we are catching actual expressions, but from 3-grams onwards we are only finding fragments of sentences that are frequent without being relevant expressions. This matters because, when we vectorize the texts into a Bag-of-Words or TF-IDF model, we will want to capture the 2-grams but not the 3-grams.

## Tests

In [31]:
# 5-fold stratified cross-validation splitter; the rows are shuffled with a
# fixed seed so the folds are the same on every run.
kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

In [32]:
# Classifier under evaluation (default hyper-parameters).  The commented
# lines below are the alternative models; uncomment exactly one to switch.
classifier = KNeighborsClassifier()
#classifier = MultinomialNB()
#classifier = SVC()
#classifier = DecisionTreeClassifier()
#classifier = LogisticRegression()

In [33]:
# Two candidate text representations are configured under their own names
# and the one to use is selected explicitly at the end.  Assigning both to
# `vectorizer` in turn silently discards the first configuration.
count_vectorizer = CountVectorizer(
    #max_df=0.7,
    #strip_accents='unicode',
    lowercase=False,
    #stop_words=stop_words,
    max_features=50000,
    ngram_range=(1,3)
)
tfidf_vectorizer = TfidfVectorizer(
    #strip_accents='unicode',
    lowercase=False,
    #stop_words=stop_words,
    max_features=50000
)

# TF-IDF is the representation used by the following cells.
vectorizer = tfidf_vectorizer

In [34]:
# Cross-validate the chosen vectorizer / classifier pair.  For every fold the
# vectorizer is fitted on the training rows only, so no vocabulary or
# weighting information leaks from the held-out rows.
texts = corpora['500 Clean Text']
y = corpora['Author']
n_folds = kf.get_n_splits()

accuracy = 0
recall = 0

for (train, test) in kf.split(texts, y):
    # `train` and `test` hold row positions, hence the `.iloc` look-ups.
    X_train = vectorizer.fit_transform(texts.iloc[train])
    X_test = vectorizer.transform(texts.iloc[test])
    y_train = y.iloc[train]
    y_test = y.iloc[test]

    classifier.fit(X_train, y_train)
    prediction = classifier.predict(X_test)

    accuracy += accuracy_score(y_test, prediction)*100
    # average=None keeps one recall value per author.
    recall += recall_score(y_test, prediction, average=None)*100
    print(classification_report(y_true = y_test, y_pred = prediction))
    print(confusion_matrix(y_true = y_test, y_pred = prediction))

# Mean over the folds; the divisor follows the splitter's configuration.
print(accuracy/n_folds)
print(recall/n_folds)

                           precision    recall  f1-score   support

         Almada Negreiros       1.00      0.77      0.87        13
    Camilo Castelo Branco       0.97      1.00      0.99       189
           Eca de Queiros       1.00      0.99      1.00       114
Jose Rodrigues dos Santos       0.99      0.99      0.99       285
            Jose Saramago       0.99      0.99      0.99       235
      Luisa Marques Silva       1.00      1.00      1.00        12

                 accuracy                           0.99       848
                macro avg       0.99      0.96      0.97       848
             weighted avg       0.99      0.99      0.99       848

[[ 10   2   0   0   1   0]
 [  0 189   0   0   0   0]
 [  0   1 113   0   0   0]
 [  0   2   0 282   1   0]
 [  0   0   0   2 233   0]
 [  0   0   0   0   0  12]]
                           precision    recall  f1-score   support

         Almada Negreiros       1.00      0.92      0.96        13
    Camilo Castelo Branco    

## Train the chosen model

In [35]:
# Final model: the vectorizer and the classifier are refitted on every
# chunk, with no rows held out.
X = corpora['500 Clean Text']
y = corpora['Author']

# fit_transform learns the vocabulary from the full training set; the fitted
# `vectorizer` and `classifier` are reused when the new texts are classified.
X_cv = vectorizer.fit_transform(X)
classifier.fit(X_cv,y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

## Load new text

In [36]:
def load_new_corpora(filelist,base_dir="Corpora/test/"):
    """Load unlabelled text files into a DataFrame.

    :param filelist: iterable of file paths relative to ``base_dir``
    :param base_dir: folder the entries of ``filelist`` are resolved against;
        the default is the test folder that used to be hard-coded

    :return: DataFrame with one row per file and a single 'Text' column
        holding the file content
    """
    texts = [load_corpus(Path(base_dir) / file) for file in filelist]
    return pd.DataFrame(texts, columns=['Text'])

In [37]:
# Texts to classify: text1..text6 from the folder 500Palavras, followed by
# text1..text6 from the folder 1000Palavras.
files = [
    folder + '/text' + str(number) + '.txt'
    for folder in ('500Palavras', '1000Palavras')
    for number in range(1, 7)
]

new_corpora = load_new_corpora(files)

In [38]:
# Preview the texts loaded from the test folder.
new_corpora

Unnamed: 0,Text
0,"Depois, pouco a pouco, a tranquilidade regress..."
1,Justamente como se eu tivesse tido a ideia de ...
2,"Quase um mês depois, a época de exames aproxim..."
3,"Agora, porém, era sem fervor, arrastadamente, ..."
4,"O cahos de cima a descer, a descer com a morta..."
5,"""O Senhor ensina pela pena o que o homem não s..."
6,"Depois, pouco a pouco, a tranquilidade regress..."
7,Justamente como se eu tivesse tido a ideia de ...
8,"Quase um mês depois, a época de exames aproxim..."
9,"Agora, porém, era sem fervor, arrastadamente, ..."


In [39]:
# Clean the new texts with the same options used for the training texts so
# both go through identical preprocessing.  Note that this reuses the name
# `cleaned_corpora`, replacing the cleaned training texts held there.
cleaned_corpora = preprocessing(new_corpora,stemming=True,stemmer='rslp')

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




In [40]:
# Attach the cleaned texts to the frame, row for row.
new_corpora['Clean Text'] = cleaned_corpora

## Classify the new text

In [41]:
# `transform` (not `fit_transform`) is used so the new texts are encoded
# with the vectorizer state fitted on the training chunks.
new_cv = vectorizer.transform(new_corpora['Clean Text'])
# One predicted author per new text, in row order.
prediction = classifier.predict(new_cv)

In [42]:
# Predicted author of each new text, in the order of the `files` list.
prediction

array(['Jose Saramago', 'Almada Negreiros', 'Jose Rodrigues dos Santos',
       'Eca de Queiros', 'Camilo Castelo Branco',
       'Jose Rodrigues dos Santos', 'Jose Saramago', 'Almada Negreiros',
       'Jose Saramago', 'Eca de Queiros', 'Camilo Castelo Branco',
       'Jose Rodrigues dos Santos'], dtype=object)