# **AS02: Representação Textual**
Esta é a tarefa AS02: Representação Textual, uma atividade prática que estimula o aluno a absorver conceitos básicos de mineração e análise de texto.

### **Libraries**

In [1]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import numpy as np
import spacy
import sklearn
import nltk
import unicodedata
import re
import scipy.sparse as sp
import nltk.tokenize as tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import defaultdict

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
stopwords = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### **Get and Save Data**

In [2]:
# Get Data
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_train

# Get categories from text
categories = list(newsgroups_train.target_names)

df = pd.DataFrame()
df['text'] = categories
df.to_csv("20_newsgroups_as_table.csv", index=False)

df

Unnamed: 0,text
0,alt.atheism
1,comp.graphics
2,comp.os.ms-windows.misc
3,comp.sys.ibm.pc.hardware
4,comp.sys.mac.hardware
5,comp.windows.x
6,misc.forsale
7,rec.autos
8,rec.motorcycles
9,rec.sport.baseball


### **Load Data**

In [3]:
df = pd.read_csv("20_newsgroups_as_table.csv")
df

Unnamed: 0,text
0,alt.atheism
1,comp.graphics
2,comp.os.ms-windows.misc
3,comp.sys.ibm.pc.hardware
4,comp.sys.mac.hardware
5,comp.windows.x
6,misc.forsale
7,rec.autos
8,rec.motorcycles
9,rec.sport.baseball


### **Preprocessing Text**

In [4]:
# Normalization
def normalization(text):
    # Lower case
    lower_text = text.lower()
    
    # Replace . by space
    lower_text = lower_text.replace('.', ' ')
    
    # Remove accents
    nfkd_form = unicodedata.normalize('NFKD', lower_text)
    ascii_text =  "".join([c for c in nfkd_form if not unicodedata.combining(c)])
    
    # Acronym normalization
    text_acronym = re.sub('\.(?!(\S[^. ])|\d)', '', ascii_text)
    
    # Remove \n
    text_acronym = text_acronym.replace("\n", " ")
    
    # Ponctuation remove
    text_pontuation = re.sub(r"(?<!\$\d)(?<!\£\d)(?<![0-9])([.,;?!()\[\]{}<>/\\|@#%^&*+=:\"'])", '', text_acronym)
    text_pontuation
    
    # Special character remove
    text_special_char = re.sub('(?<!\d)[.,:!?\'\(\)#:-](?!\d)', '', text_pontuation) # special character removal
    text_special_char = re.sub(' +', ' ', text_special_char) # extra spaces removal
    
    return text_special_char

# StopWords remove
def stop_words_function(text):
    return [word for word in text if word not in stopwords]

# Lemmatization
def lemmatization(text):
    return [lemmatizer.lemmatize(word) for word in text]

# Stemming
def steamming(text):
    return [stemmer.stem(word) for word in text]

In [5]:
# Apply all functions to preprocess text
df['text_normalized'] = df['text'].apply(normalization)
df['text_tokenized'] = df['text_normalized'].apply(tokenizer.word_tokenize)
df['text_stopwords'] = df['text_tokenized'].apply(stop_words_function)
df['text_lemmatized'] = df['text_stopwords'].apply(stop_words_function)
df['text_stemmed'] = df['text_lemmatized'].apply(stop_words_function)
df['text_to_string'] = df['text_stemmed'].apply(' '.join)
df

Unnamed: 0,text,text_normalized,text_tokenized,text_stopwords,text_lemmatized,text_stemmed,text_to_string
0,alt.atheism,alt atheism,"[alt, atheism]","[alt, atheism]","[alt, atheism]","[alt, atheism]",alt atheism
1,comp.graphics,comp graphics,"[comp, graphics]","[comp, graphics]","[comp, graphics]","[comp, graphics]",comp graphics
2,comp.os.ms-windows.misc,comp os mswindows misc,"[comp, os, mswindows, misc]","[comp, os, mswindows, misc]","[comp, os, mswindows, misc]","[comp, os, mswindows, misc]",comp os mswindows misc
3,comp.sys.ibm.pc.hardware,comp sys ibm pc hardware,"[comp, sys, ibm, pc, hardware]","[comp, sys, ibm, pc, hardware]","[comp, sys, ibm, pc, hardware]","[comp, sys, ibm, pc, hardware]",comp sys ibm pc hardware
4,comp.sys.mac.hardware,comp sys mac hardware,"[comp, sys, mac, hardware]","[comp, sys, mac, hardware]","[comp, sys, mac, hardware]","[comp, sys, mac, hardware]",comp sys mac hardware
5,comp.windows.x,comp windows x,"[comp, windows, x]","[comp, windows, x]","[comp, windows, x]","[comp, windows, x]",comp windows x
6,misc.forsale,misc forsale,"[misc, forsale]","[misc, forsale]","[misc, forsale]","[misc, forsale]",misc forsale
7,rec.autos,rec autos,"[rec, autos]","[rec, autos]","[rec, autos]","[rec, autos]",rec autos
8,rec.motorcycles,rec motorcycles,"[rec, motorcycles]","[rec, motorcycles]","[rec, motorcycles]","[rec, motorcycles]",rec motorcycles
9,rec.sport.baseball,rec sport baseball,"[rec, sport, baseball]","[rec, sport, baseball]","[rec, sport, baseball]","[rec, sport, baseball]",rec sport baseball


### **One-Hot Encoding**

In [6]:
text_list = df['text_to_string'].tolist()

def get_tokens(text):
    tokens = text.split()
    return [w.lower() for w in tokens if w.isalpha()]

def optimized_tokenize(texts):
    unique_words = set()
    
    for text in texts:
        tokens = get_tokens(text)
        unique_words.update(tokens)
    
    return sorted(list(unique_words))

# Aplicando a função otimizada
V_optimized = optimized_tokenize(text_list)
print(f"V_optimized has {len(V_optimized)} words: {V_optimized}")

V_optimized has 33 words: ['alt', 'atheism', 'autos', 'baseball', 'christian', 'comp', 'crypt', 'electronics', 'forsale', 'graphics', 'guns', 'hardware', 'hockey', 'ibm', 'mac', 'med', 'mideast', 'misc', 'motorcycles', 'mswindows', 'os', 'pc', 'politics', 'rec', 'religion', 'sci', 'soc', 'space', 'sport', 'sys', 'talk', 'windows', 'x']


In [7]:
# Criando um dicionário para mapear palavras a índices
word_to_index = {word: i for i, word in enumerate(V_optimized)}

# Inicializando uma lista para conter todos os vetores One-Hot Encoding
one_hot_vectors = []

for text in text_list:
    words = get_tokens(text)
    
    # Inicializando o vetor One-Hot como um array de zeros
    bag_vector = np.zeros(len(V_optimized))
    
    for w in words:
        if w in word_to_index:
            bag_vector[word_to_index[w]] = 1
                
    print(f"{text} = {np.array(bag_vector)}")
    one_hot_vectors.append(bag_vector)
    
# Salvando os vetores One-Hot em um arquivo
with open('20News_01.txt', 'w') as f:
    for vector in one_hot_vectors:
        vector_str = ' '.join(map(str, vector.astype(int)))
        f.write(vector_str + "\n")

alt atheism = [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]
comp graphics = [0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]
comp os mswindows misc = [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]
comp sys ibm pc hardware = [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0.]
comp sys mac hardware = [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0.]
comp windows x = [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 1.]
misc forsale = [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]
rec autos = [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]
rec motorcycles = [0. 0. 0. 0. 0. 

### **Count Vectors**

In [8]:
text_list = df['text_to_string'].tolist()

def pre_process_corpus(corpus):
    new_corpus = [doc.lower() for doc in corpus]
    regex = r"(?<!\d)[\!\?.,;:-](?!\d)"
    return [re.sub(regex, "", doc, 0) for doc in new_corpus]

corpus = pre_process_corpus(text_list)
print(corpus)

['alt atheism', 'comp graphics', 'comp os mswindows misc', 'comp sys ibm pc hardware', 'comp sys mac hardware', 'comp windows x', 'misc forsale', 'rec autos', 'rec motorcycles', 'rec sport baseball', 'rec sport hockey', 'sci crypt', 'sci electronics', 'sci med', 'sci space', 'soc religion christian', 'talk politics guns', 'talk politics mideast', 'talk politics misc', 'talk religion misc']


In [9]:
vectorizer = CountVectorizer()
doc_term_matriz = vectorizer.fit_transform(corpus)
terms = vectorizer.get_feature_names_out()

count_df = pd.DataFrame(doc_term_matriz.A, columns=terms)
print(count_df.to_string())

# Salvando os vetores One-Hot em um arquivo
with open('20News_02.txt', 'w') as f:
    df_string = count_df.to_string(header=False, index=False)
    f.write(df_string)

    alt  atheism  autos  baseball  christian  comp  crypt  electronics  forsale  graphics  guns  hardware  hockey  ibm  mac  med  mideast  misc  motorcycles  mswindows  os  pc  politics  rec  religion  sci  soc  space  sport  sys  talk  windows
0     1        1      0         0          0     0      0            0        0         0     0         0       0    0    0    0        0     0            0          0   0   0         0    0         0    0    0      0      0    0     0        0
1     0        0      0         0          0     1      0            0        0         1     0         0       0    0    0    0        0     0            0          0   0   0         0    0         0    0    0      0      0    0     0        0
2     0        0      0         0          0     1      0            0        0         0     0         0       0    0    0    0        0     1            0          1   1   0         0    0         0    0    0      0      0    0     0        0
3     0        0    

### **TF-IDF**

In [10]:
text_list = df['text_to_string'].tolist()

corpus = pre_process_corpus(text_list)

vectorizer = CountVectorizer()
doc_term_matriz = vectorizer.fit_transform(corpus)
terms = vectorizer.get_feature_names_out()

transformer = TfidfTransformer()
tf_idf_matrix = transformer.fit_transform(doc_term_matriz)

tfidf_df = pd.DataFrame(tf_idf_matrix.A, columns=terms)

print(tfidf_df.to_string())

# Salvando os vetores One-Hot em um arquivo
with open('20News_03.txt', 'w') as f:
    df_string = tfidf_df.to_string(header=False, index=False)
    f.write(df_string)

         alt   atheism     autos  baseball  christian      comp     crypt  electronics   forsale  graphics      guns  hardware    hockey       ibm       mac       med   mideast      misc  motorcycles  mswindows        os        pc  politics       rec  religion       sci       soc     space     sport       sys      talk   windows
0   0.707107  0.707107  0.000000  0.000000   0.000000  0.000000  0.000000     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000     0.000000   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
1   0.000000  0.000000  0.000000  0.000000   0.000000  0.557870  0.000000     0.000000  0.000000  0.829928  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000     0.000000   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
2   0.0

### **N-Grams (2 grams)**

In [11]:
text_list = df['text_to_string'].tolist()

def pre_process_corpus(corpus):
    new_corpus = [doc.lower() for doc in corpus]
    regex = r"(?<!\d)[\!\?.,;:-](?!\d)"
    return [re.sub(regex, "", doc, 0) for doc in new_corpus]

corpus = pre_process_corpus(text_list)

vectorizer = CountVectorizer(ngram_range=(2, 2))
doc_term_matriz = vectorizer.fit_transform(corpus)
terms = vectorizer.get_feature_names_out()

ngram_df = pd.DataFrame(doc_term_matriz.A, columns=terms)
print(ngram_df.to_string())

# Salvando os vetores One-Hot em um arquivo
with open('20News_04.txt', 'w') as f:
    df_string = ngram_df.to_string(header=False, index=False)
    f.write(df_string)

    alt atheism  comp graphics  comp os  comp sys  comp windows  ibm pc  mac hardware  misc forsale  mswindows misc  os mswindows  pc hardware  politics guns  politics mideast  politics misc  rec autos  rec motorcycles  rec sport  religion christian  religion misc  sci crypt  sci electronics  sci med  sci space  soc religion  sport baseball  sport hockey  sys ibm  sys mac  talk politics  talk religion
0             1              0        0         0             0       0             0             0               0             0            0              0                 0              0          0                0          0                   0              0          0                0        0          0             0               0             0        0        0              0              0
1             0              1        0         0             0       0             0             0               0             0            0              0                 0              0

### **Co-occurrence Vectors (Context Window = 1)**

In [12]:
text_list = df['text_to_string'].tolist()

corpus = pre_process_corpus(text_list)

def co_occurrence(sentences, window_size):
    d = defaultdict(int)
    vocab = set()
    
    for text in sentences:
        text = text.lower().split() # preprocessing, use tokenizer instead
        
        for i in range(len(text)):
            token = text[i]
            vocab.add(token) # add to vocab
            next_token = text[i + 1: i + 1 + window_size]
            
            for t in next_token:
                key = tuple(sorted([t, token]))
                d[key] += 1
    
    # formulate the dictionary into dataframe
    vocab = sorted(vocab)
    df = pd.DataFrame(data=np.zeros((len(vocab), len(vocab)), dtype=np.int16), index=vocab, columns=vocab)
    
    for key, value in d.items():
        df.at[key[0], key[1]] = value
        df.at[key[1], key[0]] = value
        
    return df

df_cooccurrence = co_occurrence(corpus, 3)
print(df_cooccurrence.to_string())


# Salvando os vetores One-Hot em um arquivo
with open('20News_05.txt', 'w') as f:
    df_string = df_cooccurrence.to_string(header=False, index=False)
    f.write(df_string)

             alt  atheism  autos  baseball  christian  comp  crypt  electronics  forsale  graphics  guns  hardware  hockey  ibm  mac  med  mideast  misc  motorcycles  mswindows  os  pc  politics  rec  religion  sci  soc  space  sport  sys  talk  windows  x
alt            0        1      0         0          0     0      0            0        0         0     0         0       0    0    0    0        0     0            0          0   0   0         0    0         0    0    0      0      0    0     0        0  0
atheism        1        0      0         0          0     0      0            0        0         0     0         0       0    0    0    0        0     0            0          0   0   0         0    0         0    0    0      0      0    0     0        0  0
autos          0        0      0         0          0     0      0            0        0         0     0         0       0    0    0    0        0     0            0          0   0   0         0    1         0    0    0      0   

In [17]:
df['text_to_string']

0                  alt atheism
1                comp graphics
2       comp os mswindows misc
3     comp sys ibm pc hardware
4        comp sys mac hardware
5               comp windows x
6                 misc forsale
7                    rec autos
8              rec motorcycles
9           rec sport baseball
10            rec sport hockey
11                   sci crypt
12             sci electronics
13                     sci med
14                   sci space
15      soc religion christian
16          talk politics guns
17       talk politics mideast
18          talk politics misc
19          talk religion misc
Name: text_to_string, dtype: object

In [19]:
corpus

['alt atheism',
 'comp graphics',
 'comp os mswindows misc',
 'comp sys ibm pc hardware',
 'comp sys mac hardware',
 'comp windows x',
 'misc forsale',
 'rec autos',
 'rec motorcycles',
 'rec sport baseball',
 'rec sport hockey',
 'sci crypt',
 'sci electronics',
 'sci med',
 'sci space',
 'soc religion christian',
 'talk politics guns',
 'talk politics mideast',
 'talk politics misc',
 'talk religion misc']

### **Word2Vec**

In [24]:
corpus

['alt atheism',
 'comp graphics',
 'comp os mswindows misc',
 'comp sys ibm pc hardware',
 'comp sys mac hardware',
 'comp windows x',
 'misc forsale',
 'rec autos',
 'rec motorcycles',
 'rec sport baseball',
 'rec sport hockey',
 'sci crypt',
 'sci electronics',
 'sci med',
 'sci space',
 'soc religion christian',
 'talk politics guns',
 'talk politics mideast',
 'talk politics misc',
 'talk religion misc']

In [25]:
df

Unnamed: 0,text,text_normalized,text_tokenized,text_stopwords,text_lemmatized,text_stemmed,text_to_string
0,alt.atheism,alt atheism,"[alt, atheism]","[alt, atheism]","[alt, atheism]","[alt, atheism]",alt atheism
1,comp.graphics,comp graphics,"[comp, graphics]","[comp, graphics]","[comp, graphics]","[comp, graphics]",comp graphics
2,comp.os.ms-windows.misc,comp os mswindows misc,"[comp, os, mswindows, misc]","[comp, os, mswindows, misc]","[comp, os, mswindows, misc]","[comp, os, mswindows, misc]",comp os mswindows misc
3,comp.sys.ibm.pc.hardware,comp sys ibm pc hardware,"[comp, sys, ibm, pc, hardware]","[comp, sys, ibm, pc, hardware]","[comp, sys, ibm, pc, hardware]","[comp, sys, ibm, pc, hardware]",comp sys ibm pc hardware
4,comp.sys.mac.hardware,comp sys mac hardware,"[comp, sys, mac, hardware]","[comp, sys, mac, hardware]","[comp, sys, mac, hardware]","[comp, sys, mac, hardware]",comp sys mac hardware
5,comp.windows.x,comp windows x,"[comp, windows, x]","[comp, windows, x]","[comp, windows, x]","[comp, windows, x]",comp windows x
6,misc.forsale,misc forsale,"[misc, forsale]","[misc, forsale]","[misc, forsale]","[misc, forsale]",misc forsale
7,rec.autos,rec autos,"[rec, autos]","[rec, autos]","[rec, autos]","[rec, autos]",rec autos
8,rec.motorcycles,rec motorcycles,"[rec, motorcycles]","[rec, motorcycles]","[rec, motorcycles]","[rec, motorcycles]",rec motorcycles
9,rec.sport.baseball,rec sport baseball,"[rec, sport, baseball]","[rec, sport, baseball]","[rec, sport, baseball]","[rec, sport, baseball]",rec sport baseball


In [13]:
text_list = df['text_to_string'].tolist()

corpus = pre_process_corpus(text_list)

nlp = spacy.load('en_core_web_sm') # Modelo pre treinado, large language model

bag_vector = [nlp(sentence).vector for sentence in corpus]

print(bag_vector)

# Salvando os vetores One-Hot em um arquivo
with open('20News_06.txt', 'w') as f:
    for vector in bag_vector:
        vector_string = ' '.join(map(str, vector))
        f.write(vector_string + '\n')

[array([-0.7954582 , -0.98655176,  0.0515427 ,  0.13508093, -0.15959325,
        0.2586966 ,  1.200045  ,  0.76451427,  0.03854417,  0.14450514,
        0.04850918, -0.17817537, -0.25338086, -0.19959874, -0.30313468,
       -0.12096608, -0.5854509 , -0.79546654,  0.47014832,  0.57572126,
        0.10127136,  1.0037248 , -1.1172953 , -0.60414684,  0.37199104,
        0.3180685 ,  0.07613146,  0.8745186 , -0.35697716,  0.22044638,
        0.3478574 , -0.01097913,  0.04112068,  0.05879265, -0.07342099,
       -0.49622858, -0.52929765, -0.40501   ,  0.3220029 , -0.8892287 ,
       -0.8088497 ,  0.15526013, -0.44938636,  0.62076783, -0.5475764 ,
       -0.5797746 ,  0.9262902 , -0.07025288, -0.11892524,  0.18795982,
       -0.25364587,  0.86911726, -0.41536444, -0.0911693 , -0.09557682,
        0.63764817,  0.9634266 ,  0.14100564, -0.2101104 ,  0.06945258,
       -1.0214891 , -0.42115688, -0.4206717 ,  0.32856828,  0.44711667,
       -0.35674864, -0.01652592,  0.12531166, -0.31684184, -0.4

In [23]:
teste = np.array(bag_vector)
teste.shape

(20, 96)