In [30]:
import pandas as pd
import nltk
import string

# Baixa os recursos necessários do NLTK
nltk.download('punkt')        # Tokenizador pré-treinado
nltk.download('stopwords')    # Lista de stopwords (palavras irrelevantes) em vários idiomas
nltk.download('wordnet')      # Base de dados semântica usada na lematização (em inglês)

# Importa componentes do NLTK
from nltk.corpus import stopwords  # Lista de palavras irrelevantes para remoção
from nltk.stem import SnowballStemmer, RSLPStemmer, PorterStemmer, WordNetLemmatizer  # Stemmers e lematizador
from nltk.tokenize import TweetTokenizer  # Tokenizador ideal para textos informais (como tweets, emojis, hashtags)

from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     Basic Constraints of CA cert not marked critical
[nltk_data]     (_ssl.c:1028)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     Basic Constraints of CA cert not marked critical
[nltk_data]     (_ssl.c:1028)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     Basic Constraints of CA cert not marked critical
[nltk_data]     (_ssl.c:1028)>


In [2]:
# Carregando DF
df = pd.read_csv('database/aa_dataset-tickets-multi-lang-5-2-50-version.csv')
df.head(2)

Unnamed: 0,subject,body,answer,type,queue,priority,language,version,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6,tag_7,tag_8
0,Wesentlicher Sicherheitsvorfall,"Sehr geehrtes Support-Team,\n\nich möchte eine...",Vielen Dank für die Meldung des kritischen Sic...,Incident,Technical Support,high,de,51,Security,Outage,Disruption,Data Breach,,,,
1,Account Disruption,"Dear Customer Support Team,\n\nI am writing to...","Thank you for reaching out, <name>. We are awa...",Incident,Technical Support,high,en,51,Account,Disruption,Outage,IT,Tech Support,,,


In [3]:
df.dtypes

subject     object
body        object
answer      object
type        object
queue       object
priority    object
language    object
version      int64
tag_1       object
tag_2       object
tag_3       object
tag_4       object
tag_5       object
tag_6       object
tag_7       object
tag_8       object
dtype: object

In [4]:
# Verificando idiomas do DF
print(df['language'].unique())

# Verificando categorias dos chamados
print(df['type'].unique())


['de' 'en']
['Incident' 'Request' 'Problem' 'Change']


In [11]:
# Filtrando os emaisl apenas na lingua inglesa
df_en = df[df['language'] == "en"]

# Removendo colunas não utilizadas
df_en = df_en.drop(columns= ["subject", "answer", "queue", "priority", "language", "version", "tag_1", "tag_2", "tag_3", "tag_4", "tag_5", "tag_6", "tag_7", "tag_8"])

In [12]:
# Verificando se os dados necessário estão no df
df_en.isnull().sum()

body    0
type    0
dtype: int64

# Pré-processamento do texto

### Tokenização

Dividir o texto em unidades menores que podem ser facilmente processados ​​por modelos de PNL.

In [20]:
def tokenizador(data):
    # Inicializar a classe tokenizer
    tokenizer = TweetTokenizer()
    tokenized_text = tokenizer.tokenize(data)
    
    return tokenized_text

In [21]:
df_en['tokenized_body'] = df_en['body'].apply(tokenizador)

In [22]:
df_en

Unnamed: 0,body,type,tokenized_body
1,"Dear Customer Support Team,\n\nI am writing to...",Incident,"[Dear, Customer, Support, Team, ,, \, n, \, nI..."
2,"Dear Customer Support Team,\n\nI hope this mes...",Request,"[Dear, Customer, Support, Team, ,, \, n, \, nI..."
3,"Dear Customer Support Team,\n\nI hope this mes...",Request,"[Dear, Customer, Support, Team, ,, \, n, \, nI..."
4,"Dear Support Team,\n\nI hope this message reac...",Problem,"[Dear, Support, Team, ,, \, n, \, nI, hope, th..."
5,"Dear Customer Support,\n\nI hope this message ...",Request,"[Dear, Customer, Support, ,, \, n, \, nI, hope..."
...,...,...,...
28578,An unexpected billing discrepancy has been not...,Incident,"[An, unexpected, billing, discrepancy, has, be..."
28580,"A data breach has occurred, which might be rel...",Problem,"[A, data, breach, has, occurred, ,, which, mig..."
28582,The data analytics tool experiences sluggish p...,Incident,"[The, data, analytics, tool, experiences, slug..."
28585,Requesting an update on the integration featur...,Change,"[Requesting, an, update, on, the, integration,..."


### Remoção de stopwords

São consideradas como tendo pouco valor semântico ou são insignificantes para a tarefa

Cada linguagem, tem sua lista de stop words. Como a língua dos textos que estamos pré-processando é inglês, vamos lidar com elas

In [29]:
stopwords_en = set(stopwords.words('english'))

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\Rebeka.Dias/nltk_data'
    - 'c:\\workspace\\Classificacao-de-chamados-de-suporte\\venv\\nltk_data'
    - 'c:\\workspace\\Classificacao-de-chamados-de-suporte\\venv\\share\\nltk_data'
    - 'c:\\workspace\\Classificacao-de-chamados-de-suporte\\venv\\lib\\nltk_data'
    - 'C:\\Users\\Rebeka.Dias\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
def stopwords():
    