In [71]:
import pandas as pd
import nltk
import string

# Baixa os recursos necessários do NLTK
nltk.download('punkt')        # Tokenizador pré-treinado
nltk.download('stopwords')    # Lista de stopwords (palavras irrelevantes) em vários idiomas
nltk.download('wordnet')      # Base de dados semântica usada na lematização (em inglês)

# Importa componentes do NLTK
from nltk.corpus import stopwords  # Lista de palavras irrelevantes para remoção
from nltk.stem import SnowballStemmer, RSLPStemmer, PorterStemmer, WordNetLemmatizer  # Stemmers e lematizador
from nltk.tokenize import TweetTokenizer  # Tokenizador ideal para textos informais (como tweets, emojis, hashtags)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rbeat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rbeat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rbeat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [72]:
df = pd.read_csv('database/aa_dataset-tickets-multi-lang-5-2-50-version.csv')

In [73]:
df.head(5)

Unnamed: 0,subject,body,answer,type,queue,priority,language,version,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6,tag_7,tag_8
0,Wesentlicher Sicherheitsvorfall,"Sehr geehrtes Support-Team,\n\nich möchte eine...",Vielen Dank für die Meldung des kritischen Sic...,Incident,Technical Support,high,de,51,Security,Outage,Disruption,Data Breach,,,,
1,Account Disruption,"Dear Customer Support Team,\n\nI am writing to...","Thank you for reaching out, <name>. We are awa...",Incident,Technical Support,high,en,51,Account,Disruption,Outage,IT,Tech Support,,,
2,Query About Smart Home System Integration Feat...,"Dear Customer Support Team,\n\nI hope this mes...",Thank you for your inquiry. Our products suppo...,Request,Returns and Exchanges,medium,en,51,Product,Feature,Tech Support,,,,,
3,Inquiry Regarding Invoice Details,"Dear Customer Support Team,\n\nI hope this mes...",We appreciate you reaching out with your billi...,Request,Billing and Payments,low,en,51,Billing,Payment,Account,Documentation,Feedback,,,
4,Question About Marketing Agency Software Compa...,"Dear Support Team,\n\nI hope this message reac...",Thank you for your inquiry. Our product suppor...,Problem,Sales and Pre-Sales,medium,en,51,Product,Feature,Feedback,Tech Support,,,,


In [74]:
# Verificando quais idiomas estão os emails 
print("Idiomas: " + ', '.join(df['language'].unique().astype(str)))
# (DE- Alemão, EN- Inglês)

# Verificando quais categorias existem
print("Categorias: " + ", " .join(df['type'].unique().astype(str)))

Idiomas: de, en
Categorias: Incident, Request, Problem, Change


In [75]:
# Filtrando os emaisl apenas na lingua inglesa
english_df = df[df['language'] == "en"]

# Removendo colunas não utilizadas
english_df = english_df.drop(columns= ["answer", "queue", "priority", "language", "version", "tag_1", "tag_2", "tag_3", "tag_4", "tag_5", "tag_6", "tag_7", "tag_8"])

In [76]:
english_df

Unnamed: 0,subject,body,type
1,Account Disruption,"Dear Customer Support Team,\n\nI am writing to...",Incident
2,Query About Smart Home System Integration Feat...,"Dear Customer Support Team,\n\nI hope this mes...",Request
3,Inquiry Regarding Invoice Details,"Dear Customer Support Team,\n\nI hope this mes...",Request
4,Question About Marketing Agency Software Compa...,"Dear Support Team,\n\nI hope this message reac...",Problem
5,Feature Query,"Dear Customer Support,\n\nI hope this message ...",Request
...,...,...,...
28578,Problem with Billing Adjustment,An unexpected billing discrepancy has been not...,Incident
28580,Urgent: Incident Involving Data Breach in Medi...,"A data breach has occurred, which might be rel...",Problem
28582,Performance Problem with Data Analytics Tool,The data analytics tool experiences sluggish p...,Incident
28585,Update Request for SaaS Platform Integration F...,Requesting an update on the integration featur...,Change


# Pré-processamento de texto

In [77]:
# Tokenizador e ferramentas
tokenizador = TweetTokenizer()
stemmer_en = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stopwords_en = set(stopwords.words('english'))

def pre_processamento(texto_en):
    # Garantir que o texto seja string
    texto_en = str(texto_en)

    # Tokenização
    tokens_en = tokenizador.tokenize(texto_en)

    # Tokens sem pontuação
    tokens_en_sem_pontuacao = [t for t in tokens_en if t not in string.punctuation]

    # Stopwords
    tokens_sem_stopwords_en = [t for t in tokens_en_sem_pontuacao if t.lower() not in stopwords_en]

    # Stemming
    stems_en = [stemmer_en.stem(t) for t in tokens_sem_stopwords_en]

    # Lematização
    lemmas_en = [lemmatizer.lemmatize(t.lower()) for t in tokens_sem_stopwords_en]

    return lemmas_en  # ou tokens_sem_stopwords_en, ou stems_en, conforme o que quiser usar

In [78]:
english_df['body_processado'] = english_df['body'].apply(pre_processamento)

In [79]:
english_df

Unnamed: 0,subject,body,type,body_processado
1,Account Disruption,"Dear Customer Support Team,\n\nI am writing to...",Incident,"[dear, customer, support, team, n, ni, writing..."
2,Query About Smart Home System Integration Feat...,"Dear Customer Support Team,\n\nI hope this mes...",Request,"[dear, customer, support, team, n, ni, hope, m..."
3,Inquiry Regarding Invoice Details,"Dear Customer Support Team,\n\nI hope this mes...",Request,"[dear, customer, support, team, n, ni, hope, m..."
4,Question About Marketing Agency Software Compa...,"Dear Support Team,\n\nI hope this message reac...",Problem,"[dear, support, team, n, ni, hope, message, re..."
5,Feature Query,"Dear Customer Support,\n\nI hope this message ...",Request,"[dear, customer, support, n, ni, hope, message..."
...,...,...,...,...
28578,Problem with Billing Adjustment,An unexpected billing discrepancy has been not...,Incident,"[unexpected, billing, discrepancy, noticed, in..."
28580,Urgent: Incident Involving Data Breach in Medi...,"A data breach has occurred, which might be rel...",Problem,"[data, breach, occurred, might, related, outda..."
28582,Performance Problem with Data Analytics Tool,The data analytics tool experiences sluggish p...,Incident,"[data, analytics, tool, experience, sluggish, ..."
28585,Update Request for SaaS Platform Integration F...,Requesting an update on the integration featur...,Change,"[requesting, update, integration, feature, saa..."
