# **Imports**

In [2]:
#%pip install pandas
#%pip install nltk
#%pip install ipywidgets

import pandas as pd
import os
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import ipywidgets as widgets
from IPython.display import display
from collections import Counter

# **Fazendo a leitura dos dados**

In [3]:
issues = pd.read_csv("out/llama-issues-3.csv")
comentarios = pd.read_csv("out/llama-comentarios-3.csv")

#issues = pd.read_csv("out/daggerfall-unity-issues-3.csv")
#comentarios = pd.read_csv("out/daggerfall-unity-comentarios-3.csv")

# Concatenando o Titulo e a Descrição das issues
issues['Info'] = issues['TituloIssue'] + issues['DescricaoIssue']

# Convertendo todos os tipos para String
issues['Info'] = issues['Info'].astype(str).str.lower()
comentarios['Comentario'] = comentarios['Comentario'].astype(str).str.lower()

# **Tokenizando as issues**

In [4]:
# Definindo as StopWords
nltk.download('stopwords')
stopWords = set(stopwords.words('english'))


def removerStopWords(palavras):
    return [palavra for palavra in palavras if palavra not in stopWords and palavra.isalpha()]

# Tokenizando
nltk.download('punkt_tab')
from nltk import tokenize

comentariosToK = []
for i in range (comentarios['Comentario'].size):
    comentariosToK.append(removerStopWords(tokenize.word_tokenize(comentarios.loc[i]['Comentario'])))

infoToK = []
for k in range(issues['Info'].size):
    infoToK.append(removerStopWords(tokenize.word_tokenize(issues.loc[k]['Info'])))

# Juntando os vetores
import itertools

comentariosToK = list(itertools.chain(*comentariosToK))
infoToK = list(itertools.chain(*infoToK))


[nltk_data] Downloading package stopwords to /home/zoega/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/zoega/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# **Contando os substantivos**

In [None]:
nltk.download('averaged_perceptron_tagger_eng')
from nltk import pos_tag

tagsComents = pos_tag(comentariosToK)
tagsInfo = pos_tag(infoToK)

subsComents = [palavra for palavra, tag in tagsComents if tag in ['NN', 'NNS', 'NNP', 'NNPS']]
subsInfo = [palavra for palavra, tag in tagsInfo if tag in ['NN', 'NNS', 'NNP', 'NNPS']]

contagemComents = Counter(subsComents)
contagemInfo = Counter(subsInfo)

dfComents = pd.DataFrame(contagemComents.items(), columns=["Substantivo", "Frequência"])
dfInfo = pd.DataFrame(contagemInfo.items(), columns=["Substantivo", "Frequência"])



# **Exibindo os Dataframes**

In [16]:
dfInfo = dfInfo.sort_values(by='Frequência', ascending=False)
dfInfo

Unnamed: 0,Substantivo,Frequência
8,model,770
443,line,732
37,file,534
107,error,383
110,output,206
...,...,...
1674,pick,1
1676,rest,1
1677,eval,1
1723,administrator,1


In [17]:
dfComents = dfComents.sort_values(by='Frequência', ascending=False)
dfComents

Unnamed: 0,Substantivo,Frequência
1,model,1176
9,line,921
3,file,867
233,issue,688
41,thanks,569
...,...,...
4263,misuse,1
4248,dollar,1
4249,valley,1
4250,billionaires,1
