# Importação de dados:

In [None]:
import os
import PyPDF2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

### Baixar recursos do NLTK (apenas na primeira execução)

In [None]:
# Fetch the NLTK resources requested by this notebook, one download call per
# resource name, in the same order as before.
for recurso in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(recurso)

### Função para extrair texto de PDFs

In [None]:
def pdf_para_txt(caminho_pdf):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        caminho_pdf: Path of the PDF file to read.

    Returns:
        The text of all pages in page order, separated by newlines. An
        empty string is returned when the document has no pages.
    """
    with open(caminho_pdf, 'rb') as f:
        leitor = PyPDF2.PdfReader(f)
        # Iterate the pages directly and join once instead of growing a
        # string with `+=`. Pages are joined with a newline so the last word
        # of one page is not fused with the first word of the next, and
        # `or ""` defensively covers a page whose extraction yields no text.
        return "\n".join(
            pagina.extract_text() or "" for pagina in leitor.pages
        )

### Diretórios com os PDFs

In [None]:
# Map each class label to the directory that holds its PDF files.
diretorios = dict(
    poesia='pdfs/poesia/',
    prosa='pdfs/prosa/',
    jornalismo='pdfs/jornalismo/',
)

### Função para limpar e remover stopwords

In [None]:
def limpar_texto(texto, idioma='english'):
    """Lowercase and tokenize a text, dropping stopwords and non-alphanumeric tokens.

    Args:
        texto: Raw text to clean.
        idioma: NLTK language name used for both the stopword list and the
            tokenizer. Defaults to 'english', the value that used to be
            hard-coded, so existing callers behave exactly as before.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # NOTE(review): the class names ('poesia', 'prosa', 'jornalismo') suggest
    # a Portuguese corpus; if so, callers should pass idioma='portuguese'.
    # Confirm against the actual PDFs before changing the default.
    stop_words = set(stopwords.words(idioma))
    palavras = word_tokenize(texto.lower(), language=idioma)
    # Keep only purely alphanumeric tokens (this drops punctuation) that are
    # not in the stopword set.
    return " ".join(
        palavra
        for palavra in palavras
        if palavra.isalnum() and palavra not in stop_words
    )

### Extraindo textos e gerando classes

In [None]:
textos = []
classes = []

# Walk every class directory and collect one cleaned document per PDF,
# labelled with the dictionary key of the directory it came from.
for classe, caminho in diretorios.items():
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # document order (and therefore any seeded split of it) reproducible.
    for arquivo in sorted(os.listdir(caminho)):
        # Compare the extension case-insensitively so "FILE.PDF" is kept.
        if not arquivo.lower().endswith('.pdf'):
            continue
        texto = pdf_para_txt(os.path.join(caminho, arquivo))
        texto_limpo = limpar_texto(texto)
        textos.append(texto_limpo)
        classes.append(classe)

### Criando a matriz TF-IDF (Bag of Words ponderado)

In [None]:
# Fit a TF-IDF vectorizer on the cleaned documents and build the
# document-term matrix in a single step.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(textos)

# Print the dense matrix, the feature names and the labels for inspection.
for rotulo, conteudo in (
    ("Matriz BoW: ", X.toarray()),
    ("Vocabulário: ", vectorizer.get_feature_names_out()),
    ("Classes: ", classes),
):
    print(rotulo, conteudo)

### Divisão dos dados em treino e teste

In [None]:
# Hold out 30% of the documents for testing, with a fixed seed. Stratifying
# on the labels keeps the class proportions the same in both splits, in line
# with the StratifiedKFold splitter this notebook uses for model selection.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    classes,
    test_size=0.3,
    random_state=42,
    stratify=classes,
)

# Árvore de Decisão

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from sklearn.tree import DecisionTreeClassifier

# Number of cross-validation folds shared by every model in this notebook.
SPLITS = 10

# Seeded, shuffled stratified splitter reused by all grid searches.
skf = StratifiedKFold(n_splits=SPLITS, shuffle=True, random_state=42)

dtc = DecisionTreeClassifier(random_state=42)

# Search space: entropy criterion, unlimited depth vs. depth 20, fixed seed.
param_grid = dict(
    criterion=['entropy'],
    max_depth=[None, 20],
    random_state=[42],
)

# Score every candidate on weighted F1 and accuracy; refit on weighted F1.
grid_search = GridSearchCV(
    estimator=dtc,
    param_grid=param_grid,
    scoring=['f1_weighted', 'accuracy'],
    refit='f1_weighted',
    cv=skf,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print(
    f'Melhores parametros: {grid_search.best_params_}',
    f'Melhor score: {grid_search.best_score_}',
    sep='\n',
)


In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Predict the held-out test split with the best estimator of the grid search.
y_pred = grid_search.best_estimator_.predict(X_test)

# Accuracy and weighted F1 of those predictions against the true labels.
accuracy, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)

print(
    'Arvore de Decisão:',
    f'\tAccuracia: {accuracy:0.2}',
    f'\tF1 score: {f1:0.2}',
    sep='\n',
)

In [None]:
# Cross-validate the best Decision Tree estimator on the full data set
# (X, classes) with the shared `skf` splitter, scoring accuracy and
# weighted F1 on each fold.
scores = cross_validate(
    grid_search.best_estimator_,
    X,
    classes,
    scoring=['accuracy', 'f1_weighted'],
    cv=skf,
    return_train_score=False,
)

# Bind the per-fold arrays to names first: reusing the f-string's own quote
# character inside a replacement field is a SyntaxError before Python 3.12.
acuracias = scores['test_accuracy']
f1_ponderados = scores['test_f1_weighted']

for fold, (acc_fold, f1_fold) in enumerate(zip(acuracias, f1_ponderados), start=1):
    print(f'Fold {fold}:')
    print(f'\tAccuracia: {acc_fold:0.2}')
    print(f'\tF1 score: {f1_fold:0.2}')

# The summary statistics are reported for accuracy only.
print(f'Media: {acuracias.mean():0.2}')
print(f'Desvio Padrao: {acuracias.std():0.2}')


# K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

knc = KNeighborsClassifier()

# Search space: compare 5 neighbours against 1 neighbour.
param_grid = dict(n_neighbors=[5, 1])

# Score every candidate on weighted F1 and accuracy; refit on weighted F1.
grid_search = GridSearchCV(
    estimator=knc,
    param_grid=param_grid,
    scoring=['f1_weighted', 'accuracy'],
    refit='f1_weighted',
    cv=skf,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print(
    f'Melhores parametros: {grid_search.best_params_}',
    f'Melhor score: {grid_search.best_score_}',
    sep='\n',
)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Predict the held-out test split with the best KNN estimator.
y_pred = grid_search.best_estimator_.predict(X_test)

# Accuracy and weighted F1 of those predictions against the true labels.
accuracy, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)

print(
    'K-Nearest Neighbors:',
    f'\tAccuracia: {accuracy:0.2}',
    f'\tF1 score: {f1:0.2}',
    sep='\n',
)

In [None]:
# Cross-validate the best KNN estimator on the full data set (X, classes)
# with the shared `skf` splitter, scoring accuracy and weighted F1 per fold.
scores = cross_validate(
    grid_search.best_estimator_,
    X,
    classes,
    scoring=['accuracy', 'f1_weighted'],
    cv=skf,
    return_train_score=False,
)

# Bind the per-fold arrays to names first: reusing the f-string's own quote
# character inside a replacement field is a SyntaxError before Python 3.12.
acuracias = scores['test_accuracy']
f1_ponderados = scores['test_f1_weighted']

for fold, (acc_fold, f1_fold) in enumerate(zip(acuracias, f1_ponderados), start=1):
    print(f'Fold {fold}:')
    print(f'\tAccuracia: {acc_fold:0.2}')
    print(f'\tF1 score: {f1_fold:0.2}')

# The summary statistics are reported for accuracy only.
print(f'Media: {acuracias.mean():0.2}')
print(f'Desvio Padrao: {acuracias.std():0.2}')

# Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()

# Search space: two candidate values for the `alpha` parameter.
param_grid = dict(alpha=[1, 0.5])

# Score every candidate on weighted F1 and accuracy; refit on weighted F1.
grid_search = GridSearchCV(
    estimator=mnb,
    param_grid=param_grid,
    scoring=['f1_weighted', 'accuracy'],
    refit='f1_weighted',
    cv=skf,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print(
    f'Melhores parametros: {grid_search.best_params_}',
    f'Melhor score: {grid_search.best_score_}',
    sep='\n',
)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Predict the held-out test split with the best Naive Bayes estimator.
y_pred = grid_search.best_estimator_.predict(X_test)

# Accuracy and weighted F1 of those predictions against the true labels.
accuracy, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)

print(
    'Naive Bayes:',
    f'\tAccuracia: {accuracy:0.2}',
    f'\tF1 score: {f1:0.2}',
    sep='\n',
)

In [None]:
# Cross-validate the best Naive Bayes estimator on the full data set
# (X, classes) with the shared `skf` splitter, scoring accuracy and
# weighted F1 on each fold.
scores = cross_validate(
    grid_search.best_estimator_,
    X,
    classes,
    scoring=['accuracy', 'f1_weighted'],
    cv=skf,
    return_train_score=False,
)

# Bind the per-fold arrays to names first: reusing the f-string's own quote
# character inside a replacement field is a SyntaxError before Python 3.12.
acuracias = scores['test_accuracy']
f1_ponderados = scores['test_f1_weighted']

for fold, (acc_fold, f1_fold) in enumerate(zip(acuracias, f1_ponderados), start=1):
    print(f'Fold {fold}:')
    print(f'\tAccuracia: {acc_fold:0.2}')
    print(f'\tF1 score: {f1_fold:0.2}')

# The summary statistics are reported for accuracy only.
print(f'Media: {acuracias.mean():0.2}')
print(f'Desvio Padrao: {acuracias.std():0.2}')

# Regressão Logística

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

# Search space: L2 penalty vs. no penalty, with a fixed seed.
param_grid = dict(
    penalty=['l2', None],
    random_state=[42],
)

# Score every candidate on weighted F1 and accuracy; refit on weighted F1.
grid_search = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    scoring=['f1_weighted', 'accuracy'],
    refit='f1_weighted',
    cv=skf,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print(
    f'Melhores parametros: {grid_search.best_params_}',
    f'Melhor score: {grid_search.best_score_}',
    sep='\n',
)


In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Predict the held-out test split with the best Logistic Regression estimator.
y_pred = grid_search.best_estimator_.predict(X_test)

# Accuracy and weighted F1 of those predictions against the true labels.
accuracy, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)

print(
    'Regressão Logística:',
    f'\tAccuracia: {accuracy:0.2}',
    f'\tF1 score: {f1:0.2}',
    sep='\n',
)

In [None]:
# Cross-validate the best Logistic Regression estimator on the full data set
# (X, classes) with the shared `skf` splitter, scoring accuracy and
# weighted F1 on each fold.
scores = cross_validate(
    grid_search.best_estimator_,
    X,
    classes,
    scoring=['accuracy', 'f1_weighted'],
    cv=skf,
    return_train_score=False,
)

# Bind the per-fold arrays to names first: reusing the f-string's own quote
# character inside a replacement field is a SyntaxError before Python 3.12.
acuracias = scores['test_accuracy']
f1_ponderados = scores['test_f1_weighted']

for fold, (acc_fold, f1_fold) in enumerate(zip(acuracias, f1_ponderados), start=1):
    print(f'Fold {fold}:')
    print(f'\tAccuracia: {acc_fold:0.2}')
    print(f'\tF1 score: {f1_fold:0.2}')

# The summary statistics are reported for accuracy only.
print(f'Media: {acuracias.mean():0.2}')
print(f'Desvio Padrao: {acuracias.std():0.2}')

# Rede Neural MLP

In [None]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier()

# Search space: two hidden layers of 50 or of 100 units each, fixed seed.
param_grid = dict(
    hidden_layer_sizes=[(50, 50), (100, 100)],
    random_state=[42],
)

# Score every candidate on weighted F1 and accuracy; refit on weighted F1.
grid_search = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    scoring=['f1_weighted', 'accuracy'],
    refit='f1_weighted',
    cv=skf,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print(
    f'Melhores parametros: {grid_search.best_params_}',
    f'Melhor score: {grid_search.best_score_}',
    sep='\n',
)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Predict the held-out test split with the best MLP estimator.
y_pred = grid_search.best_estimator_.predict(X_test)

# Accuracy and weighted F1 of those predictions against the true labels.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

# This section evaluates the MLP; the heading used to be a copy-paste of the
# Logistic Regression label, which mislabelled the results.
print('Rede Neural MLP:')
print(f'\tAccuracia: {accuracy:0.2}')
print(f'\tF1 score: {f1:0.2}')

In [None]:
# Cross-validate the best MLP estimator on the full data set (X, classes)
# with the shared `skf` splitter, scoring accuracy and weighted F1 per fold.
scores = cross_validate(
    grid_search.best_estimator_,
    X,
    classes,
    scoring=['accuracy', 'f1_weighted'],
    cv=skf,
    return_train_score=False,
)

# Bind the per-fold arrays to names first: reusing the f-string's own quote
# character inside a replacement field is a SyntaxError before Python 3.12.
acuracias = scores['test_accuracy']
f1_ponderados = scores['test_f1_weighted']

for fold, (acc_fold, f1_fold) in enumerate(zip(acuracias, f1_ponderados), start=1):
    print(f'Fold {fold}:')
    print(f'\tAccuracia: {acc_fold:0.2}')
    print(f'\tF1 score: {f1_fold:0.2}')

# The summary statistics are reported for accuracy only.
print(f'Media: {acuracias.mean():0.2}')
print(f'Desvio Padrao: {acuracias.std():0.2}')