In [5]:
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import scipy.sparse as sp
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import RandomizedSearchCV

#Leitura e carregamento do arquivo train.csv, limitando para 100.000 linhas
df = pd.read_csv('train.csv', nrows=100000)

#Remover duplicatas
df = df.drop_duplicates()

#Tratar valores nulos
df['comment_text'] = df['comment_text'].fillna('')

#Filtragem e balanceamento das categorias
categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
balanced_datasets = [df[df[category] == 1] for category in categories]
df_balanced = pd.concat(balanced_datasets).drop_duplicates()

#Consolidação das categorias
df_balanced['severe_toxic'] = df_balanced[['toxic', 'severe_toxic', 'threat']].max(axis=1)
df_balanced['offensive'] = df_balanced[['obscene', 'insult', 'identity_hate']].max(axis=1)
df_balanced = df_balanced.drop(columns=['toxic', 'obscene', 'insult', 'identity_hate', 'threat'])

#Separar os comentários e os rótulos
X = df_balanced['comment_text']
y = df_balanced[['severe_toxic', 'offensive']]

In [6]:
#Engenharia de Características
df_balanced['comment_length'] = df_balanced['comment_text'].apply(len)
df_balanced['word_count'] = df_balanced['comment_text'].apply(lambda x: len(x.split()))
df_balanced['special_char_count'] = df_balanced['comment_text'].apply(lambda x: sum(not c.isalnum() for c in x))

#Vetorizar os textos com TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
X_vectorized = vectorizer.fit_transform(X)

#Combinar características vetorizadas com novas características
X_combined = sp.hstack((X_vectorized, df_balanced[['comment_length', 'word_count', 'special_char_count']]))

#Redução de dimensionalidade e seleção de características
pca = PCA(n_components=30)
X_pca = pca.fit_transform(X_combined.toarray())
k_best = SelectKBest(chi2, k=300)
X_kbest = k_best.fit_transform(X_combined, y)

In [7]:
#Dividir os dados em treino e teste com estratificação
X_train, X_test, y_train, y_test = train_test_split(X_kbest, y, test_size=0.2, random_state=42, stratify=y)

#Ajuste de Hiperparâmetros usando RandomizedSearchCV
param_dist = {
    'estimator__n_estimators': [50, 100, 150],
    'estimator__max_depth': [None, 10, 20, 30],
    'estimator__min_samples_split': [2, 5, 10],
    'estimator__min_samples_leaf': [1, 2, 4]
}

model = MultiOutputClassifier(RandomForestClassifier(random_state=42))
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=20, cv=3, verbose=2, n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

print(f'Melhores parâmetros: {random_search.best_params_}')

#Treinar o modelo final com os melhores parâmetros
best_model = random_search.best_estimator_
best_model.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Melhores parâmetros: {'estimator__n_estimators': 150, 'estimator__min_samples_split': 2, 'estimator__min_samples_leaf': 2, 'estimator__max_depth': None}


In [8]:
#Fazer previsões no conjunto de teste
y_pred = best_model.predict(X_test)

#Avaliar a performance do modelo
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

#Calcular recall para cada categoria
recall_per_class = recall_score(y_test, y_pred, average=None)

#Exibir as métricas
print(f'Acurácia: {accuracy}')
print(f'Precisão: {precision}')
print(f'Recall (Geral): {recall}')
print(f'F1: {f1}')

#Exibir o recall para cada classe
classes = ['severe_toxic', 'offensive']
for cls, rec in zip(classes, recall_per_class):
    print(f'Recall para {cls}: {rec}')

Acurácia: 0.7096615988229524
Precisão: 0.8828388782233252
Recall (Geral): 0.9182915506035283
F1: 0.9000969863322741
Recall para severe_toxic: 1.0
Recall para offensive: 0.7975460122699386
