In [1]:
### Importações
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from transformers import AutoTokenizer, AutoModel
import torch

In [2]:
#!kaggle competitions download -c tweet-sentiment-extraction

In [3]:
#!unzip tweet-sentiment-extraction.zip

In [4]:
!ls

groupe4projet.ipynb  sample_submission.csv  tweet-sentiment-extraction.zip
projet3.pdf	     test.csv
README.md	     train.csv


In [5]:
### Data loading and preprocessing
# Point this at the actual location of the training CSV if it lives elsewhere
data_path = 'train.csv'
data = pd.read_csv(filepath_or_buffer=data_path)


In [6]:
# Expected columns: 'text', 'sentiment'
print("Exemplo de dados:")
print(data.head())

# Drop rows where the text or the sentiment is missing. The tokenizer used
# further down is called on each text and only accepts strings, so a single
# NaN entry would abort the (long-running) embedding step.
data_clean = data.dropna(subset=['text', 'sentiment'])

# Separate the texts from the labels (both keep the same index, so they stay aligned)
texts = data_clean['text'].astype(str)
labels = data_clean['sentiment']

# Map the string labels to numeric class ids
label_mapping = {'neutral': 0, 'positive': 1, 'negative': 2}
labels = labels.map(label_mapping)

# Series.map() yields NaN for any value that is not a key of the mapping;
# fail loudly here instead of passing NaN targets to the classifiers.
unmapped = labels.isna()
if unmapped.any():
    unknown_labels = data_clean.loc[unmapped, 'sentiment'].unique().tolist()
    raise ValueError(f"Unmapped sentiment labels: {unknown_labels}")
labels = labels.astype(int)

Exemplo de dados:
       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text sentiment  
0  I`d have responded, if I were going   neutral  
1                             Sooo SAD  negative  
2                          bullying me  negative  
3                       leave me alone  negative  
4                        Sons of ****,  negative  


In [7]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [8]:
### Embedding generation with a pretrained language model
# BERT serves as the example encoder; the tokenizer and the model are loaded
# from the same checkpoint name.
model_name = 'bert-base-uncased'
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name),
)

# Function that generates the embeddings
def generate_embeddings(texts, batch_size=32):
    """Encode texts into fixed-size vectors using the module-level encoder.

    Each text is represented by the final hidden state of the first token
    (index 0, the [CLS] position for BERT-style tokenizers). Texts are
    processed in padded mini-batches rather than one forward pass per text.

    Args:
        texts: Iterable of texts (list, pandas Series, ...). Entries that are
            None or float NaN are encoded as the empty string so that one
            missing value cannot abort a long run; any other non-string
            entry is converted with str().
        batch_size: Number of texts per forward pass. Must be >= 1. Lower it
            if memory is tight.

    Returns:
        A 2-D numpy array with one row per input text, shape
        (len(texts), hidden_size). For empty input the result has shape
        (0, model.config.hidden_size).

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # `t != t` is only true for NaN; this avoids handing floats to the tokenizer.
    cleaned = [
        "" if t is None or (isinstance(t, float) and t != t) else str(t)
        for t in texts
    ]
    if not cleaned:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    # If the model exposes its device (e.g. it was moved to a GPU), the inputs
    # must be placed on the same device before the forward pass.
    device = getattr(model, "device", None)

    chunks = []
    with torch.no_grad():  # inference only, no gradient tracking needed
        for start in range(0, len(cleaned), batch_size):
            batch = cleaned[start:start + batch_size]
            # padding=True pads to the longest text of this batch
            inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=512)
            if device is not None:
                inputs = {key: value.to(device) for key, value in inputs.items()}
            outputs = model(**inputs)
            # Use the first-token (CLS) hidden state as the text representation;
            # .cpu() makes the conversion to numpy valid for GPU tensors too.
            chunks.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
    return np.concatenate(chunks, axis=0)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [9]:
# Generate the embeddings for the training and test splits.
# Each call runs the encoder over a whole split; the training split is
# embedded first, so X_train_embeddings is already bound if the second call
# is interrupted.
print("Gerando embeddings para os dados de treino...")
X_train_embeddings = generate_embeddings(X_train)

print("Gerando embeddings para os dados de teste...")
X_test_embeddings = generate_embeddings(X_test)

Gerando embeddings para os dados de treino...


KeyboardInterrupt: 

In [None]:
### Model configuration
# Dictionary of the classifiers to compare, keyed by display name.
# The random forest is seeded so that repeated runs give the same scores;
# without random_state its bootstrap samples and feature subsets differ on
# every run.
models = {
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000)
}

In [None]:
# Train and evaluate every candidate classifier.
# The loop variables are deliberately NOT named `model` / `model_name`: those
# names are bound to the BERT encoder and its checkpoint name, and
# generate_embeddings() reads the global `model`. Rebinding them here would
# make any later re-run of the embedding cells call a scikit-learn estimator
# instead of the encoder.
results = {}
for clf_name, clf in models.items():
    print(f"Treinando o modelo: {clf_name}")
    clf.fit(X_train_embeddings, y_train)

    # Evaluate on the held-out test split
    y_pred = clf.predict(X_test_embeddings)
    accuracy = accuracy_score(y_test, y_pred)
    # 'weighted' averages the per-class F1 scores weighted by class support
    f1 = f1_score(y_test, y_pred, average='weighted')
    results[clf_name] = {'accuracy': accuracy, 'f1_score': f1}

    print(f"Resultados para {clf_name}:")
    print(f"Acurácia: {accuracy:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))


In [None]:
### Comparison of the results
# Transpose so that each row is a model and each column is a metric
results_df = pd.DataFrame(data=results).transpose()
print("\nResumo dos Resultados:", results_df, sep="\n")