In [3]:

import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pickle
import numpy as np




In [4]:


df = pd.read_csv("Preprocessado_df.csv")

df['combined'] = df['title'].fillna('') + " " + df['text'].fillna('') # combinar texto com titulo da noticia
df = df.drop(['title'],axis="columns")
df = df.drop(['text'],axis="columns")
df = df.drop(['Unnamed: 0'],axis="columns")
df



Unnamed: 0,label,combined
0,1,law enforcement high alert following threat co...
1,1,unbelievable obama attorney general say charlo...
2,0,bobby jindal raised hindu us story christian c...
3,1,satan 2 russia unvelis image terrifying new su...
4,1,time christian group sue amazon splc designati...
...,...,...
71532,0,russian steal research trump hack u democratic...
71533,1,watch giuliani demand democrat apologize trump...
71534,0,migrant refuse leave train refugee camp hungar...
71535,0,trump tussle give unpopular mexican leader muc...


In [5]:


X = df['combined']  
y = df['label']  


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=22)


max_words = 50000  
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train) 

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

max_len = 120 
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)



In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import  Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam



model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=32, input_length=max_len)) 
model.add(LSTM(32)) 
model.add(Dropout(0.5))  
model.add(Dense(32, activation='relu'))  
model.add(Dense(1, activation='sigmoid')) 
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



In [7]:
# Treinar o modelo
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(X_train_pad, y_train, 
                    epochs=3,
                    batch_size=32, 
                    validation_data=(X_test_pad, y_test),
                    callbacks=[early_stopping])


Epoch 1/3
[1m1565/1565[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 28ms/step - accuracy: 0.8532 - loss: 0.3097 - val_accuracy: 0.9507 - val_loss: 0.1277
Epoch 2/3
[1m1565/1565[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 29ms/step - accuracy: 0.9755 - loss: 0.0736 - val_accuracy: 0.9562 - val_loss: 0.1271
Epoch 3/3
[1m1565/1565[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 27ms/step - accuracy: 0.9888 - loss: 0.0360 - val_accuracy: 0.9527 - val_loss: 0.1569


In [8]:
from sklearn.metrics import classification_report, f1_score
y_pred = model.predict(X_test_pad)
y_pred = (y_pred > 0.5).astype(int)  
print(classification_report(y_test, y_pred))
f1 = f1_score(y_test, y_pred)
print(f"F1-Score: {f1}")

[1m671/671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step
              precision    recall  f1-score   support

           0       0.96      0.95      0.96     10469
           1       0.96      0.96      0.96     10993

    accuracy                           0.96     21462
   macro avg       0.96      0.96      0.96     21462
weighted avg       0.96      0.96      0.96     21462

F1-Score: 0.9573859768550034


In [9]:
model.save('modelo_LSTM.keras')