In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score


In [None]:

train_df = pd.read_csv("cleaned_train_multi.csv")
test_df = pd.read_csv("cleaned_test_multi.csv")


X = train_df['News']
y = train_df['Label']


In [None]:

tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_tfidf = tfidf_vectorizer.fit_transform(X)

X_train, X_val, y_train, y_val = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score


In [None]:
# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_val)

# Performance
print("Logistic Regression:")
print(classification_report(y_val, y_pred_lr))

Logistic Regression:
              precision    recall  f1-score   support

       FALSE       0.76      1.00      0.86       289
   HALF TRUE       0.00      0.00      0.00        23
MOSTLY FALSE       0.00      0.00      0.00        58
PARTLY FALSE       0.00      0.00      0.00        10

    accuracy                           0.76       380
   macro avg       0.19      0.25      0.22       380
weighted avg       0.58      0.76      0.66       380



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_val)

# Performance
print("Random Forest:")
print(classification_report(y_val, y_pred_rf))


Random Forest:
              precision    recall  f1-score   support

       FALSE       0.77      0.99      0.87       289
   HALF TRUE       0.50      0.13      0.21        23
MOSTLY FALSE       0.50      0.05      0.09        58
PARTLY FALSE       0.00      0.00      0.00        10

    accuracy                           0.77       380
   macro avg       0.44      0.29      0.29       380
weighted avg       0.70      0.77      0.69       380



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# SVM
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_val)

# Performance
print("Support Vector Machine (SVM):")
print(classification_report(y_val, y_pred_svm))


Support Vector Machine (SVM):
              precision    recall  f1-score   support

       FALSE       0.76      0.99      0.86       289
   HALF TRUE       0.00      0.00      0.00        23
MOSTLY FALSE       0.50      0.02      0.03        58
PARTLY FALSE       0.00      0.00      0.00        10

    accuracy                           0.76       380
   macro avg       0.32      0.25      0.22       380
weighted avg       0.66      0.76      0.66       380



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()


y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)


# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_val)

# Performance
print("XGBoost:")
print(classification_report(y_val, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.



XGBoost:
              precision    recall  f1-score   support

           0       0.77      0.92      0.84       289
           1       0.23      0.13      0.17        23
           2       0.25      0.09      0.13        58
           3       0.00      0.00      0.00        10

    accuracy                           0.72       380
   macro avg       0.31      0.28      0.28       380
weighted avg       0.64      0.72      0.67       380



In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# LSTM Model
def build_lstm_model(input_dim, output_dim, input_length, num_classes):
    model = Sequential([
        Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length),
        LSTM(128, return_sequences=False),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

lstm_model = build_lstm_model(input_dim=10000, output_dim=128, input_length=200, num_classes=3)
lstm_model.summary()




In [None]:
# Training the model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_df['News'])


original_train_indices = train_df.index[np.arange(len(y_train))]
original_val_indices = train_df.index[np.arange(len(y_val))]

X_train_sequences = tokenizer.texts_to_sequences(train_df['News'][original_train_indices])
X_val_sequences = tokenizer.texts_to_sequences(train_df['News'][original_val_indices])

max_length = 200
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post', truncating='post')
X_val_padded = pad_sequences(X_val_sequences, maxlen=max_length, padding='post', truncating='post')


model = lstm_model
history = model.fit(X_train_padded, y_train,
                    validation_data=(X_val_padded, y_val),
                    epochs=10,
                    batch_size=32)

Epoch 1/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.7230 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 2/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7312 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 3/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7163 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 4/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7292 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 5/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7218 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 6/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.7241 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 7/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s

In [None]:
from sklearn.metrics import classification_report


y_pred = model.predict(X_val_padded)
y_pred_classes = y_pred.argmax(axis=1)

print(classification_report(y_val, y_pred_classes))

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
              precision    recall  f1-score   support

           0       0.76      1.00      0.86       289
           1       0.00      0.00      0.00        23
           2       0.00      0.00      0.00        58
           3       0.00      0.00      0.00        10

    accuracy                           0.76       380
   macro avg       0.19      0.25      0.22       380
weighted avg       0.58      0.76      0.66       380



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)  # Convert string labels to numerical labels
y_val = label_encoder.transform(y_val)

# BiLSTM Model
def build_bilstm_model(input_dim, output_dim, input_length, num_classes):
    model = Sequential([
        Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length),
        Bidirectional(LSTM(128, return_sequences=False)),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Assuming train_df, y_train, y_val are already defined in previous cells
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_df['News'])

original_train_indices = train_df.index[np.arange(len(y_train))]
original_val_indices = train_df.index[np.arange(len(y_val))]

X_train_sequences = tokenizer.texts_to_sequences(train_df['News'][original_train_indices])
X_val_sequences = tokenizer.texts_to_sequences(train_df['News'][original_val_indices])

max_length = 200
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post', truncating='post')
X_val_padded = pad_sequences(X_val_sequences, maxlen=max_length, padding='post', truncating='post')


bilstm_model = build_bilstm_model(input_dim=10000, output_dim=128, input_length=200, num_classes=3)
bilstm_model.summary()

# Assigning to model before fitting
model = bilstm_model
history = model.fit(X_train_padded, y_train,
                    validation_data=(X_val_padded, y_val),
                    epochs=10,
                    batch_size=32)



Epoch 1/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 28ms/step - accuracy: 0.6996 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 2/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.7209 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 3/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.7176 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 4/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.7160 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 5/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.7047 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 6/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.7220 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 7/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s

In [None]:
from sklearn.metrics import classification_report

# Use X_val_padded instead of X_val for prediction
y_pred = model.predict(X_val_padded)
y_pred_classes = y_pred.argmax(axis=1)

print(classification_report(y_val, y_pred_classes))

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step
              precision    recall  f1-score   support

           0       0.76      1.00      0.86       289
           1       0.00      0.00      0.00        23
           2       0.00      0.00      0.00        58
           3       0.00      0.00      0.00        10

    accuracy                           0.76       380
   macro avg       0.19      0.25      0.22       380
weighted avg       0.58      0.76      0.66       380



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D

# BiLSTM + CNN Model
def build_bilstm_cnn_model(input_dim, output_dim, input_length, num_classes):
    model = Sequential([
        Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length),
        Bidirectional(LSTM(128, return_sequences=True)),
        Conv1D(filters=64, kernel_size=3, activation='relu'),
        GlobalMaxPooling1D(),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

bilstm_cnn_model = build_bilstm_cnn_model(input_dim=10000, output_dim=128, input_length=200, num_classes=3)
bilstm_cnn_model.summary()




In [None]:
history = model.fit(X_train_padded, y_train,
                    validation_data=(X_val_padded, y_val),
                    epochs=10,
                    batch_size=32)
from sklearn.metrics import classification_report


y_pred = model.predict(X_val_padded)
y_pred_classes = y_pred.argmax(axis=1)

print(classification_report(y_val, y_pred_classes))

Epoch 1/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.7215 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 2/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7170 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 3/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7188 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 4/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7141 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 5/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7333 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 6/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7236 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 7/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from tensorflow.keras.layers import GRU

# BiGRU + CNN Model
def build_bigru_cnn_model(input_dim, output_dim, input_length, num_classes):
    model = Sequential([
        Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length),
        Bidirectional(GRU(128, return_sequences=True)),
        Conv1D(filters=64, kernel_size=3, activation='relu'),
        GlobalMaxPooling1D(),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

bigru_cnn_model = build_bigru_cnn_model(input_dim=10000, output_dim=128, input_length=200, num_classes=3)
bigru_cnn_model.summary()




In [None]:
history = model.fit(X_train_padded, y_train,
                    validation_data=(X_val_padded, y_val),
                    epochs=10,
                    batch_size=32)
from sklearn.metrics import classification_report

y_pred = model.predict(X_val_padded)
y_pred_classes = y_pred.argmax(axis=1)

print(classification_report(y_val, y_pred_classes))

Epoch 1/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.7140 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 2/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.7171 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 3/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.7125 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 4/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7299 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 5/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7121 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 6/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7020 - loss: nan - val_accuracy: 0.7605 - val_loss: nan
Epoch 7/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
