In [None]:
# 1. Install & import
!pip install sentence-transformers scikit-learn nltk
!pip install xgboost
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
# 2. Fetch the NLTK stop-word corpus, then keep the English list as a set
#    so membership checks in clean_text are constant-time.
nltk.download('stopwords')
STOPWORDS = {*stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# 3. Load your dataset
# Reads the CSV into `df`. The path is relative to the notebook's working
# directory. Later cells read the `title`, `text` and `label` columns, so the
# file must provide all three.
df = pd.read_csv("../WELFake_Dataset.csv")  # ← replace with your path

In [None]:

# 4. Preprocessing function
def clean_text(s: object, stopwords=None) -> str:
    """Normalise one raw text field into a space-joined string of kept tokens.

    Steps, in order: lower-case and trim, strip URLs, delete every character
    that is not a lowercase ASCII letter, a digit or whitespace, then drop
    stop words and tokens of two characters or fewer.

    Args:
        s: Raw cell value. Missing values (None / NaN / pd.NA) give "".
            Non-string scalars (e.g. a numeric title parsed by pandas) are
            converted with ``str`` instead of raising ``AttributeError``.
        stopwords: Optional collection of tokens to discard. When omitted,
            the notebook-level ``STOPWORDS`` set is used.

    Returns:
        The cleaned text, or "" when no token survives.
    """
    if not isinstance(s, str):
        if pd.isna(s):
            return ""
        # A CSV column can contain non-string scalars; treat them as text.
        s = str(s)
    if stopwords is None:
        stopwords = STOPWORDS

    s = s.lower().strip()
    s = re.sub(r"http\S+|www\.\S+", "", s)      # remove URLs
    # Punctuation is deleted, not replaced by a space: "it's" becomes "its".
    s = re.sub(r"[^a-z0-9\s]", "", s)
    tokens = [w for w in s.split() if w not in stopwords and len(w) > 2]
    return " ".join(tokens)

In [None]:

# 5. Clean the title and body columns, then join them into one model input.
#    Each row becomes "<clean title> <clean text>" with outer whitespace trimmed.
df['title_clean'] = df['title'].map(clean_text)
df['text_clean'] = df['text'].map(clean_text)
df['combined'] = df['title_clean'].str.cat(df['text_clean'], sep=" ").str.strip()

In [None]:
# 6. Model inputs (list of cleaned strings) and their target labels (array)
X, y = df['combined'].tolist(), df['label'].values

In [None]:
# 7. Hold out 20% of the rows for testing.
#    The split is stratified on the labels and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# 8. Load the all-MiniLM-L6-v2 model
# Builds the sentence encoder that step 9 uses to embed the text splits.
# The cell output shows the model files being downloaded from the Hugging
# Face Hub, so the first run needs network access.
# NOTE(review): presumably the files are cached on later runs — confirm for
# your environment.
model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# 9. Embed the training and test texts (in that order) with identical settings
train_emb, test_emb = (
    model.encode(texts, show_progress_bar=True, batch_size=64)
    for texts in (X_train, X_test)
)

Batches:   0%|          | 0/902 [00:00<?, ?it/s]

In [None]:
# 10. Train a gradient-boosted tree classifier on the sentence embeddings.
# `use_label_encoder` is intentionally not passed: it is deprecated and
# current XGBoost releases (which the unpinned install above pulls in) only
# emit a warning for it.
xgb_clf = XGBClassifier(
    n_estimators=200,            # number of trees
    max_depth=6,                 # tree depth
    learning_rate=0.1,           # shrinkage
    eval_metric='logloss'        # for binary classification
)
xgb_clf.fit(train_emb, y_train)

In [None]:
# 11. Score the classifier on the held-out embeddings
y_pred = xgb_clf.predict(test_emb)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, digits=4)
print("Accuracy:", accuracy)
print(report)

In [None]:
# 12. Persist the trained classifier and the sentence encoder
import os

import joblib

# Neither writer creates missing parent directories, so make sure the
# output folder exists before saving.
os.makedirs("model", exist_ok=True)
xgb_clf.save_model("model/xgb_model.json")
# NOTE(review): this artifact is a pickle. Load it only from a trusted source
# and with library versions compatible with the ones used here.
joblib.dump(model, "model/vectorizer.pkl")