In [None]:
# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns




# Data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Model
from sklearn.linear_model import PassiveAggressiveClassifier

# Evaluation
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [None]:
# Attach Google Drive to this Colab runtime so the CSV files under it can be read.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import pandas as pd

# Location of the training CSV on the mounted Drive; change it to match your layout.
file_path = '/content/drive/My Drive/train.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Print the first five rows as a quick check that the load worked.
preview = df.head()
print(preview)


   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1  
1  Ever get the feeling your life circles the rou...      0  
2  Why the Truth Might Get You Fired October 29, ...      1  
3  Videos 15 Civilians Killed In Single US Airstr...      1  
4  Print \nAn Iranian woman has been sentenced to...      1  


In [None]:
import pandas as pd


# Assign the column names explicitly (adjust the list if the CSV layout differs).
df.columns = ['id', 'title', 'author', 'text', 'label']

print(df.head())

# Report the number of missing values per column. A bare `df.isnull().sum()` in the
# middle of a cell is evaluated and thrown away, so it has to be printed explicitly.
print(df.isnull().sum())

# Only 'text' and 'label' are used by the models in this notebook, so drop rows that
# lack either of those instead of every row with a missing title or author.
df.dropna(subset=['text', 'label'], inplace=True)


   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1  
1  Ever get the feeling your life circles the rou...      0  
2  Why the Truth Might Get You Fired October 29, ...      1  
3  Videos 15 Civilians Killed In Single US Airstr...      1  
4  Print \nAn Iranian woman has been sentenced to...      1  


In [None]:
# Show the column labels currently set on the frame.
column_labels = df.columns
print(column_labels)


Index(['id', 'title', 'author', 'text', 'label'], dtype='object')


In [None]:
# Model input is the article body; the prediction target is the `label` column.
X, y = df['text'], df['label']


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# TF-IDF features: English stop words are removed and terms that occur in more than
# 70% of the documents are ignored.
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# Learn vocabulary and IDF weights from the training texts only ...
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
# ... then encode the held-out texts with that same fitted vectorizer.
tfidf_test = tfidf_vectorizer.transform(X_test)


In [None]:
# Passive-Aggressive linear classifier, limited to 50 passes over the training data.
# random_state is fixed (same seed as the train/test split) because the estimator
# shuffles the training data between epochs; without a seed the fitted weights and
# the reported accuracy change from run to run.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42)
pac.fit(tfidf_train, y_train)


In [None]:
# Score the Passive-Aggressive model on the held-out split.
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {round(score * 100, 2)}%')

# Confusion matrix first, then per-class precision / recall / F1.
for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)


Accuracy: 96.14%
[[2017   65]
 [  76 1499]]
              precision    recall  f1-score   support

           0       0.96      0.97      0.97      2082
           1       0.96      0.95      0.96      1575

    accuracy                           0.96      3657
   macro avg       0.96      0.96      0.96      3657
weighted avg       0.96      0.96      0.96      3657



In [None]:
import joblib

# Write the fitted TF-IDF vectorizer and the Passive-Aggressive model to disk.
vectorizer_file = 'tfidf_vectorizer.pkl'
classifier_file = 'fake_news_model.pkl'
joblib.dump(tfidf_vectorizer, vectorizer_file)
joblib.dump(pac, classifier_file)


['fake_news_model.pkl']

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression baseline trained on the same TF-IDF features.
model = LogisticRegression()
model.fit(tfidf_train, y_train)
y_pred = model.predict(tfidf_test)
print(accuracy_score(y_test, y_pred))

# Persist the vectorizer together with the classifier trained in THIS cell.
# `model` (not `pac`, the Passive-Aggressive classifier) must be dumped, otherwise
# 'Logistic.pkl' would silently contain a different model than its name says.
joblib.dump(tfidf_vectorizer, 'tfidf_Logistic.pkl')
joblib.dump(model, 'Logistic.pkl')




0.9464041564123599


['Logistic.pkl']

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline (100 trees, seeded) trained on the same TF-IDF features.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(tfidf_train, y_train)
y_pred = model.predict(tfidf_test)
print(accuracy_score(y_test, y_pred))

# Persist the vectorizer together with the classifier trained in THIS cell.
# `model` (not `pac`, the Passive-Aggressive classifier) must be dumped, otherwise
# 'RFC.pkl' would silently contain a different model than its name says.
joblib.dump(tfidf_vectorizer, 'tfidf_RFC.pkl')
joblib.dump(model, 'RFC.pkl')


0.9135903746240087


['RFC.pkl']

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees baseline (library defaults) on the same TF-IDF features.
model = XGBClassifier()
model.fit(tfidf_train, y_train)
y_pred = model.predict(tfidf_test)
print(accuracy_score(y_test, y_pred))

# Persist the vectorizer together with the classifier trained in THIS cell.
# `model` (not `pac`, the Passive-Aggressive classifier) must be dumped, otherwise
# 'XGB.pkl' would silently contain a different model than its name says.
joblib.dump(tfidf_vectorizer, 'tfidf_XGB.pkl')
joblib.dump(model, 'XGB.pkl')


0.9546076018594476


['XGB.pkl']

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest: number of trees x maximum tree depth.
params = {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]}

# 5-fold cross-validated search over every combination in `params`. The forest is
# seeded so that the selected parameters and the reported accuracy are repeatable;
# an unseeded forest can pick a different "best" combination on each run.
grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=params, cv=5)
grid.fit(tfidf_train, y_train)
print(grid.best_params_)

# Evaluate the best estimator found by the search on the held-out split.
best_model = grid.best_estimator_
y_pred = best_model.predict(tfidf_test)
print(accuracy_score(y_test, y_pred))


In [None]:
# Lower-case every article body via the pandas string accessor.
lowercased_text = df['text'].str.lower()
df['text'] = lowercased_text


In [None]:
# Replace each run of characters that are neither word characters nor whitespace
# with a single space.
punctuation_pattern = r'[^\w\s]+'
df['text'] = df['text'].str.replace(punctuation_pattern, ' ', regex=True)


In [None]:
import pandas as pd
import re
from nltk.stem import PorterStemmer

# Re-read the training CSV from Drive into `df`.
train_csv_path = '/content/drive/My Drive/train.csv'
df = pd.read_csv(train_csv_path)

# Set the column labels explicitly, then discard rows that have no article text.
expected_columns = ['id', 'title', 'author', 'text', 'label']
df.columns = expected_columns
df.dropna(subset=['text'], inplace=True)

# Porter stemmer instance used by stem_text() below.
stemmer = PorterStemmer()

# Regex-based tokenise-and-stem helper that tolerates non-string input
def stem_text(text):
    """Lower-case `text`, split it into word tokens and stem each token.

    Uses the module-level `stemmer`. Returns the stems joined by single
    spaces; any non-string input yields an empty string.
    """
    if isinstance(text, str):
        words = re.findall(r'\b\w+\b', text.lower())
        return ' '.join(stemmer.stem(token) for token in words)
    return ""

# Replace each article body with its stemmed form.
stemmed_text = df['text'].apply(stem_text)
df['text'] = stemmed_text

# Print the first rows to check the transformation.
print(df.head())


   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  
0  hous dem aid we didn t even see comey s letter...      1  
1  ever get the feel your life circl the roundabo...      0  
2  whi the truth might get you fire octob 29 2016...      1  
3  video 15 civilian kill in singl us airstrik ha...      1  
4  print an iranian woman ha been sentenc to six ...      1  


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Assumes `df` already holds the 'text' and 'label' columns.
# df = pd.read_csv('your_dataset.csv')

y = df['label']

# Split the raw texts FIRST. Fitting TF-IDF on the whole corpus and splitting
# afterwards lets the held-out documents shape the vocabulary and IDF weights,
# which leaks test information into training and inflates the reported accuracy.
train_text, test_text, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42
)

# TF-IDF over unigrams + bigrams, vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train = tfidf.fit_transform(train_text)  # vocabulary/IDF learned from training texts only
X_test = tfidf.transform(test_text)        # held-out texts encoded with the fitted vectorizer

# Train Logistic Regression classifier
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9470262460871659


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from torch.utils.data import Dataset

# Prefer CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Read the training CSV and keep only rows that have both a text and a label.
df = pd.read_csv('/content/drive/My Drive/train.csv')
df.dropna(subset=['text', 'label'], inplace=True)

# 80/20 train/validation split over plain Python lists (seeded for repeatability).
all_texts = df['text'].tolist()
all_labels = df['label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts, all_labels, test_size=0.2, random_state=42
)

# Tokenizer belonging to the bert-base-uncased checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode both splits with identical settings: truncation on, padding on,
# at most 512 tokens per example.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)

# Dataset class
class NewsDataset(Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name (e.g. "input_ids") to a sequence
            that holds one entry per example.
        labels: Optional sequence with one label per example. When omitted,
            items carry no "labels" key, so unlabeled data can be wrapped
            without inventing dummy labels.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors (plus "labels" when available)."""
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples: taken from the labels, or from the first feature if unlabeled."""
        if self.labels is not None:
            return len(self.labels)
        # No labels: every feature has one entry per example, so any of them gives the size.
        return len(next(iter(self.encodings.values()), ()))

# Wrap the tokenized splits so the Trainer can index them.
train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset = NewsDataset(val_encodings, val_labels)

# Pretrained BERT encoder with a two-class classification head, moved to `device`.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)

# Training configuration: 3 epochs, batch size 8 for train and eval, evaluation
# after every epoch, no checkpoints written.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",  # this argument was previously spelled `evaluation_strategy`
    save_strategy="no",
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=False
)

# Collator that pads each batch with the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# NOTE(review): recent transformers releases appear to deprecate Trainer's `tokenizer=`
# argument in favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Fine-tune the model.
trainer.train()

# Validation accuracy: arg-max over the two logits gives the predicted class.
preds_output = trainer.predict(val_dataset)
preds = preds_output.predictions.argmax(axis=1)
print("Validation Accuracy:", accuracy_score(val_labels, preds))

# Load the test set and discard rows without text.
test_df = pd.read_csv('/content/drive/My Drive/test.csv')
test_df.dropna(subset=['text'], inplace=True)

# Tokenize the test texts with the same settings used for training.
test_encodings = tokenizer(test_df['text'].tolist(), truncation=True, padding=True, max_length=512)
test_dataset = NewsDataset(test_encodings, [0]*len(test_df))  # dummy labels

# Predict a class for every test row.
test_preds = trainer.predict(test_dataset).predictions.argmax(axis=1)
test_df['predicted_label'] = test_preds
# In this dataset label 1 marks an unreliable (fake) article and label 0 a reliable
# one, so 0 must map to 'Real' and 1 to 'Fake'; the reverse mapping mislabels every
# prediction in the exported file.
test_df['prediction_meaning'] = test_df['predicted_label'].map({0: 'Real', 1: 'Fake'})

# Save the predictions and print a short preview.
test_df.to_csv('test_with_predictions.csv', index=False)
print(test_df[['text', 'predicted_label', 'prediction_meaning']].head())

Using device: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmuhammadalyanbukhari[0m ([33mmuhammadalyanbukhari-beaconhouse-national-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import DataCollatorWithPadding, Trainer
from torch.utils.data import Dataset

# Prefer CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Restore the saved tokenizer and classifier from the Drive folder.
model_path = "/content/drive/My Drive/saved_model"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model = model.to(device)

# Read the test CSV and discard rows without text.
test_df = pd.read_csv('/content/drive/My Drive/test.csv')
test_df.dropna(subset=['text'], inplace=True)

# Encode the test texts: truncation on, padding on, at most 512 tokens per example.
test_texts = test_df['text'].tolist()
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

# Dataset class (inference only: carries no labels)
class NewsDataset(Dataset):
    """Label-free map-style dataset over tokenizer encodings."""

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One entry per example in the 'input_ids' feature.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Convert the idx-th entry of every feature into a tensor.
        return {name: torch.tensor(column[idx]) for name, column in self.encodings.items()}

# Wrap the encodings and build a collator that pads each batch via the tokenizer.
test_dataset = NewsDataset(test_encodings)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# A Trainer with default arguments is used here purely as a batched prediction loop.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Predict: arg-max over the logits gives the predicted class per row.
preds = trainer.predict(test_dataset).predictions.argmax(axis=1)
test_df['predicted_label'] = preds
# In this dataset label 1 marks an unreliable (fake) article and label 0 a reliable
# one, so 0 must map to 'Real' and 1 to 'Fake'; the reverse mapping mislabels every
# prediction in the exported file.
test_df['prediction_meaning'] = test_df['predicted_label'].map({0: 'Real', 1: 'Fake'})

# Save the predictions and print a short preview.
test_df.to_csv("test_with_predictions_from_loaded_model.csv", index=False)
print(test_df[['text', 'predicted_label', 'prediction_meaning']].head())


In [None]:
# Drive folder that receives the model and tokenizer files.
# NOTE(review): an earlier cell loads from this same path, so on a fresh runtime this
# cell has to be executed (after training) before that one — confirm the intended order.
save_path = "/content/drive/My Drive/saved_model"

# Both objects expose save_pretrained(); write the model first, then the tokenizer.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"Model and tokenizer saved to: {save_path}")

print(f"Model and tokenizer saved to: {save_path}")
