In [1]:
!pip install ydata_profiling



In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# IMPORTS
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, precision_score, accuracy_score, classification_report, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pandas as pd
#import ydata_profiling as ydp
from scipy.stats import randint
import numpy as np

In [5]:
# Due to time reasons I will work only with about 11650 data points (from total size 12005, 11650 - duplicates removed) from the raw news_dataset.csv
# (I saved the remaining 318 data point in a separate csv file for later cleaning, but I didn't manage to clean them yet)

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/news_data_half_corrected.csv')
df.head()

Unnamed: 0,title,text,subject,date,is_fake
0,""" WATCH: Hypocrite Mike Pence Calls Democratic...",This is unbelievably outrageous.Republicans ar...,News,2017-02-04,1
1,"""Ammon and Ryan Bundy Found ‘Not Guilty’ in Or...","21st Century Wire Yesterday, Judge Anna Brown ...",US_News,2016-10-29,1
2,"""WATCH: HILARIOUS Video Proves CNN Doesn’t Eve...",Watch these hilarious examples of CNN having r...,left-news,2017-04-03,1
3,"""TRUMP CHIEF OF STAFF Goes At It With Liberal ...",,"politics,",2017-01-29,1
4,"""PRICELESS! What Nancy Pelosi Just Said About ...",Nancy Pelosi is obviously geographically chall...,politics,2017-11-10,1


In [6]:
# Report
news_report = ydp.ProfileReport(df)
news_report

# 1426 articles have 'politics' in subject and 3123 articles have 'politicsNews' in subject column, that is 4549 articles all together

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:05<00:21,  5.25s/it][A
100%|██████████| 5/5 [00:11<00:00,  2.32s/it]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



Scikit-learn
Pandas
NumPy
Gensim
NLTK
SpaCy
ydata_profiling

In [6]:
# creating the new column where 'title' and 'text' are combined

df['combined'] = df['title'] + df['text']

In [7]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import download

# Ensure you have the necessary resources downloaded from NLTK
download('punkt')
download('stopwords')
download('wordnet')
download('punkt_tab') # Download the missing resource

# Initialize the Lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Step 1: Normalize case
    text = text.lower()

    # Step 2: Remove numbers and special characters (keeping only alphabetic characters and spaces)
    text = re.sub(r'[^a-z\s]', '', text)  # This removes everything except lowercase letters and spaces

    # Step 3: Tokenize and remove stopwords
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]

    # Step 4: Lemmatization (Convert to base form)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    # Join the lemmatized words back into a single string
    cleaned_text = ' '.join(lemmatized_words)

    return cleaned_text

# Apply preprocessing to the 'combined' column of the DataFrame
df['combined_normalized'] = df['combined'].apply(preprocess_text)

# Display the DataFrame with the processed text
print(df[['combined', 'combined_normalized']])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


                                                combined  \
0      " WATCH: Hypocrite Mike Pence Calls Democratic...   
1      "Ammon and Ryan Bundy Found ‘Not Guilty’ in Or...   
2      "WATCH: HILARIOUS Video Proves CNN Doesn’t Eve...   
3      "TRUMP CHIEF OF STAFF Goes At It With Liberal ...   
4      "PRICELESS! What Nancy Pelosi Just Said About ...   
...                                                  ...   
11651  "[VIDEO] DEAF Team USA Athlete SEXUALLY ASSAUL...   
11652  "100% FED UP! WITH HILLARY 2016? WE’VE GOT THE...   
11653  "BUILD THE WALL! How Terrorists Have Been Comi...   
11654  "Russian Twitter accounts promoted Brexit ahea...   
11655  "Trumped by candidate's rhetoric, Republican l...   

                                     combined_normalized  
0      watch hypocrite mike penny call democratic obs...  
1      ammon ryan bundy found guilty oregon federal c...  
2      watch hilarious video prof cnn doesnt even bot...  
3      trump chief staff go liberal hack ch

In [12]:
# saving the normalized data and the adjusted DF
df.to_json("/content/drive/MyDrive/Colab Notebooks/news_dataset_preprocessed.json", orient="records", lines=True)


In [1]:
!pip install gensim



In [8]:
import gensim
from gensim.models import Word2Vec
from sklearn.model_selection import ParameterGrid

# Word2Vec model building section, optimization of hyperparameters:

# Tokenized sentences
sentences = df['combined_normalized'].tolist()

# Define function to train Word2Vec
def train_word2vec(sentences, vector_size, window, epochs):
    model = Word2Vec(
        sentences=sentences,
        vector_size=vector_size,
        window=window,
        min_count=1,
        workers=4,
        sg=1,  # Use skip-gram (sg=1) or CBOW (sg=0) as desired
        epochs=epochs
    )
    return model

# Grid of hyperparameters
param_grid = {
    'vector_size': [100, 200],
    'window': [5, 10],
    'epochs': [10, 20]
}

# Mimic GridSearchCV
best_model = None
best_score = float('-inf')
best_params = {}

for params in ParameterGrid(param_grid):
    print(f"Training with params: {params}")

    model = train_word2vec(sentences, **params)

    # Evaluation: you can define your own — here we use vocabulary size
    # Replace with domain-specific metric if available
    score = len(model.wv)

    print(f"Score (vocab size): {score}")

    if score > best_score:
        best_score = score
        best_model = model
        best_params = params

# Final output
print("\nBest Parameters:")
print(best_params)
print(f"Best Score: {best_score}")

# Save model
best_model.save("best_word2vec_model.model")




Training with params: {'epochs': 10, 'vector_size': 100, 'window': 5}




Score (vocab size): 27
Training with params: {'epochs': 10, 'vector_size': 100, 'window': 10}




Score (vocab size): 27
Training with params: {'epochs': 10, 'vector_size': 200, 'window': 5}




Score (vocab size): 27
Training with params: {'epochs': 10, 'vector_size': 200, 'window': 10}




Score (vocab size): 27
Training with params: {'epochs': 20, 'vector_size': 100, 'window': 5}




Score (vocab size): 27
Training with params: {'epochs': 20, 'vector_size': 100, 'window': 10}




Score (vocab size): 27
Training with params: {'epochs': 20, 'vector_size': 200, 'window': 5}




Score (vocab size): 27
Training with params: {'epochs': 20, 'vector_size': 200, 'window': 10}
Score (vocab size): 27

Best Parameters:
{'epochs': 10, 'vector_size': 100, 'window': 5}
Best Score: 27


In [9]:
import numpy as np
# computing the average vector for the articles

# Function to compute the average vector for a list of tokens
def average_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)  # fallback for empty/no match

# Apply it to your DataFrame
df['avg_vector'] = df['combined_normalized'].apply(lambda tokens: average_vector(tokens, best_model))


In [10]:
from sklearn.linear_model import LogisticRegression

# 1. Prepare features (X) and labels (y)
# Assumes 'avg_vector' column contains np.ndarray and 'is_fake' is your target
X = np.stack(df['avg_vector'].values)
y = df['is_fake'].values  # 0 for real, 1 for fake (or however your labels are defined)

# 2. Split into training and testing sets (e.g., 80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Initialize and train Logistic Regression
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# 4. Predict on test set
y_pred = clf.predict(X_test)

# 5. Print classification report
print("📊 Classification Report:")
print(classification_report(y_test, y_pred))

📊 Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.91      0.78      1187
           1       0.86      0.56      0.68      1145

    accuracy                           0.74      2332
   macro avg       0.77      0.73      0.73      2332
weighted avg       0.77      0.74      0.73      2332



In [11]:
df.to_json("/content/drive/MyDrive/Colab Notebooks/news_dataset_vectors.json", orient="records", lines=True)

HYPERPARAMETER OPTIMIZATION OF CLASSIFIER MODELS - LOGISTIC REGRESION, RANDOM FOREST and SUPPORT VECTOR MACHINE

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# 1. Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg_param_grid = {
    'C': [0.01, 0.1, 1, 10],      # Regularization strength
    'solver': ['liblinear', 'lbfgs']
}

log_reg_grid = GridSearchCV(log_reg, log_reg_param_grid, cv=5, scoring='precision')
log_reg_grid.fit(X_train, y_train)

best_log_reg = log_reg_grid.best_estimator_
log_reg_preds = best_log_reg.predict(X_test)

print("📘 Logistic Regression Report:")
print(classification_report(y_test, log_reg_preds))


# 2. Random Forest
rf = RandomForestClassifier(random_state=42)
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

rf_grid = GridSearchCV(rf, rf_param_grid, cv=5, scoring='precision')
rf_grid.fit(X_train, y_train)

best_rf = rf_grid.best_estimator_
rf_preds = best_rf.predict(X_test)

print("\n🌲 Random Forest Report:")
print(classification_report(y_test, rf_preds))

# 3. Support Vector Machine (SVM)

svm = SVC()
svm_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

svm_grid = GridSearchCV(svm, svm_param_grid, cv=5, scoring='precision')
svm_grid.fit(X_train, y_train)

best_svm = svm_grid.best_estimator_
svm_preds = best_svm.predict(X_test)

print("\n🧠 SVM Report:")
print(classification_report(y_test, svm_preds))

# 4. Compare Precision Scores

precision_log_reg = precision_score(y_test, log_reg_preds)
precision_rf = precision_score(y_test, rf_preds)
precision_svm = precision_score(y_test, svm_preds)

print("\n🔍 Precision Comparison:")
print(f"Logistic Regression Precision: {precision_log_reg:.4f}")
print(f"Random Forest Precision:       {precision_rf:.4f}")
print(f"SVM Precision:                 {precision_svm:.4f}")

best_model_name = max(
    [('Logistic Regression', precision_log_reg),
     ('Random Forest', precision_rf),
     ('SVM', precision_svm)],
    key=lambda x: x[1]
)

print(f"\n🏆 Best Model: {best_model_name[0]} with Precision = {best_model_name[1]:.4f}")


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


📘 Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.51      1.00      0.68      1187
           1       1.00      0.02      0.03      1145

    accuracy                           0.52      2332
   macro avg       0.76      0.51      0.36      2332
weighted avg       0.75      0.52      0.36      2332


🌲 Random Forest Report:
              precision    recall  f1-score   support

           0       0.80      0.83      0.82      1187
           1       0.82      0.78      0.80      1145

    accuracy                           0.81      2332
   macro avg       0.81      0.81      0.81      2332
weighted avg       0.81      0.81      0.81      2332



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize


🧠 SVM Report:
              precision    recall  f1-score   support

           0       0.52      1.00      0.68      1187
           1       1.00      0.03      0.05      1145

    accuracy                           0.52      2332
   macro avg       0.76      0.51      0.36      2332
weighted avg       0.75      0.52      0.37      2332


🔍 Precision Comparison:
Logistic Regression Precision: 1.0000
Random Forest Precision:       0.8193
SVM Precision:                 1.0000

🏆 Best Model: Logistic Regression with Precision = 1.0000


In [None]:
# looks like the SVM model performs as well as Logistic regression model (less computationaly demanding)