In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

import collections
tqdm.pandas()

In [3]:
test_df = pd.read_csv('data/news_sample_6.csv')

In [4]:
df = pd.read_csv('data/995,000_row_cleaned.csv')

  df = pd.read_csv('data/995,000_row_cleaned.csv')


In [5]:
# Remove rows with ivalid values
label_map = {"fake": 1, "satire":1, "conspiracy": 1, "unreliable": 1, "bias": 1, "rumor": 1, "junksci": 1, "hate": 1,
             "reliable": 0,  "clickbait": 0,   "political": 0}
df = df[df["type"].isin(label_map.keys())]  # Keep only rows with valid labels
print(df["type"].value_counts())
df["type"] = df["type"].map(label_map)
print(df["type"].value_counts())



type
reliable      218564
political     194518
bias          133232
fake          104883
conspiracy     97314
rumor          56445
unreliable     35332
clickbait      27412
junksci        14040
satire         13160
hate            8779
Name: count, dtype: int64
type
1    463185
0    440494
Name: count, dtype: int64


In [6]:
## Balancing the dataset
# Separate the two classes
type_0 = df[df['type'] == 0]
type_1 = df[df['type'] == 1]

# Check class sizes
count_0 = len(type_0)
count_1 = len(type_1)

# Identify minority and majority classes
if count_0 < count_1:
    # Type 0 is minority
    min_count = count_0
    majority_class = type_1
    minority_class = type_0
    print("Type 0 is minority")
else:
    # Type 1 is minority
    min_count = count_1
    majority_class = type_0
    minority_class = type_1
    print("Type 1 is minority")

# Undersample majority class
majority_undersampled = majority_class.sample(n=min_count, random_state=42)

# Combine the undersampled majority with minority
balanced_df = pd.concat([majority_undersampled, minority_class])

# Verify the new distribution
print("\nBalanced distribution:")
print(balanced_df['type'].value_counts())

# Save the balanced dataset
balanced_df.to_csv('data/995,000_row_balanced.csv', index=False)
balanced_df.sort_values(by='scraped_at', inplace=True)


Type 0 is minority

Balanced distribution:
type
1    440494
0    440494
Name: count, dtype: int64


In [7]:
# Label all to either 1 or 0
labels = balanced_df["type"]

# Using Bag of Words
texts = balanced_df["content"]

vectorizer = CountVectorizer(max_features=10000)  # Limit to top 10,000 words

X_train, X_temp, y_train, y_temp = train_test_split(
    texts, labels, 
    test_size=0.2,  
    shuffle=False)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, 
    test_size=0.5, 
    shuffle=False)

X_train = vectorizer.fit_transform(X_train)  
X_val = vectorizer.transform(X_val)          
X_test = vectorizer.transform(X_test)  

model = LogisticRegression(max_iter=1000,verbose=1)
model = model.fit(X_train, y_train)
y_pred = model.predict(X_val)

mse = ((y_val-y_pred)**2).mean()
acc = accuracy_score(y_val, y_pred)

print("LogisticRegression MSE: ", mse)
print("LogisticRegression accuracy: ", acc)
print("LogisticRegression F1 score: ", f1_score(y_val, y_pred))


LogisticRegression MSE:  0.48943801859271957
LogisticRegression accuracy:  0.5105619814072805
LogisticRegression F1 score:  0.001620783069763134


In [8]:
model = LogisticRegression(max_iter=1000,verbose=1)
model = model.fit(X_train, y_train)
y_pred = model.predict(X_val)

mse = ((y_val-y_pred)**2).mean()
acc = accuracy_score(y_val, y_pred)

print("LogisticRegression MSE: ", mse)
print("LogisticRegression accuracy: ", acc)
print("LogisticRegression F1 score: ", f1_score(y_val, y_pred))

LogisticRegression MSE:  0.48943801859271957
LogisticRegression accuracy:  0.5105619814072805
LogisticRegression F1 score:  0.001620783069763134


In [9]:
counter = collections.Counter(y_train)
print(counter)
counter = collections.Counter(y_val)
print(counter)
counter = collections.Counter(y_test)
print(counter)
counter = collections.Counter(labels)
print(counter)

Counter({1: 440456, 0: 264334})
Counter({0: 88061, 1: 38})
Counter({0: 88099})
Counter({0: 440494, 1: 440494})


In [10]:
## Test unblananaced data
# Label all to either 1 or 0
labels = df["type"]

# Using Bag of Words
texts = df["content"]

vectorizer = CountVectorizer(max_features=10000)  # Limit to top 10,000 words

X = vectorizer.fit_transform(texts)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, labels, 
    test_size=0.2, 
    shuffle=False)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, 
    test_size=0.5, 
    shuffle=False)

model = LogisticRegression(max_iter=1000,verbose=1)
model = model.fit(X_train, y_train)
y_pred = model.predict(X_val)

mse = ((y_val-y_pred)**2).mean()
acc = accuracy_score(y_val, y_pred)

print("LogisticRegression MSE: ", mse)
print("LogisticRegression accuracy: ", acc)
print("LogisticRegression F1 score: ", f1_score(y_val, y_pred))


LogisticRegression MSE:  0.13151779390934845
LogisticRegression accuracy:  0.8684822060906515
LogisticRegression F1 score:  0.8797539432814981


# Test if only reliable is reliabel and unreliable data

In [11]:
df = pd.read_csv('data/995,000_row_cleaned.csv')

# Remove rows with ivalid values
label_map = {"fake": 1, "satire":1, "conspiracy": 1, "unreliable": 1, "bias": 1, "rumor": 1, "junksci": 1, "hate": 1,  "clickbait": 1,   "political": 1,
             "reliable": 0}
df = df[df["type"].isin(label_map.keys())]  # Keep only rows with valid labels
print(df["type"].unique())
df["type"] = df["type"].map(label_map)
print(df["type"].unique())
print(df["type"].value_counts())


  df = pd.read_csv('data/995,000_row_cleaned.csv')


['political' 'fake' 'satire' 'reliable' 'conspiracy' 'unreliable' 'bias'
 'rumor' 'clickbait' 'hate' 'junksci']
[1 0]
type
1    685115
0    218564
Name: count, dtype: int64


In [12]:
## Balancing the dataset
# Separate the two classes
type_0 = df[df['type'] == 0]
type_1 = df[df['type'] == 1]

# Check class sizes
count_0 = len(type_0)
count_1 = len(type_1)

# Identify minority and majority classes
if count_0 < count_1:
    # Type 0 is minority
    min_count = count_0
    majority_class = type_1
    minority_class = type_0
    print("Type 0 is minority")
else:
    # Type 1 is minority
    min_count = count_1
    majority_class = type_0
    minority_class = type_1
    print("Type 1 is minority")

# Undersample majority class
majority_undersampled = majority_class.sample(n=min_count, random_state=42)

# Combine the undersampled majority with minority
balanced_df = pd.concat([majority_undersampled, minority_class])


# Verify the new distribution
print("\nBalanced distribution:")
print(balanced_df['type'].value_counts())

# Save the balanced dataset
balanced_df.to_csv('data/995,000_row_balanced.csv', index=False)


Type 0 is minority

Balanced distribution:
type
1    218564
0    218564
Name: count, dtype: int64


In [13]:
## Test unblananaced data
# Label all to either 1 or 0
labels = df["type"]

# Using Bag of Words
texts = df["content"]

vectorizer = CountVectorizer(max_features=10000)  # Limit to top 10,000 words
X = vectorizer.fit_transform(texts)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, labels, 
    test_size=0.2, 
    shuffle=False)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, 
    test_size=0.5, 
    shuffle=False)

model = LogisticRegression(max_iter=1000,verbose=1)
model = model.fit(X_train, y_train)
y_pred = model.predict(X_val)
mse = ((y_val-y_pred)**2).mean()
acc = accuracy_score(y_val, y_pred)
print("LogisticRegression MSE: ", mse)
print("LogisticRegression accuracy: ", acc)
print("LogisticRegression F1 score: ", f1_score(y_val, y_pred))


LogisticRegression MSE:  0.057465031869688384
LogisticRegression accuracy:  0.9425349681303116
LogisticRegression F1 score:  0.962696377389393


In [14]:
## Test balanced data 

# Label all to either 1 or 0
labels = balanced_df["type"]

# Using Bag of Words
texts = balanced_df["content"]

vectorizer = CountVectorizer(max_features=10000)  # Limit to top 10,000 words
X = vectorizer.fit_transform(texts)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, labels, 
    test_size=0.2, 
    shuffle=False)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, 
    test_size=0.5, 
    shuffle=False)
model = LogisticRegression(max_iter=1000,verbose=1)
model = model.fit(X_train, y_train)
y_pred = model.predict(X_val)
mse = ((y_val-y_pred)**2).mean()
acc = accuracy_score(y_val, y_pred)
print("LogisticRegression MSE: ", mse)
print("LogisticRegression accuracy: ", acc)
print("LogisticRegression F1 score: ", f1_score(y_val, y_pred))


LogisticRegression MSE:  0.1273534188914053
LogisticRegression accuracy:  0.8726465811085947
LogisticRegression F1 score:  0.0


# Reliable and politcal

In [15]:
df = pd.read_csv('data/995,000_row_cleaned.csv')

# Remove rows with ivalid values
label_map = {"fake": 1, "satire":1, "conspiracy": 1, "unreliable": 1, "bias": 1, "rumor": 1, "junksci": 1, "hate": 1,  "clickbait": 1,
             "reliable": 0,   "political": 0}
df = df[df["type"].isin(label_map.keys())]  # Keep only rows with valid labels
print(df["type"].unique())
df["type"] = df["type"].map(label_map)
print(df["type"].unique())
print(df["type"].value_counts())


  df = pd.read_csv('data/995,000_row_cleaned.csv')


['political' 'fake' 'satire' 'reliable' 'conspiracy' 'unreliable' 'bias'
 'rumor' 'clickbait' 'hate' 'junksci']
[0 1]
type
1    490597
0    413082
Name: count, dtype: int64


In [16]:
## Balancing the dataset
# Separate the two classes
type_0 = df[df['type'] == 0]
type_1 = df[df['type'] == 1]

# Check class sizes
count_0 = len(type_0)
count_1 = len(type_1)

# Identify minority and majority classes
if count_0 < count_1:
    # Type 0 is minority
    min_count = count_0
    majority_class = type_1
    minority_class = type_0
    print("Type 0 is minority")
else:
    # Type 1 is minority
    min_count = count_1
    majority_class = type_0
    minority_class = type_1
    print("Type 1 is minority")

# Undersample majority class
majority_undersampled = majority_class.sample(n=min_count, random_state=42)

# Combine the undersampled majority with minority
balanced_df = pd.concat([majority_undersampled, minority_class])



# Verify the new distribution
print("\nBalanced distribution:")
print(balanced_df['type'].value_counts())

# Save the balanced dataset
balanced_df.to_csv('data/995,000_row_balanced.csv', index=False)


Type 0 is minority

Balanced distribution:
type
1    413082
0    413082
Name: count, dtype: int64


In [17]:
## Test unblananaced data
# Label all to either 1 or 0
labels = df["type"]

# Using Bag of Words
texts = df["content"]

vectorizer = CountVectorizer(max_features=10000)  # Limit to top 10,000 words
X = vectorizer.fit_transform(texts)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, labels, 
    test_size=0.2, 
    shuffle=False)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, 
    test_size=0.5, 
    shuffle=False)

model = LogisticRegression(max_iter=1000,verbose=1)
model = model.fit(X_train, y_train)
y_pred = model.predict(X_val)
mse = ((y_val-y_pred)**2).mean()
acc = accuracy_score(y_test, y_pred)
print("LogisticRegression MSE: ", mse)
print("LogisticRegression accuracy: ", acc)
print("LogisticRegression F1 score: ", f1_score(y_test, y_pred))


LogisticRegression MSE:  0.12999070467422097
LogisticRegression accuracy:  0.5181037535410765
LogisticRegression F1 score:  0.5914285178165988


In [18]:
## Test balanced data 

# Label all to either 1 or 0
labels = balanced_df["type"]

# Using Bag of Words
texts = balanced_df["content"]

vectorizer = CountVectorizer(max_features=10000)  # Limit to top 10,000 words
X = vectorizer.fit_transform(texts)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, labels, 
    test_size=0.2, 
    shuffle=False)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, 
    test_size=0.5, 
    shuffle=False)

model = LogisticRegression(max_iter=1000,verbose=1)
model = model.fit(X_train, y_train)
y_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print("LogisticRegression accuracy: ", acc)
print("LogisticRegression F1 score: ", f1_score(y_val, y_pred))


LogisticRegression accuracy:  0.7442989251476712
LogisticRegression F1 score:  0.0


# Remove unrelaible

In [19]:
df = pd.read_csv('data/995,000_row_cleaned.csv')

# Remove rows with ivalid values
label_map = {"fake": 1, "satire":1, "conspiracy": 1, "bias": 1, "rumor": 1, "junksci": 1, "hate": 1,  "clickbait": 1,
             "reliable": 0,   "political": 0}
df = df[df["type"].isin(label_map.keys())]  # Keep only rows with valid labels
print(df["type"].unique())
df["type"] = df["type"].map(label_map)
print(df["type"].unique())
print(df["type"].value_counts())

  df = pd.read_csv('data/995,000_row_cleaned.csv')


['political' 'fake' 'satire' 'reliable' 'conspiracy' 'bias' 'rumor'
 'clickbait' 'hate' 'junksci']
[0 1]
type
1    455265
0    413082
Name: count, dtype: int64


In [20]:
## Balancing the dataset
# Separate the two classes
type_0 = df[df['type'] == 0]
type_1 = df[df['type'] == 1]

# Check class sizes
count_0 = len(type_0)
count_1 = len(type_1)

# Identify minority and majority classes
if count_0 < count_1:
    # Type 0 is minority
    min_count = count_0
    majority_class = type_1
    minority_class = type_0
    print("Type 0 is minority")
else:
    # Type 1 is minority
    min_count = count_1
    majority_class = type_0
    minority_class = type_1
    print("Type 1 is minority")

# Undersample majority class
majority_undersampled = majority_class.sample(n=min_count, random_state=42)

# Combine the undersampled majority with minority
balanced_df = pd.concat([majority_undersampled, minority_class])


# Verify the new distribution
print("\nBalanced distribution:")
print(balanced_df['type'].value_counts())


Type 0 is minority

Balanced distribution:
type
1    413082
0    413082
Name: count, dtype: int64


In [21]:
## Test unblananaced data
# Label all to either 1 or 0
labels = df["type"]

# Using Bag of Words
texts = df["content"]

vectorizer = CountVectorizer(max_features=10000)  # Limit to top 10,000 words
X = vectorizer.fit_transform(texts)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, labels, 
    test_size=0.2, 
    shuffle=False)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, 
    test_size=0.5, 
    shuffle=False)

model = LogisticRegression(max_iter=1000,verbose=1)
model = model.fit(X_train, y_train)
y_pred = model.predict(X_val)
mse = ((y_val-y_pred)**2).mean()
acc = accuracy_score(y_test, y_pred)
print("LogisticRegression MSE: ", mse)
print("LogisticRegression accuracy: ", acc)
print("LogisticRegression F1 score: ", f1_score(y_test, y_pred))

LogisticRegression MSE:  0.14084182645246732
LogisticRegression accuracy:  0.5013301088270858
LogisticRegression F1 score:  0.5381317063805278


In [22]:
## Test balanced data 

# Label all to either 1 or 0
labels = balanced_df["type"]

# Using Bag of Words
texts = balanced_df["content"]

vectorizer = CountVectorizer(max_features=10000)  # Limit to top 10,000 words
X = vectorizer.fit_transform(texts)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, labels, 
    test_size=0.2, 
    shuffle=False)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, 
    test_size=0.5, 
    shuffle=False)
model = LogisticRegression(max_iter=1000,verbose=1)
model = model.fit(X_train, y_train)
y_pred = model.predict(X_val)
mse = ((y_val-y_pred)**2).mean()
acc = accuracy_score(y_val, y_pred)
print("LogisticRegression MSE: ", mse)
print("LogisticRegression accuracy: ", acc)
print("LogisticRegression F1 score: ", f1_score(y_val, y_pred))


LogisticRegression MSE:  0.26055485620218843
LogisticRegression accuracy:  0.7394451437978116
LogisticRegression F1 score:  0.0


# Only fake reliable

In [23]:
df = pd.read_csv('data/995,000_row_cleaned.csv')

# Remove rows with ivalid values
label_map = {"fake": 1,
             "reliable": 0}
df = df[df["type"].isin(label_map.keys())]  # Keep only rows with valid labels
print(df["type"].unique())
df["type"] = df["type"].map(label_map)
print(df["type"].unique())
print(df["type"].value_counts())

  df = pd.read_csv('data/995,000_row_cleaned.csv')


['fake' 'reliable']
[1 0]
type
0    218564
1    104883
Name: count, dtype: int64


In [24]:
## Balancing the dataset
# Separate the two classes
type_0 = df[df['type'] == 0]
type_1 = df[df['type'] == 1]

# Check class sizes
count_0 = len(type_0)
count_1 = len(type_1)

# Identify minority and majority classes
if count_0 < count_1:
    # Type 0 is minority
    min_count = count_0
    majority_class = type_1
    minority_class = type_0
    print("Type 0 is minority")
else:
    # Type 1 is minority
    min_count = count_1
    majority_class = type_0
    minority_class = type_1
    print("Type 1 is minority")

# Undersample majority class
majority_undersampled = majority_class.sample(n=min_count, random_state=42)

# Combine the undersampled majority with minority
balanced_df = pd.concat([majority_undersampled, minority_class])


# Verify the new distribution
print("\nBalanced distribution:")
print(balanced_df['type'].value_counts())


Type 1 is minority

Balanced distribution:
type
0    104883
1    104883
Name: count, dtype: int64


In [25]:
## Test unblananaced data
# Label all to either 1 or 0
labels = df["type"]

# Using Bag of Words
texts = df["content"]

vectorizer = CountVectorizer(max_features=10000)  # Limit to top 10,000 words
X = vectorizer.fit_transform(texts)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, labels, 
    test_size=0.2, 
    shuffle=False)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, 
    test_size=0.5, 
    shuffle=False)

model = LogisticRegression(max_iter=1000,verbose=1)
model = model.fit(X_train, y_train)
y_pred = model.predict(X_val)
mse = ((y_val-y_pred)**2).mean()
acc = accuracy_score(y_test, y_pred)
print("LogisticRegression MSE: ", mse)
print("LogisticRegression accuracy: ", acc)
print("LogisticRegression F1 score: ", f1_score(y_test, y_pred))

LogisticRegression MSE:  0.05527902303292626
LogisticRegression accuracy:  0.5839233266347195
LogisticRegression F1 score:  0.2974524953017331


In [26]:
## Test balanced data 

# Label all to either 1 or 0
labels = balanced_df["type"]

# Using Bag of Words
texts = balanced_df["content"]

vectorizer = CountVectorizer(max_features=10000)  # Limit to top 10,000 words
X = vectorizer.fit_transform(texts)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, labels, 
    test_size=0.2, 
    shuffle=False)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, 
    test_size=0.5, 
    shuffle=False)
model = LogisticRegression(max_iter=1000,verbose=1)
model = model.fit(X_train, y_train)
y_pred = model.predict(X_val)
mse = ((y_val-y_pred)**2).mean()
acc = accuracy_score(y_val, y_pred)
print("LogisticRegression MSE: ", mse)
print("LogisticRegression accuracy: ", acc)
print("LogisticRegression F1 score: ", f1_score(y_val, y_pred))


LogisticRegression MSE:  0.08976498069314011
LogisticRegression accuracy:  0.9102350193068599
LogisticRegression F1 score:  0.953008410072122
