In [None]:
import tensorflow as tf
import tensorflow_hub as hub

In [None]:
# Load the pre-trained Universal Sentence Encoder (v4) from TensorFlow Hub.
# The loaded object is called below with a list of strings to obtain embeddings.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

Sentence-embedding example: the pre-trained Universal Sentence Encoder (`embed`) is loaded above. The cell below shows how a sentence is converted to a single number by averaging the components of its embedding vector. The encoder takes context, word importance, and semantics into account when producing the embedding.

In [None]:
import numpy as np
import pandas as pd
# Example input: a single sentence (a plain string, not a list) to be embedded
sentences = "this is a sentence"

def calculate_avg_embedding(sentence, encoder=None):
  """Reduce the embedding of a single sentence to one scalar.

  The sentence is wrapped in a one-element batch, passed to the encoder, and
  the components of the resulting embedding are averaged.

  Args:
    sentence: The text to embed.
    encoder: Optional callable that takes a list of strings and returns an
      array-like of embeddings (assumed to be one row per input string).
      Defaults to the module-level `embed` model.

  Returns:
    The mean of the embedding components as a Python float.
  """
  if encoder is None:
    encoder = embed  # fall back to the model loaded earlier in the notebook
  # np.asarray accepts both framework tensors and plain nested lists.
  vectors = np.asarray(encoder([sentence]))
  # Average over every component and convert the resulting 0-d value.
  # Calling float() on the 1-element array produced by np.mean(..., axis=1)
  # is deprecated since NumPy 1.25.
  return float(vectors.mean())


# Sanity check: print the scalar computed for the sample sentence
print(calculate_avg_embedding(sentences))


-0.002602767664939165


<h1> <strong> Dataset Preparation </strong> </h1>

ID - Unique Article Identifier <br>
TITLE - Article Title <br>
AUTHOR - Author of Article <br>
TEXT - Content <br>
LABEL - 0 (Real) 1 (Fake)

In [None]:
#Random Forest Built on Financial dataset
import numpy as np
import pandas as pd

# Prepare the news dataset: load the fake and real articles, merge them,
# drop incomplete rows, and shuffle.
# NOTE(review): the comments in this cell call this a "financial" dataset, but
# the files are named "*general*" - confirm which corpus is intended.

# on_bad_lines="warn" replaces error_bad_lines=False, which was deprecated in
# pandas 1.3 and removed in pandas 2.0. Malformed rows are still skipped, and a
# warning is emitted for each one so the loss is visible.
fake_news = pd.read_csv("/content/fake_General_2.csv", on_bad_lines="warn")
fake_news = fake_news.drop(columns=['Unnamed: 0'])
real_news = pd.read_csv("/content/true_general.csv", on_bad_lines="warn")
real_news = real_news.drop(columns=['Unnamed: 0'])

# Combine both sources into one frame with a fresh 0..n-1 index
combined_news = pd.concat([fake_news, real_news], ignore_index=True)

print(combined_news.shape)
# Drop rows with missing values in any column
combined_news = combined_news.dropna()
print(combined_news.shape)  # recorded run: 20800 -> 18285 rows (2515 dropped)

# Shuffle with a fixed seed so the row order is reproducible
combined_news = combined_news.sample(frac=1, random_state=42).reset_index(drop=True)

combined_news.head()



  fake_news = pd.read_csv("/content/fake_General_2.csv", error_bad_lines=False )


  real_news = pd.read_csv("/content/true_general.csv", error_bad_lines=False)


(20800, 5)
(18285, 5)


Unnamed: 0,id,title,author,text,label
0,4945,Senate Committee to Question Jared Kushner Ove...,"Jo Becker, Matthew Rosenberg and Maggie Haberman",Senate investigators plan to question Jared Ku...,0
1,16178,Filmmaker Maria Ivanova: People in Damascus ‘h...,"RIA Novosti, Pavel Gaikov","movies , syria , RBTH Daily , refugees Russian...",1
2,10389,Mexico’s Potential Weapons if Trump Declares W...,Eduardo Porter,How could Mexico inflict the most damage on th...,0
3,9444,’No Compromise’: Islamic State Supporters Cele...,Allum Bokhari,Islamic State supporters are celebrating the r...,0
4,13284,"A Debate Stage in Virginia, With Racial ‘Scar ...",Jonathan Martin,"FARMVILLE, Va. — When Tim Kaine joyously pr...",0


<h1> <strong> Data Cleaning </strong> </h1>

In [None]:
# NLP preprocessing setup: download NLTK's stop-word corpus and import the
# `stopwords` reader used by the cleaning step below.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:

# Normalise the three free-text columns: lowercase first, then remove English
# stop words.
stop_words = set(stopwords.words('english'))


def _strip_stop_words(raw):
    """Return `raw` without the whitespace-separated tokens found in `stop_words`."""
    kept = [token for token in raw.split() if token not in stop_words]
    return ' '.join(kept)


# Pass 1: lowercase every text column
for column in ('text', 'author', 'title'):
    combined_news[column] = combined_news[column].str.lower()

# Pass 2: remove stop words (runs after lowercasing so tokens match the
# entries in `stop_words`)
for column in ('text', 'author', 'title'):
    combined_news[column] = combined_news[column].apply(_strip_stop_words)

#Perform spell check, punctuation check, terminology count, etc here


In [None]:
# Encode each text column as a number via its sentence embedding.
# The string columns are overwritten in place with the scalar returned by
# calculate_avg_embedding, so after this cell they hold floats, not text.
for text_column in ('title', 'author', 'text'):
    combined_news[text_column] = combined_news[text_column].apply(calculate_avg_embedding)

print(combined_news)


          id     title    author      text  label
0       4945 -0.000793  0.003280 -0.000761      0
1      16178  0.000029  0.000720  0.001664      1
2      10389 -0.000444 -0.001762 -0.000246      0
3       9444  0.000523 -0.000195  0.001652      0
4      13284  0.000276 -0.000396 -0.000028      0
...      ...       ...       ...       ...    ...
18280   6768 -0.000179 -0.002732 -0.000207      0
18281   8060  0.002061  0.000041  0.000307      0
18282  14171 -0.002083  0.000301  0.001032      1
18283   2269  0.000114 -0.001671  0.000409      1
18284  15778  0.000915 -0.000382  0.001978      0

[18285 rows x 5 columns]


<h1> <strong> Dataset Splitting </strong> </h1>

70% Training data, 30% Testing Data<br>


In [None]:
from sklearn.model_selection import train_test_split

# Features are the three embedded text columns. 'id' is deliberately left out:
# it is an article identifier and there doesn't seem to be any correlation with
# the label. (The previous version still listed 'id' here, contradicting both
# its own comment and the recorded 3-column output.)
feature_cols = combined_news[['title', 'author', 'text']]
# Target is kept as a single-column DataFrame, so y_train / y_test have shape (n, 1)
target_col = combined_news[['label']]

# 70% train / 30% test, with a fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(feature_cols, target_col, test_size=0.3, random_state=42)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

X_train.head()

Training set shape: (12799, 3) (12799, 1)
Testing set shape: (5486, 3) (5486, 1)


Unnamed: 0,title,author,text
3344,0.000899,0.000214,-0.000243
4301,-0.000262,0.001792,0.002134
8138,0.003519,-0.001475,0.002014
3533,-0.000241,0.000337,-0.001874
2449,-0.00192,-0.002231,-0.00075


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Hand-rolled ensemble: one decision tree per pair of feature columns, combined
# by majority vote, then compared against scikit-learn's RandomForestClassifier.
feature_combinations = [('title', 'author'), ('author', 'text'), ('title', 'text')] #only using 3 decision trees

# scikit-learn estimators expect a 1-D target. y_train / y_test are
# single-column frames, so flatten them once; this avoids the
# DataConversionWarning that rf.fit raised on the column vector.
y_train_flat = np.ravel(y_train)
y_test_flat = np.ravel(y_test)

# Train one decision tree per feature pair (seeded for reproducible results)
decision_trees = {}
for features in feature_combinations:
    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(X_train[list(features)], y_train_flat)
    decision_trees[features] = tree

# Predictions of each individual tree on the test set
predictions = {}
for features, tree in decision_trees.items():
    predictions[features] = tree.predict(X_test[list(features)])

# Combine the per-tree predictions row by row using a majority vote
final_predictions = [
    max(set(votes), key=votes.count)
    for votes in zip(*(predictions[features] for features in feature_combinations))
]

# Reference model: a standard random forest. It is trained from scratch on the
# same feature columns the trees use (it does NOT reuse the trees above).
# Selecting the columns explicitly keeps any extra column in X_train (e.g. an
# identifier) out of the model.
rf_features = list(dict.fromkeys(name for combo in feature_combinations for name in combo))
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train[rf_features], y_train_flat)

# Make predictions using the Random Forest
rf_predictions = rf.predict(X_test[rf_features])

# Evaluate the models
print("Individual Decision Trees:")
for features, tree in decision_trees.items():
    accuracy = accuracy_score(y_test_flat, predictions[features])
    print(f"{features} - Accuracy: {accuracy:.2f}")

# The majority-vote result was previously computed but never reported
vote_accuracy = accuracy_score(y_test_flat, final_predictions)
print(f"Majority Vote - Accuracy: {vote_accuracy:.2f}")

print("\nRandom Forest:")
rf_accuracy = accuracy_score(y_test_flat, rf_predictions)
print(f"Random Forest - Accuracy: {rf_accuracy:.2f}")




  rf.fit(X_train, y_train)


Individual Decision Trees:
('title', 'author') - Accuracy: 0.79
('author', 'text') - Accuracy: 0.83
('title', 'text') - Accuracy: 0.57

Random Forest:
Random Forest - Accuracy: 0.69


In [None]:
#Need to build more decision trees to improve accuracy, as only 3 decision trees are being built here.