In [49]:
import pandas as pd
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt
from google.colab import drive

import re  # Data Preprocessing

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

!pip install --upgrade gensim
from gensim.parsing.preprocessing import remove_stopwords
import gensim.downloader

pre_ft_vectors = gensim.downloader.load("glove-wiki-gigaword-100")

# Import the appropriate vectorizers (CountVect., and TF-IDF Vect.)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer as cvect
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer as tfvect
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold as kfold

# Import spacy for the lemmatisation process
import spacy
nlp = spacy.load('en_core_web_sm')

from wordcloud import WordCloud  # Data Visualization

import warnings
warnings.filterwarnings('ignore')  # Ignore warnings


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.




In [51]:
true_df=pd.read_csv('/content/sample_data/Fake.csv')
fake_df=pd.read_csv('/content/sample_data/True.csv')


In [53]:
p = 0.2  # Keep 20% of the data.
true_df = true_df.head(int(p * true_df.shape[0]))
fake_df = fake_df.head(int(p * fake_df.shape[0]))

In [54]:

def clean_string(sent):
    sent = sent.lower()
    sent = re.sub('\n|\r|\t', '', sent)   # Remove whitespace chars
    sent = re.sub(r'[^\w\s]+', '', sent)  # Remove punctuation
    return sent

def preprocess(df):
    df.dropna(subset = ['title', 'text'], inplace = True)  # Remove rows with missing values in either title or text
    vfunc = np.vectorize(clean_string)    # Speed up string clean-up using vectorization
    df['title'] = vfunc(df['title'])
    df['text'] = vfunc(df['text'])
    return df


In [55]:

true_df = preprocess(true_df)
fake_df = preprocess(fake_df)

In [67]:
def lemmatize(string):
  lem = [token.lemma_ for token in nlp(string, disable=["parser", "ner"]) if token.lemma_ != '-PRON-']
  string = " ".join(lem)
  return string

# Lemmatise all of the titles and descriptions
def lemmatize_title_text(df):
    df_lem = df.copy()
    vfunc = np.vectorize(lemmatize)    # Speed up string clean-up using vectorization
    df_lem['text']  = vfunc(df_lem['text'])
    df_lem['title'] = vfunc(df_lem['title'])
    return df_lem

# Extract descriptions and titles and merge them into individual strings
def extract_title_text(df):
    texts = " ".join(df['text'].to_list()).strip()
    titles = " ".join(df['title'].to_list()).strip()
    return (titles, texts)

def lemmatize_and_extract(df):
    lem_df = lemmatize_title_text(df)
    titles_texts = extract_title_text(lem_df)
    return lem_df, titles_texts

In [68]:
true_df_lem, true_tt = lemmatize_and_extract(true_df)
fake_df_lem, fake_tt = lemmatize_and_extract(fake_df)

In [69]:
def merge_title_text(df):
    content = zip( df['title'].to_list(), df['text'].to_list() )
    df['content'] = [ str(title + text) for title, text in content ]

# Create "content" field, add "label" field and drop every other pre-existing field.
def prepare_df(df, label):
    df['label'] = [ label for i in range(df.shape[0]) ]  # Add label
    merge_title_text(df)
    df.drop(columns = ['title', 'text', 'subject', 'date'], inplace=True)

In [70]:
prepare_df(true_df_lem, 1)
prepare_df(fake_df_lem, 0)


In [73]:
combined = pd.concat([true_df_lem, fake_df_lem])
train_df, test_df = train_test_split(combined, train_size=0.5, random_state=420)

train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

# Store dataframes in .csv format.
test_df.to_csv (path_or_buf='test.csv',  columns=test_df.columns)
train_df.to_csv(path_or_buf='train.csv', columns=train_df.columns)

In [71]:
def get_representation(vectorizer, train_df, test_df):
    # Collect labels
    y_train = train_df['label']
    y_test = test_df['label']
    # Vectorize training and testing set.
    # We use fit_transform() on the training set, in order to learn
    # the parameters of scaling on the training set and in the same
    # time we scale the train data. We only use transform() on
    # the testing set, because we use the scaling paramaters learned
    # on the train data to scale the test data.
    x_train = vectorizer.fit_transform(train_df['content'])
    x_test = vectorizer.transform(test_df['content'])
    return x_train, y_train, x_test, y_test

In [74]:
cv_rep = get_representation(CountVectorizer(), train_df, test_df)
tf_rep = get_representation(TfidfVectorizer(), train_df, test_df)

In [75]:
def __classify(method, x_train, y_train, x_test, y_test, wmean = False):
    # Make a pipeline consisting of a scaler and the classification method
    pipe = make_pipeline(StandardScaler(with_mean=wmean), method)
    # Apply scaling on training data
    pipe.fit(x_train, y_train)
    # Apply scaling on testing data, without leaking training data.
    pipe.score(x_test, y_test)

    y_pred = pipe.predict(x_test)
    # Print the report
    print(classification_report(y_test, y_pred))
    accuracy = accuracy_score(y_test, y_pred);
    print('Accuracy is {:.2f} %'.format(accuracy*100))

# Applies a classification method to the given data and prints
# the classification report and the accuracy of the classifier.
# Receives either a CountVectorizer() representation or
# a TfidfVectorizer() representation.
def classify(method, cv_rep, tf_rep, tfid = False, toarr = False):
    # Extract the sets
    x_train, y_train, x_test, y_test = tf_rep if tfid else cv_rep
    # Convert to array if needed
    if toarr: x_train, x_test = x_train.toarray(), x_test.toarray()

    __classify(method, x_train, y_train, x_test, y_test, False)

In [76]:
def sim_logistic_regression(cv_rep, tf_rep, tfid = False):
    classify(LogisticRegression(), cv_rep, tf_rep, tfid, False)

In [77]:
sim_logistic_regression(cv_rep, tf_rep, False)

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      2073
           1       0.99      0.96      0.97      2417

    accuracy                           0.97      4490
   macro avg       0.97      0.97      0.97      4490
weighted avg       0.97      0.97      0.97      4490

Accuracy is 97.31 %


In [78]:
sim_logistic_regression(cv_rep, tf_rep, True)

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      2073
           1       1.00      0.97      0.98      2417

    accuracy                           0.98      4490
   macro avg       0.98      0.98      0.98      4490
weighted avg       0.98      0.98      0.98      4490

Accuracy is 98.35 %
