In [1]:
import pandas as pd
import numpy as np

from cleantext import clean
import re
import nltk 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import matplotlib.pyplot as plt 
import seaborn as sns

import tensorflow as tf
from keras.preprocessing.text import text_to_word_sequence

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.naive_bayes import ComplementNB, MultinomialNB


from wordcloud import WordCloud

import joblib 




2023-08-18 19:54:36.966800: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Reading file

In [2]:
# Load a 50,000-row window of the corpus: the header (row 0) is kept, data rows
# 1..399,999 are skipped, and only the two columns used in this notebook are read.
df = pd.read_csv(
    "news_cleaned_2018_02_13.csv",
    usecols=["type", "content"],
    skiprows=lambda row: 1 <= row < 400000,
    nrows=50000,
)

In [3]:
def clean_file(df:pd.DataFrame) -> (pd.DataFrame, str):
    """
        Cleans the text in the "content" column of a dataframe.

        For every string entry, timestamps and dates are replaced with "<DAT>" and the
        result is passed through cleantext's `clean` with the flags used below
        (numbers are replaced with "<NUM>"). Entries that are not strings (for example
        NaN for a missing article) are left untouched.

        The dataframe is modified in place and is also returned. Values are written
        back by row position, so the function works for any index (filtered frames,
        non-default or duplicated labels), not only for a 0..n-1 RangeIndex.

        Returns the dataframe with cleaned content, and a single string holding all the
        cleaned contents concatenated in row order without a separator.
    """
    # ISO-8601 timestamps such as 2018-02-13T19:54:36.966Z or 2018-02-13T19:54:36+01:00.
    # Replaced first so the whole timestamp becomes one <DAT> token.
    iso_timestamp_reg = re.compile(
        r"(?<!\d)\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d*)?(?:[+-]\d{2}:\d{2}|Z)?"
    )
    # Plain YYYY-M-D dates with an optional time of day (e.g. "2018-02-13 19:54:36.9668").
    # Digits are mandatory, so a bare "--" in running text is no longer treated as a date,
    # and the match stops at the end of the date/time instead of running to the next comma.
    date_reg = re.compile(
        r"(?<!\d)\d{4}-\d{1,2}-\d{1,2}(?:[ T]\d{1,2}:\d{2}(?::\d{2}(?:\.\d+)?)?)?"
    )

    new_content = []    # one entry per row, in row order
    cleaned_texts = []  # only the cleaned strings, for the concatenated return value
    for text in df["content"]:
        if not isinstance(text, str):
            # Keep missing / non-text values exactly as they are.
            new_content.append(text)
            continue

        date_subbed = iso_timestamp_reg.sub("<DAT>", text)
        date_subbed = date_reg.sub("<DAT>", date_subbed)
        cleaned_news_file = clean(date_subbed, no_line_breaks=True, # Cleaning the rest of the text.
                    no_urls=True,
                    no_emails=True,
                    no_numbers=True,
                    no_punct=False,
                    replace_with_number = "<NUM>")
        new_content.append(cleaned_news_file)
        cleaned_texts.append(cleaned_news_file)

    # Positional assignment of the whole column: independent of the index labels.
    df["content"] = new_content

    # join() builds the big string in one pass instead of repeated concatenation.
    return df, "".join(cleaned_texts)

### Cleaning the file

In [4]:
# Clean the article texts; news_file receives all cleaned contents concatenated into one string.
df, news_file  = clean_file(df)

### Dropping the articles with types of "unknown" or NaN

In [5]:
## Dropping the "unknown" and nan types: only rows whose type is listed below are kept.

a = ['rumor', 'hate', 'unreliable', 'conspiracy', 'clickbait', 'satire',
       'fake', 'reliable', 'bias', 'political', 'junksci']

# Later cells modify df, so take an explicit copy rather than working on a
# filtered slice of the original frame (avoids SettingWithCopy ambiguity).
df = df[df["type"].isin(a)].copy()

### Dropping duplicate rows

In [6]:
## Dropping duplicate rows: articles with identical content are removed, keeping the first occurrence.

# Reassign instead of inplace=True so the cell does not mutate a filtered slice.
df = df.drop_duplicates("content")

In [7]:
# Summary of the remaining rows, columns and dtypes after filtering and de-duplication.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26737 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   type     26737 non-null  object
 1   content  26737 non-null  object
dtypes: object(2)
memory usage: 626.6+ KB


In [8]:
def tokenize(text):
    """
    Takes a string of text and tokenizes it.

    The text is split into tokens with Keras' text_to_word_sequence (default
    settings) and only purely alphabetic tokens (str.isalpha) are kept.

    Anything that is not a string (e.g. NaN/None for an article without content)
    yields an empty list instead of raising, so the function can be applied to a
    column that still contains missing values.

    Returns a list of the tokenized text.
    """
    if not isinstance(text, str):
        return []
    return [word for word in text_to_word_sequence(text) if word.isalpha()]
    


In [9]:
def removing_stopwords(token_list:list) -> list:
    """
    Filters the English stopwords (NLTK's 'english' list) out of a list of words.

    Returns a new list with the remaining words in their original order.
    """
    english_stopwords = frozenset(stopwords.words('english'))
    return list(filter(lambda token: token not in english_stopwords, token_list))

In [10]:
def stemming(token_list:list) -> list:
    """
    Reduces every word of a list to its stem with NLTK's PorterStemmer.

    Returns a new list holding the stemmed words, in the same order as the input.
    """
    stem = PorterStemmer().stem
    return list(map(stem, token_list))
    

In [11]:
def unique(list1):
    """
    Removes repeated values from a list, keeping the first occurrence of each.

    The values go through a pandas Series, so the result is what Series.tolist()
    gives back for the surviving entries.

    Returns a list with no duplicates, in order of first appearance.
    """
    values = pd.Series(list1)
    first_occurrences = values[~values.duplicated()]
    return first_occurrences.tolist()

### Inserting labels true or false for the article dependent on their types

In [12]:
## Making the labels for the articles.
# True  -> the article type is one of "political", "reliable", "clickbait"
# False -> any other type that is still in the dataframe
df_labels = df["type"].isin(["political", "reliable" , "clickbait"])

In [13]:
## Adding the labels to the original Dataframe as a new last column named "label".

# assign() returns a new frame, so re-running this cell simply overwrites the column
# instead of failing the way DataFrame.insert does when "label" already exists.
df = df.assign(label=df_labels)

### Removing stopwords and stemming of the content in each article.

In [None]:
## Tokenizing, removing stopwords and stemming each document, then joining the
## remaining stems back into one space-separated string per article.

df["content"] = (
    df["content"]
    .apply(tokenize)
    .apply(removing_stopwords)
    .apply(stemming)
    .apply(" ".join)
)

# Part 2 Feature engineering


#### Splitting the data into training, validation and test sets. 

First we want to figure out which labels we want to give to our articles. Using our dataset and the README documentation of the FakeNewsCorp Dataset we see that we have 11 (13 with nan and unknown) different types of articles.

We want to transform this into labels of either fake or reliable.
Fake and satire types are obviously fake, and reliable and political are as reliable as it gets with news. 

Labelling the rest is a bit more subjective. We have chosen to label clickbait as reliable, as the content itself is probably fine; it is the headlines that are the questionable part. 

Conspiracy, bias and junksci fall into the same category: they are not necessarily fake, but they sit at the extreme end of opinion, to the point where most of the content probably is fake. 
It is worth noting, though, that a lot of modern science was also considered "fake" before it became established science (germ theory, for example), and simply dismissing everything that isn't established "truth" can also be dangerous. With that said, we chose to label these three types as fake. 

Unreliable, as the name suggests, is unreliable. According to the description in the README, the content might be true but needs more verification. Here it would be nice to have more classes than just reliable and fake, so that articles that are very much in the middle had a class to go to. 
For simplicity, we gave unreliable the label fake. 

The hate type is very difficult to place, because again it is not necessarily fake news, but at the same time it smells of extremist propaganda, which tends to be more fake than true, or at least distorted in such a way that the "meat" isn't wrong per se but the conclusion is false. 
In the end, classifying such news as fake is probably better from an ethical point of view, so that we don't give legitimacy to racism, hate and other forms of discrimination. 

We are left with rumor, nan and unknown. 
Rumor is not described in the README, but since it is a rumor it should probably be treated with caution until it is confirmed as either true or false. So we will label this as fake. 

We have removed nan and unknown from the dataset, since it doesn't really make sense to label them. We could also have labeled them fake, and since they make up a very small percentage of the dataset the difference would probably not be that big. 



### Splitting the dataset

In [None]:
# First split: 10% of all rows are held out as the validation set.
X_train_temp, X_val, y_train_temp, y_val = train_test_split(
    df, df_labels, test_size=0.1, random_state=0
)
# Second split: 1/9 of the remaining 90% (i.e. another 10% of the total) becomes
# the test set, which leaves 80% of the data for training.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_temp, y_train_temp, test_size=1/9, random_state=0
)

### Making the bag of words matrix. We keep every word that appears in at least 5 documents (`min_df=5`).

In [None]:
# Bag-of-words counts; min_df=5 drops words that occur in fewer than 5 documents.
bow_vectorizer = CountVectorizer(min_df=5)

# The vocabulary is learned from the training split only. Fitting on the whole
# dataframe would let the validation/test articles influence the features
# (data leakage) and make the reported scores too optimistic.
X_train_bow = bow_vectorizer.fit_transform(X_train["content"])
X_val_bow = bow_vectorizer.transform(X_val["content"])
X_test_bow = bow_vectorizer.transform(X_test["content"])



### Making a TF-IDF Unigram feature.

In [None]:
# Unigram TF-IDF features restricted to 1000 terms (max_features=1000).
unigram_vectorizer = TfidfVectorizer(max_features=1000)

# Vocabulary and IDF weights are estimated on the training split only, so that no
# information from the validation/test articles leaks into the features.
X_train_tfidf = unigram_vectorizer.fit_transform(X_train["content"])
X_val_tfidf = unigram_vectorizer.transform(X_val["content"])
X_test_tfidf = unigram_vectorizer.transform(X_test["content"])

# Making a simple baseline model

We start by making a logistic regression model using BoW as features, and another model using unigram of tf_idf as our features.
Logistic regression is good for binary classification problems.

### Bag of Words logistic regression model


In [None]:
# Logistic regression on the bag-of-words features; max_iter=300 caps the solver iterations,
# random_state=0 fixes the seed.
bow_lreg = LogisticRegression(random_state=0, max_iter=300)
bow_lreg.fit(X_train_bow, y_train)

### Tuning the hyper parameters with the validation set

In [None]:
# Class probabilities of the BoW model on the validation set.
lreg_bow_pred = bow_lreg.predict_proba(X_val_bow)
# Second probability column (label True here) thresholded at 0.5: True if >= 0.5, else False.
lreg_bow_pred_int = lreg_bow_pred[:, 1] >= 0.5

print(
    f"Accuracy score: {accuracy_score(y_val, lreg_bow_pred_int)} "
    f"\n f1_score:{f1_score(y_val, lreg_bow_pred_int)}"
)


### Unigram TF-IDF logistic regression 

In [None]:
# Logistic regression on the unigram TF-IDF features (random_state=0 fixes the seed).
tf_lreg = LogisticRegression(random_state=0)
tf_lreg.fit(X_train_tfidf, y_train)

### Using the validation set to tune hyper parameters and feature parameters

In [None]:
# Class probabilities of the TF-IDF model on the validation set, thresholded at 0.5.
unigram_pred = tf_lreg.predict_proba(X_val_tfidf)
unigram_pred_int = unigram_pred[:, 1] >= 0.5

print(
    f"Accuracy score: {accuracy_score(y_val, unigram_pred_int)} "
    f"\n f1_score:{f1_score(y_val, unigram_pred_int)}"
)

# Part 3 Making a more advanced model 

### We start by making more advanced features and use the TF-IDF bigram

In [None]:
# TF-IDF over unigrams and bigrams (ngram_range=(1,2)).
tfidf_vect_ngram = TfidfVectorizer(ngram_range=(1,2))

# Fit on the training split only; fitting on the full dataframe would leak
# vocabulary and IDF statistics from the validation/test articles into the features.
X_train_tfidf_ngram = tfidf_vect_ngram.fit_transform(X_train["content"])
X_val_tfidf_ngram = tfidf_vect_ngram.transform(X_val["content"])
X_test_tfidf_ngram = tfidf_vect_ngram.transform(X_test["content"])

### Making naive bayes models.

We now try to make naive bayes models with our different features

In [None]:
# Multinomial naive Bayes trained on the unigram+bigram TF-IDF features.
tfidf_bigram_naive_MN = MultinomialNB()
tfidf_bigram_naive_MN.fit(X_train_tfidf_ngram, y_train)

In [None]:
# Validation-set probabilities of the multinomial naive Bayes model, thresholded at 0.5.
naive_mn_pred = tfidf_bigram_naive_MN.predict_proba(X_val_tfidf_ngram)
naive_mn_pred_int = naive_mn_pred[:, 1] >= 0.5

print(
    f"Accuracy score: {accuracy_score(y_val, naive_mn_pred_int)} "
    f"\n f1_score:{f1_score(y_val, naive_mn_pred_int)}"
)

In [None]:
# Complement naive Bayes trained on the same unigram+bigram TF-IDF features.
tfidf_bigram_naive_C = ComplementNB()
tfidf_bigram_naive_C.fit(X_train_tfidf_ngram, y_train)

In [None]:
# Validation-set probabilities of the complement naive Bayes model, thresholded at 0.5.
naive_c_pred = tfidf_bigram_naive_C.predict_proba(X_val_tfidf_ngram)
naive_c_pred_int = naive_c_pred[:, 1] >= 0.5

print(
    f"Accuracy score: {accuracy_score(y_val, naive_c_pred_int)} "
    f"\n f1_score:{f1_score(y_val, naive_c_pred_int)}"
)

## Part 4 Evaluating performance on the test set, and trying our models on the Liar dataset

#### First we evaluate our models on our test set. 

In [None]:
# Each model is scored on the held-out test set. The probability of the second
# class (label True here) is turned into a boolean prediction with a 0.5 threshold.

# Logistic regression on bag-of-words.
lreg_bow_pred = bow_lreg.predict_proba(X_test_bow)
lreg_bow_pred_int = lreg_bow_pred[:, 1] >= 0.5
print(
    f"BoW Log \n Accuracy score: {accuracy_score(y_test, lreg_bow_pred_int)} "
    f"\n f1_score:{f1_score(y_test, lreg_bow_pred_int)}"
)

# Logistic regression on unigram TF-IDF.
unigram_pred = tf_lreg.predict_proba(X_test_tfidf)
unigram_pred_int = unigram_pred[:, 1] >= 0.5
print(
    f"Unigram Log \n Accuracy score: {accuracy_score(y_test, unigram_pred_int)} "
    f"\n f1_score:{f1_score(y_test, unigram_pred_int)}"
)

# Multinomial naive Bayes on unigram+bigram TF-IDF.
naive_mn_pred = tfidf_bigram_naive_MN.predict_proba(X_test_tfidf_ngram)
naive_mn_pred_int = naive_mn_pred[:, 1] >= 0.5
print(
    f"Naive Bayes Multinomial \n Accuracy score: {accuracy_score(y_test, naive_mn_pred_int)} "
    f"\n f1_score:{f1_score(y_test, naive_mn_pred_int)}"
)

# Complement naive Bayes on unigram+bigram TF-IDF.
naive_c_pred = tfidf_bigram_naive_C.predict_proba(X_test_tfidf_ngram)
naive_c_pred_int = naive_c_pred[:, 1] >= 0.5
print(
    f"Naive Bayes Complement \n Accuracy score: {accuracy_score(y_test, naive_c_pred_int)} "
    f"\n f1_score:{f1_score(y_test, naive_c_pred_int)}"
)





### Then we make the confusion matrix for the different predictions. 

In [None]:
predictions = {"Bow":lreg_bow_pred_int,"Unigram":unigram_pred_int, "Naive_mn":naive_mn_pred_int, "Niave_c":naive_c_pred_int}

# One confusion-matrix panel per model, laid out side by side in a single row.
fig, axes = plt.subplots(1, 4, figsize=(15, 4))
for ax, (name, pred) in zip(axes, predictions.items()):
    cm = confusion_matrix(y_test, pred)

    image = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.set_title(f"{name}")
    fig.colorbar(image, ax=ax)
    ax.set_xticks([0, 1])
    ax.set_xticklabels(["Pred N", "Pred P"])
    ax.set_yticks([0, 1])
    ax.set_yticklabels(["N", "P"])
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("true Label")

fig.tight_layout()
plt.show()

### Now we try our models on the Liar Data set
First we read and clean the Liar data set 

We have decided to use the train split of the Liar data set because it is the largest one, and since we don't have to train a new model we simply want as much data as possible to test on. 

In [None]:
# The file is read without a header row, so the column names are supplied explicitly.
liar_dataset = pd.read_table(
    "liar_dataset/train.tsv",
    names=[
        "ID",
        "label",
        "content",
        "subjects",
        "speaker",
        "speaker job title",
        "state info",
        "party affli",
        "barely true counts",
        "false counts",
        "half true counts",
        "mostly true counts",
        "pants on fire counts",
        "location",
    ],
)

In [None]:
# Same cleaning as for the news corpus; liar_news_file is the concatenation of all cleaned statements.
ld, liar_news_file = clean_file(liar_dataset)

In [None]:
# Same preprocessing as for the training corpus: tokenize, drop stopwords, stem,
# then join the stems back into one space-separated string per statement.
ld["content"] = (
    ld["content"]
    .apply(tokenize)
    .apply(removing_stopwords)
    .apply(stemming)
    .apply(" ".join)
)

### Then we label the classes. 
Here we have chosen to represent half-true, mostly-true and true as the true articles and the rest as false articles. 

In [None]:
# Liar labels that are mapped to True; every other label is mapped to False.
true_list = ["half-true", "mostly-true", "true"]

In [None]:
# Distribution of the original Liar labels before they are turned into booleans.
ld["label"].value_counts()

In [None]:
## Transforming the labels into booleans: True if the label is in true_list, else False.
# NOTE: this overwrites the column, so the cell is not idempotent - running it a second
# time compares booleans against the strings in true_list and turns every label False.
ld["label"] = ld["label"].isin(true_list)

In [None]:
# Ground-truth boolean labels for the Liar evaluation.
y_ld = ld["label"]

### We then make the different feature sets for the different models. 

In [None]:
# Reuse the already fitted vectorizers (transform only, no re-fitting) so the Liar
# features live in the same feature space the models were trained on.
ld_bow = bow_vectorizer.transform(ld["content"])
ld_unigram = unigram_vectorizer.transform(ld["content"])
ld_ngram = tfidf_vect_ngram.transform(ld["content"])

### Now we measure our performance of our models on the Liar dataset. 

In [None]:
# Each model is scored on the Liar data. The probability of the second class
# (label True here) is turned into a boolean prediction with a 0.5 threshold.

# Logistic regression on bag-of-words.
ld_bow_pred = bow_lreg.predict_proba(ld_bow)
ld_bow_pred_int = ld_bow_pred[:, 1] >= 0.5
print(
    f"BoW Log \n Accuracy score: {accuracy_score(y_ld, ld_bow_pred_int)} "
    f"\n f1_score:{f1_score(y_ld, ld_bow_pred_int)}"
)

# Logistic regression on unigram TF-IDF.
ld_unigram_pred = tf_lreg.predict_proba(ld_unigram)
ld_unigram_pred_int = ld_unigram_pred[:, 1] >= 0.5
print(
    f"Unigram Log \n Accuracy score: {accuracy_score(y_ld, ld_unigram_pred_int)} "
    f"\n f1_score:{f1_score(y_ld, ld_unigram_pred_int)}"
)

# Multinomial naive Bayes on unigram+bigram TF-IDF.
ld_naive_mn_pred = tfidf_bigram_naive_MN.predict_proba(ld_ngram)
ld_naive_mn_pred_int = ld_naive_mn_pred[:, 1] >= 0.5
print(
    f"Naive Bayes Multinomial \n Accuracy score: {accuracy_score(y_ld, ld_naive_mn_pred_int)} "
    f"\n f1_score:{f1_score(y_ld, ld_naive_mn_pred_int)}"
)

# Complement naive Bayes on unigram+bigram TF-IDF.
ld_naive_c_pred = tfidf_bigram_naive_C.predict_proba(ld_ngram)
ld_naive_c_pred_int = ld_naive_c_pred[:, 1] >= 0.5
print(
    f"Naive Bayes Complement \n Accuracy score: {accuracy_score(y_ld, ld_naive_c_pred_int)} "
    f"\n f1_score:{f1_score(y_ld, ld_naive_c_pred_int)}"
)

### Then we make the confusion matrix for the ld dataset

In [None]:
ld_predictions = {"Bow":ld_bow_pred_int,"Unigram":ld_unigram_pred_int, "Naive_mn":ld_naive_mn_pred_int, "Niave_c":ld_naive_c_pred_int}

# One confusion-matrix panel per model for the Liar data, side by side in a single row.
fig, axes = plt.subplots(1, 4, figsize=(15, 4))
for ax, (name, pred) in zip(axes, ld_predictions.items()):
    cm = confusion_matrix(y_ld, pred)

    image = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.set_title(f"{name}")
    fig.colorbar(image, ax=ax)
    ax.set_xticks([0, 1])
    ax.set_xticklabels(["Pred N", "Pred P"])
    ax.set_yticks([0, 1])
    ax.set_yticklabels(["N", "P"])
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("true Label")

fig.tight_layout()
plt.show()

We can see that our models perform terribly on the Liar data set: they mostly just predict that every statement is true. 

### Trying out the model on completely different data from the data set.

### We start by preprocessing

In [None]:
# A second, non-overlapping window of the same corpus: the header (row 0) is kept,
# data rows 1..499,999 are skipped and the next 10,000 rows are read.
df1 = pd.read_csv(
    "news_cleaned_2018_02_13.csv",
    usecols=["type", "content"],
    skiprows=lambda row: 1 <= row < 500000,
    nrows=10000,
)

In [None]:
# Clean the article texts; the concatenated string is not needed here and is discarded.
df1, _ = clean_file(df1)

In [None]:
# Keep only rows whose type is one of the listed types (drops "unknown" and NaN).
a = ['rumor', 'hate', 'unreliable', 'conspiracy', 'clickbait', 'satire',
       'fake', 'reliable', 'bias', 'political', 'junksci']

# Later cells modify df1, so take an explicit copy rather than working on a
# filtered slice of the original frame (avoids SettingWithCopy ambiguity).
df1 = df1[df1["type"].isin(a)].copy()

In [None]:
# Remove articles with identical content, keeping the first occurrence.
# Reassign instead of inplace=True so the cell does not mutate a filtered slice.
df1 = df1.drop_duplicates("content")

In [None]:
# True for "political", "reliable" and "clickbait" articles, False for every other type.
df1_labels = df1["type"].isin(["political", "reliable" , "clickbait"])

In [None]:
# Add the labels as a new last column named "label". assign() returns a new frame, so
# re-running this cell overwrites the column instead of failing the way
# DataFrame.insert does when "label" already exists.
df1 = df1.assign(label=df1_labels)

In [None]:
# Same preprocessing as for the training corpus: tokenize, drop stopwords, stem,
# then join the stems back into one space-separated string per article.
df1["content"] = (
    df1["content"]
    .apply(tokenize)
    .apply(removing_stopwords)
    .apply(stemming)
    .apply(" ".join)
)

In [None]:
# Ground-truth boolean labels for the new articles.
new_test = df1["label"]

## Then we make the feature sets 

In [None]:
# Reuse the already fitted vectorizers (transform only, no re-fitting) so the new
# articles are mapped into the same feature space the models were trained on.
df1_bow = bow_vectorizer.transform(df1["content"])
df1_unigram = unigram_vectorizer.transform(df1["content"])
df1_ngram = tfidf_vect_ngram.transform(df1["content"])

## We test the performance

In [None]:
# Each model is scored on the new articles. The probability of the second class
# (label True here) is turned into a boolean prediction with a 0.5 threshold.

# Logistic regression on bag-of-words.
new_lreg_bow_pred = bow_lreg.predict_proba(df1_bow)
new_lreg_bow_pred_int = new_lreg_bow_pred[:, 1] >= 0.5
print(
    f"BoW Log \n Accuracy score: {accuracy_score(new_test, new_lreg_bow_pred_int)} "
    f"\n f1_score:{f1_score(new_test, new_lreg_bow_pred_int)}"
)

# Logistic regression on unigram TF-IDF.
new_unigram_pred = tf_lreg.predict_proba(df1_unigram)
new_unigram_pred_int = new_unigram_pred[:, 1] >= 0.5
print(
    f"Unigram Log \n Accuracy score: {accuracy_score(new_test, new_unigram_pred_int)} "
    f"\n f1_score:{f1_score(new_test, new_unigram_pred_int)}"
)

# Multinomial naive Bayes on unigram+bigram TF-IDF.
new_naive_mn_pred = tfidf_bigram_naive_MN.predict_proba(df1_ngram)
new_naive_mn_pred_int = new_naive_mn_pred[:, 1] >= 0.5
print(
    f"Naive Bayes Multinomial \n Accuracy score: {accuracy_score(new_test, new_naive_mn_pred_int)} "
    f"\n f1_score:{f1_score(new_test, new_naive_mn_pred_int)}"
)

# Complement naive Bayes on unigram+bigram TF-IDF.
new_naive_c_pred = tfidf_bigram_naive_C.predict_proba(df1_ngram)
new_naive_c_pred_int = new_naive_c_pred[:, 1] >= 0.5
print(
    f"Naive Bayes Complement \n Accuracy score: {accuracy_score(new_test, new_naive_c_pred_int)} "
    f"\n f1_score:{f1_score(new_test, new_naive_c_pred_int)}"
)





## We make the confusion matrix

In [None]:
new_predictions = {"Bow":new_lreg_bow_pred_int,"Unigram":new_unigram_pred_int, "Naive_mn":new_naive_mn_pred_int, "Niave_c":new_naive_c_pred_int}

# One confusion-matrix panel per model for the new articles, side by side in a single row.
fig, axes = plt.subplots(1, 4, figsize=(15, 4))
for ax, (name, pred) in zip(axes, new_predictions.items()):
    cm = confusion_matrix(new_test, pred)

    image = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.set_title(f"{name}")
    fig.colorbar(image, ax=ax)
    ax.set_xticks([0, 1])
    ax.set_xticklabels(["Pred N", "Pred P"])
    ax.set_yticks([0, 1])
    ax.set_yticklabels(["N", "P"])
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("true Label")

fig.tight_layout()
plt.show()