In [2]:
# all the necessary imports
import re
import pandas as pd
import nltk
# nltk.download('punkt') ----> uncomment this line if the NLTK 'punkt' tokenizer data has not been downloaded yet
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
# nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn import svm

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix




In [None]:
# Load the raw corpus from CSV; dtype=str makes pandas read every column as a string.
data = pd.read_csv('995,000_rows.csv',dtype = str)

In [None]:

def cleanText(text):
    """Normalise one article body for tokenisation.

    Steps, in order:
      1. coerce non-string input with ``str()`` and lower-case it;
      2. collapse runs of whitespace into a single space;
      3. replace URLs and e-mail addresses with placeholders;
      4. replace dates, then any remaining digit runs, with placeholders;
      5. strip punctuation and collapse the whitespace that stripping leaves behind.

    URLs and e-mails are handled *before* numbers: once digits have been
    rewritten to ``<NUM>`` an address such as ``john123@example.com`` no longer
    matches the e-mail pattern.

    Note that the punctuation step also removes the angle brackets of the
    placeholders, so they end up in the output as the bare upper-case tokens
    ``DATE``, ``NUM``, ``URL`` and ``EMAIL``.

    Parameters
    ----------
    text : Any
        The raw text; non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned text.
    """
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()

    # should not contain multiple spaces, tabs or newlines
    text = re.sub(r'\s+', ' ', text)

    # replace urls with <URL>
    text = re.sub(r'(http|https)://[^\s]*', '<URL>', text)

    # replace emails with <EMAIL> (must run before digits are rewritten)
    text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b', '<EMAIL>', text)

    #  january 18, 2018. jan 18, 2018. 2018-01-18
    date_pattern = r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+\d{1,2}(?:,\s+|\s+)\d{4}\b|\b\d{4}-\d{2}-\d{2}\b'
    text = re.sub(date_pattern, '<DATE>', text)

    # nov. 5
    date_pattern2 = r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\.\s+\d{1,2}\b'
    text = re.sub(date_pattern2, '<DATE>', text)

    # replace the remaining numbers with <NUM>
    text = re.sub(r'\d+', '<NUM>', text)

    # remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # removing a free-standing punctuation mark ("a - b") leaves two adjacent
    # spaces, so collapse whitespace once more
    text = re.sub(r'\s+', ' ', text)

    return text

In [None]:
# initial exploratory data analysis

# number of articles per domain
domains = data['domain'].value_counts()
# print(domains)

# number of rows whose 'domain' is missing
missing_domain = data['domain'].isnull().sum()
# print('Missing domain:', missing_domain)

# number of articles of type 'reliable' per domain
reliable_domains = data[data['type'] == 'reliable']['domain'].value_counts()
# print(reliable_domains)

# number of articles for every (domain, type) pair
reliable_fake = data.groupby(['domain', 'type']).size()
print(reliable_fake)


# number of articles for every type
types = data['type'].value_counts()
print(types)


# plot the 10 domains with the most articles, stacked by type

# one row per domain, one column per type; pairs that never occur become 0
reliable_fake = reliable_fake.unstack().fillna(0)

# Row sum over *all* type columns. A hand-written list of type names raises
# KeyError when one of them is absent from the data and silently leaves out
# any type that is not listed, although the plot below stacks every column.
domain_totals = reliable_fake.sum(axis=1).sort_values(ascending=False)
reliable_fake = reliable_fake.loc[domain_totals.head(10).index]

ax = reliable_fake.plot(kind='bar', stacked=True, figsize=(12, 7))
ax.set(title='Top 10 domains by number of articles', xlabel='domain', ylabel='number of articles')
plt.show()


# percentage of all rows that are of type 'reliable' (0 if the label never occurs)
reliable_percent = types.get('reliable', 0) / data.shape[0] * 100
print('Reliable percent:', reliable_percent)


In [None]:
# drop unusable rows and reduce the labels to the two classes 'reliable' and 'fake'
#
# NOTE(review): the cell that loads '995,000_cleaned_stemmed2.csv' additionally maps
# 'bias' and 'satire' to 'fake'; this cell does not — confirm which labelling is intended.
data = (
    data
    # rows without a label or without article text cannot be used
    .dropna(subset=['type', 'content'])
    # 'political' and 'clickbait' are counted as 'reliable'
    .assign(type=lambda frame: frame['type'].replace(['political', 'clickbait'], 'reliable'))
    # every remaining type other than 'reliable' and 'fake' is discarded
    .loc[lambda frame: frame['type'].isin(['reliable', 'fake'])]
)

print('Total rows after removing missing values:', len(data))


In [None]:
# cleaning the text
# Runs cleanText on every value of the 'content' column and overwrites the column with the result.
# NOTE(review): because the column is replaced in place, running this cell a second time
# feeds already-cleaned text through cleanText again — run it once per fresh kernel.
data['content'] = data['content'].apply(cleanText)

In [None]:
# tokenization
# Applies nltk.word_tokenize to every value of 'content' and stores the results in a new 'tokens' column.
# NOTE(review): presumably needs the 'punkt' data mentioned in the import cell — confirm it is downloaded.
data['tokens'] = data['content'].apply(nltk.word_tokenize)

In [None]:
# removing stopwords and stemming the tokens

# English stopword list from NLTK, stored as a set for the membership test in process_tokens
stop_words = set(stopwords.words('english'))
# alternative stemmer, currently disabled:
# stemmer = SnowballStemmer("english")
# the Porter stemmer is the one actually used by process_tokens
stemmer = PorterStemmer() 

def process_tokens(tokens):
    """Return the stems of all tokens that are not stopwords.

    Uses the module-level ``stop_words`` collection for filtering and the
    module-level ``stemmer`` for stemming. The input sequence is left
    untouched; a new list is returned in the original token order.
    """
    return [stemmer.stem(word) for word in tokens if word not in stop_words]

# filter + stem each token list and store it as one space-separated string per article
data['processed_tokens'] = data['tokens'].apply(lambda token_list: ' '.join(process_tokens(token_list)))
# data['processed_tokens'] = data['content'].apply(lambda x:' '.join([stemmer.stem(word) for word in nltk.word_tokenize(x)]))

In [6]:
# load the previously cleaned and stemmed corpus (every column read as a string)
cleaned_stemmed = pd.read_csv('995,000_cleaned_stemmed2.csv', dtype=str)

# label reduction: 'political' and 'clickbait' count as 'reliable',
# 'bias' and 'satire' count as 'fake'
label_map = {
    'political': 'reliable',
    'clickbait': 'reliable',
    'bias': 'fake',
    'satire': 'fake',
}

# rows without a label or without text are dropped before relabelling
cleaned_stemmed = cleaned_stemmed.dropna(subset=['type', 'content'])
cleaned_stemmed['type'] = cleaned_stemmed['type'].replace(label_map)

# keep only the two target classes
is_target_class = cleaned_stemmed['type'].isin(['reliable', 'fake'])
cleaned_stemmed = cleaned_stemmed[is_target_class]

print('Total rows after removing missing values:', cleaned_stemmed.shape[0])

# class balance
type_counts = cleaned_stemmed['type'].value_counts()
print(type_counts)

# share of 'reliable' rows, in percent
reliable_percent = type_counts['reliable'] / cleaned_stemmed.shape[0] * 100
print('Reliable percent:', reliable_percent)

Total rows after removing missing values: 691768
type
reliable    440493
fake        251275
Name: count, dtype: int64
Reliable percent: 63.6764059626927


In [7]:
# using either CountVectorizer or TfidfVectorizer to convert the text data into numerical data
#
# The bag-of-words instance gets its own name. Binding it to `CountVectorizer`
# would replace the imported class with an instance, and running this cell a
# second time would then fail with "'CountVectorizer' object is not callable".
# To try the bag-of-words features, call `countvec.fit_transform(...)`.
countvec = CountVectorizer()

# TF-IDF features; English stop words are removed and terms that occur in more
# than 70% of the documents are ignored (max_df=0.7)
tfidvec = TfidfVectorizer(stop_words='english',max_df=0.7)

In [8]:
# split the data into training (80%), validation (10%) and testing (10%) sets

# Split the raw text *before* vectorizing. Fitting the vectorizer on the whole
# corpus would let the vocabulary and IDF weights see the validation and test
# articles (data leakage) and make the reported scores optimistic.
# random_state=42 keeps the row partition reproducible.
text_train, text_holdout, y_train, y_test_1 = train_test_split(
    cleaned_stemmed['content'], cleaned_stemmed['type'], test_size=0.2, random_state=42
)

# learn vocabulary and IDF weights from the training articles only ...
X_train = tfidvec.fit_transform(text_train)
# ... and only apply them to the held-out 20%
# (a bag-of-words vectorizer can be substituted here in the same fit/transform pattern)
X_test_1 = tfidvec.transform(text_holdout)

# halve the held-out part into a validation and a test set
X_val, X_test, y_val, y_test = train_test_split(X_test_1, y_test_1, test_size=0.5, random_state=42)

In [9]:
# read the LIAR tsv file (tab-separated, no header row, so the columns are numbered)
# NOTE(review): the variable is named liar_data_test but the file read is train.tsv — confirm.
liar_data_test = pd.read_csv('LIARDATASET/train.tsv', sep='\t', header=None)

# column 1 is the label: keep only the rows labelled 'true' or 'false'
has_binary_label = liar_data_test[1].isin(['true', 'false'])
liar_data_test = liar_data_test[has_binary_label]

# column 2 is used as the input text
X_liar = liar_data_test[2]

# rename the labels to the ones used for the main corpus
y_liar = liar_data_test[1].replace({'true': 'reliable', 'false': 'fake'})



In [10]:
# scaling the data

# with_mean=False: scale to unit variance without centering (centering is not
# supported for sparse input, which is what the vectorizer produces)
scale = StandardScaler(with_mean=False)

# Fit the scaler on the training split only, then apply the same scaling to
# every held-out split. The validation split must be scaled too, otherwise a
# model trained on scaled features would be evaluated on unscaled ones.
X_train = scale.fit_transform(X_train)
X_val = scale.transform(X_val)
X_test = scale.transform(X_test)

In [11]:
# training baseline model: logistic regression

# max_iter=3500 is the iteration cap passed to the solver; random_state=42 fixes the seed
logistic_model = LogisticRegression(max_iter=3500, random_state=42)

# fit on the training split as left by the preceding cells
logistic_model.fit(X_train, y_train)

In [5]:
# predicting: testing set
# NOTE(review): the output saved under this cell shows rows of the LIAR table rather than an
# accuracy value, and its execution count (5) is out of sequence — restart the kernel and
# run all cells to refresh it.

# predict labels for the test split and report the fraction predicted correctly
y_pred = logistic_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("accuracy: ", accuracy)


            0      1                                                  2   \
0    2635.json  false  Says the Annies List political group supports ...   
3    1123.json  false  Health care reform legislation is likely to ma...   
5   12465.json   true  The Chicago Bears have had more starting quart...   
12   5947.json  false  When Mitt Romney was governor of Massachusetts...   
16    620.json   true  McCain opposed a requirement that the governme...   

                      3             4                           5   \
0               abortion  dwayne-bohac        State representative   
3            health-care  blog-posting                         NaN   
5              education     robin-vos  Wisconsin Assembly speaker   
12  history,state-budget   mitt-romney             Former governor   
16        federal-budget  barack-obama                   President   

               6           7     8     9      10     11    12  \
0           Texas  republican   0.0   1.0    0.0    0.0  

In [None]:
# training svm model:
# linear support vector classifier; max_iter=5000 is the iteration cap, random_state=42 fixes the seed
clf = svm.LinearSVC(max_iter=5000, random_state=42)

# fit on the training split
clf.fit(X_train, y_train)


In [None]:
# predicting with the svm model
# NOTE: `clf` is whatever classifier was bound to that name most recently; the random forest
# cell reuses the same name, so run this cell directly after the svm training cell.

y_pred = clf.predict(X_test)

# fraction of test articles whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print("accuracy: ", accuracy)

In [None]:
# training random forest model
# NOTE: this rebinds `clf`, the name the svm cells also use.

# 100 trees, seed fixed with random_state=42; n_jobs=-1 asks scikit-learn to use all available cores
clf = RandomForestClassifier(random_state=42, n_estimators=100, n_jobs = -1)
clf.fit(X_train, y_train)

In [None]:
# predicting with the random forest model

# predict labels for the test split and print the accuracy against the true test labels
y_pred = clf.predict(X_test)
print("accuracy: ", accuracy_score(y_test, y_pred))

In [None]:
# training naive bayes model

# multinomial naive Bayes with scikit-learn's default parameters, fitted on the training split
naive = MultinomialNB()
naive.fit(X_train, y_train)

In [None]:
# predicting with the naive bayes model
y_pred = naive.predict(X_test)

# The predictions were made on X_test, so they must be scored against y_test.
# Scoring them against y_val compares labels of different articles and yields
# a meaningless accuracy.
accuracy = accuracy_score(y_test, y_pred)
print("accuracy: ", accuracy)

In [None]:
# training neural network model

# multi-layer perceptron with hidden_layer_sizes=(128,64), i.e. two hidden layers;
# max_iter=500 caps the training iterations, random_state=42 fixes the seed
mlp = MLPClassifier(random_state=42, max_iter=500, hidden_layer_sizes=(128,64))

# fit on the training split
mlp.fit(X_train, y_train)

In [None]:
# predicting with the neural network model

# predict labels for the test split
y_pred = mlp.predict(X_test)

# fraction of test articles whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print("accuracy: ", accuracy)