In [1]:
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
Headlines = pd.read_csv('./archive/abcnews-date-text.csv', usecols =["headline_text"]).dropna()
Headlines1 = pd.read_csv('./archive/fake.csv', usecols =["title"]).dropna()
Headlines2 = pd.read_csv('./archive/Fake_2.csv', usecols =["title"]).dropna()

In [3]:
#Removing duplicated headlines
Headlines = Headlines.drop_duplicates('headline_text')
Headlines1 = Headlines1.drop_duplicates('title')
Headlines2 = Headlines2.drop_duplicates('title')

#Rename dataframe columns and Combine all datasets
Headlines1 = Headlines1.rename(columns={'title': 'headline_text'})
Headlines2 = Headlines2.rename(columns={'title': 'headline_text'})

In [4]:
# set fake news to 1
Headlines1['label'] = 1
Headlines2['label'] = 1
Headlines['label'] = 0

In [5]:
Headlines_fake = pd.concat([Headlines1, Headlines2], ignore_index=True)

# Combine all datasets
data = pd.concat([Headlines.head(25000), Headlines_fake.head(25000)], ignore_index=True)
print(Headlines.head())

                                       headline_text  label
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      0
2     a g calls for infrastructure protection summit      0
3           air nz staff in aust strike for pay rise      0
4      air nz strike to affect australian travellers      0


                                  headline_text  label
0  new zealand wales have chance to make amends      0
1   n korea accuses asean forum of interference      0
2                    nrma board to be downsized      0
3           nsw debates short sentencing issues      0
4          nsw govt urged to provide ward funds      0


In [7]:
# Data Preprocessing
import gensim
import nltk as nl
nl.download('stopwords')
from sklearn.feature_extraction import _stop_words


nltk_stopwords = nl.corpus.stopwords.words('english')
gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
sklearn_stopwords = _stop_words.ENGLISH_STOP_WORDS
combined_stopwords = sklearn_stopwords.union(nltk_stopwords,gensim_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lmh23\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
print('NLTK has {} stop words'.format(len(nltk_stopwords)))
print('Gensim has {} stop words'.format(len(gensim_stopwords)))
print('Sklearn has {} stop words'.format(len(sklearn_stopwords)))
print('Combined stopwords list has {} stop words'.format(len(combined_stopwords)))

NLTK has 179 stop words
Gensim has 337 stop words
Sklearn has 318 stop words
Combined stopwords list has 390 stop words


In [9]:
from nltk.stem import PorterStemmer 
porter_stemmer = PorterStemmer() 

In [10]:
data['headline_text'] = data['headline_text'].apply(lambda x: x.lower())
data['headline_text'] = data['headline_text'].apply(lambda x: ' '.join([word for word in x.split() if word.isalpha()]))
data['headline_text'] = data['headline_text'].apply(lambda x: ' '.join([porter_stemmer.stem(word) for word in x.split()]))
data['headline_text'] = data['headline_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (combined_stopwords)]))

In [12]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(data['headline_text'], data['label'], test_size=0.2, random_state=42)
print(type(y_test))

<class 'pandas.core.series.Series'>


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Conv1D, MaxPooling1D, Flatten, Embedding, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [14]:
tfidf_vectorizer = TfidfVectorizer(tokenizer = word_tokenize, max_features = 300)
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)
tfidf_features = tfidf_vectorizer.get_feature_names_out()



In [15]:
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
svc = SVC(kernel='linear')
knn = KNeighborsClassifier()
nb = MultinomialNB()

dt.fit(tfidf_train, y_train)
rf.fit(tfidf_train, y_train)
svc.fit(tfidf_train, y_train)
knn.fit(tfidf_train, y_train)
nb.fit(tfidf_train, y_train)

In [16]:
print ("Testing Acc. of Decision Tree: {} %".format(round(dt.score(tfidf_test, y_test) * 100, 2)))
print ("Testing Acc. of Random Forest: {} %".format(round(rf.score(tfidf_test, y_test) * 100, 2)))
print ("Testing Acc. of SVC: {} %".format(round(svc.score(tfidf_test, y_test) * 100, 2)))
print ("Testing Acc. of K-NN: {} %".format(round(knn.score(tfidf_test, y_test) * 100, 2)))
print ("Testing Acc. of Naive Bayesian: {} %".format(round(nb.score(tfidf_test, y_test) * 100, 2)))

Testing Acc. of Decision Tree: 82.93 %
Testing Acc. of Random Forest: 83.86 %
Testing Acc. of SVC: 83.85 %
Testing Acc. of K-NN: 74.38 %
Testing Acc. of Naive Bayesian: 78.28 %


In [17]:
tfidf_train = tfidf_train.todense()
tfidf_test = tfidf_test.todense()

neural_network = Sequential()
neural_network.add(Dense(64, input_dim=len(tfidf_features), activation='relu'))
neural_network.add(Dropout(0.1))
neural_network.add(Dense(64, activation='relu'))
neural_network.add(Dropout(0.1))
neural_network.add(Dense(1, activation='sigmoid'))
neural_network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = neural_network.fit(tfidf_train, y_train, epochs=50, batch_size=512, verbose=0)
_,test_acc = neural_network.evaluate(tfidf_test,y_test,verbose=0)
print ("Testing Acc. of DNN: {} %".format(round(test_acc * 100, 2)))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Testing Acc. of DNN: 84.19 %


In [29]:
# Assuming you have already imported necessary libraries and defined your models (dt, rf, svc, knn, nb, neural_network)

# Concatenate the true and fake headlines
test_data = pd.concat([Headlines[25000:31000], Headlines_fake[25000:29000]], ignore_index=True)

# Preprocess the headline text
test_data['headline_text'] = test_data['headline_text'].apply(lambda x: x.lower())
test_data['headline_text'] = test_data['headline_text'].apply(lambda x: ' '.join([word for word in x.split() if word.isalpha()]))
test_data['headline_text'] = test_data['headline_text'].apply(lambda x: ' '.join([porter_stemmer.stem(word) for word in x.split()]))
test_data['headline_text'] = test_data['headline_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in combined_stopwords]))

true_labels = test_data['label']

# Transform the headline text using the TF-IDF vectorizer
tfidf_test_1 = tfidf_vectorizer.transform(test_data['headline_text'])
tfidf_test_1 = tfidf_test_1.toarray()

# Evaluate model accuracies
print("Testing Acc. of Decision Tree: {} %".format(round(dt.score(tfidf_test_1, true_labels) * 100, 2)))
print("Testing Acc. of Random Forest: {} %".format(round(rf.score(tfidf_test_1, true_labels) * 100, 2)))
print("Testing Acc. of SVC: {} %".format(round(svc.score(tfidf_test_1, true_labels) * 100, 2)))
print("Testing Acc. of K-NN: {} %".format(round(knn.score(tfidf_test_1, true_labels) * 100, 2)))
print("Testing Acc. of Naive Bayesian: {} %".format(round(nb.score(tfidf_test_1, true_labels) * 100, 2)))
print("Testing Acc. of DNN: {} %".format(round(neural_network.evaluate(tfidf_test_1, true_labels, verbose=0)[1] * 100, 2)))


Testing Acc. of Decision Tree: 85.09 %
Testing Acc. of Random Forest: 86.6 %
Testing Acc. of SVC: 86.88 %
Testing Acc. of K-NN: 80.56 %
Testing Acc. of Naive Bayesian: 75.59 %
Testing Acc. of DNN: 87.01 %
