In [1]:
import nltk
import string
import scipy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


from nltk.tokenize import RegexpTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

from multiprocessing import Pool, TimeoutError

[nltk_data] Downloading package stopwords to /home/gabe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gabe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/gabe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# import the Fake.csv and True.csv files
true = pd.read_csv('True.csv')
fake = pd.read_csv('Fake.csv')

# Part 1

In [None]:
# Combine all of the true data into one list and tokesnize it
true_string = ''.join(true["text"])
true_tokenized = nltk.word_tokenize(true_string)

In [None]:
# Combine all of the fake data into one list and tokesnize it
fake_string = ''.join(fake["text"])
fake_tokenized = nltk.word_tokenize(fake_string)

In [None]:
# Make all the words lowercase and remove the punctuation
true_tokenized = [word.lower() for word in true_tokenized if word.isalnum() or (not "." and not "," and not "?" and not "!")]

In [None]:
# Make all the words lowercase and remove the punctuation
fake_tokenized = [word.lower() for word in fake_tokenized if word.isalnum() or (not "." and not "," and not "?" and not "!")]

In [None]:
# Print the 100 most common words in the true data
fdist_true = FreqDist(true_tokenized)
print("True: " + str(fdist_true.most_common(100)))

In [None]:
# Print the 100 most common words in the fake data
fdist_fake = FreqDist(fake_tokenized)
print("Fake: " + str(fdist_fake.most_common(100)))

In [None]:
# Plot the 100 most common words in the true data
fig = plt.figure(figsize = (100,20))
fdist_true.plot(100,cumulative=False)
fig.savefig('true_freq.png', bbox_inches = "tight")
plt.show()

In [None]:
# Plot the 100 most common words in the fake data
fig = plt.figure(figsize = (100,20))
fdist_fake.plot(100,cumulative=False)
fig.savefig('fake_freq.png', bbox_inches = "tight")
plt.show()

In [None]:
# Remove the stop words from the true data
stop_words = stopwords.words('english')
true_tokenized = [word for word in true_tokenized if word not in stop_words]

In [None]:
# Remove the stop words from the fake data
fake_tokenized = [word for word in fake_tokenized if word not in stop_words]

In [None]:
# Print the 100 most common words in the true data
fdist_true = FreqDist(true_tokenized)
print("True: " + str(fdist_true.most_common(100)))

In [None]:
# Print the 100 most common words in the fake data
fdist_fake = FreqDist(fake_tokenized)
print("Fake: " + str(fdist_fake.most_common(100)))

In [None]:
# Lemmatize the words in the true data
lemmatizer = WordNetLemmatizer()

with Pool(processes=8) as pool:
	true_tokenized = pool.map(lemmatizer.lemmatize, true_tokenized)
# true_tokenized = [lemmatizer.lemmatize(word) for word in true_tokenized]

In [None]:
# Lemmatize the words in the fake data
with Pool(processes=8) as pool:
	fake_tokenized = pool.map(lemmatizer.lemmatize, fake_tokenized)

# fake_tokenized = [lemmatizer.lemmatize(word) for word in fake_tokenized]

In [None]:
# Print the 100 most common words in the true data
fdist_true = FreqDist(true_tokenized)
print("True: " + str(fdist_true.most_common(100)))

In [None]:
# Print the 100 most common words in the fake data
fdist_fake = FreqDist(fake_tokenized)
print("Fake: " + str(fdist_fake.most_common(100)))

# Part 2

In [None]:
# Set the true and fake labels
true['label']=0
fake['label']=1
# Convert the true and fake data into a dataframe
true_pandas = pd.DataFrame(true.to_numpy())
fake_pandas = pd.DataFrame(fake.to_numpy())
# Combine the true and fake data into one dataframe
data = pd.concat([true_pandas,fake_pandas])
# Print the dataframe
print(data[4])

In [None]:

# Remove unwanted elements from out data like symbols and numbers and tokenize the data
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
data_clean = cv.fit_transform(data[1])

In [None]:
print(data[1])

In [None]:
# Create a dataset from the tokenized data
X_train, X_test, y_train, y_test = train_test_split(data_clean, data[4].astype("int"), test_size=0.2, random_state=0)

In [None]:
# Create a TF-IDF vectorizer
tf=TfidfVectorizer()
text_tf= tf.fit_transform(data[1])

In [None]:
# Create a dataset from the TF-IDF vectorized data
X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(text_tf, data[4].astype("int"), test_size=0.3, random_state=123)

In [None]:
# Raw Dataset
clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Precision: " + str(metrics.precision_score(y_test, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test, predicted, average='macro')))

In [None]:
# Tfidf dataset
clf = MultinomialNB().fit(X_train_tf, y_train_tf)
predicted= clf.predict(X_test_tf)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test_tf, predicted))
print("Precision: " + str(metrics.precision_score(y_test_tf, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_tf, predicted, average='macro')))

In [None]:
# Raw dataset
clf = LogisticRegression().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Precision: " + str(metrics.precision_score(y_test, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test, predicted, average='macro')))

In [None]:
# Tfidf dataset
clf = LogisticRegression().fit(X_train_tf, y_train_tf)
predicted= clf.predict(X_test_tf)
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test_tf, predicted))
print("Precision: " + str(metrics.precision_score(y_test_tf, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_tf, predicted, average='macro')))

In [None]:
# Classification using SVM
# Raw datset
clf = SVC().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("SVM Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Precision: " + str(metrics.precision_score(y_test, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test, predicted, average='macro')))

In [None]:
# Tfidf dataset
clf = SVC().fit(X_train_tf, y_train_tf)
predicted= clf.predict(X_test_tf)
print("SVM Accuracy:",metrics.accuracy_score(y_test_tf, predicted))
print("Precision: " + str(metrics.precision_score(y_test_tf, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_tf, predicted, average='macro')))

In [None]:
# Raw dataset
clf = RandomForestClassifier().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("Random Forest Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Precision: " + str(metrics.precision_score(y_test, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test, predicted, average='macro')))


In [None]:
# Tfidf dataset
clf = RandomForestClassifier().fit(X_train_tf, y_train_tf)
predicted= clf.predict(X_test_tf)
print("Random Forest Accuracy:",metrics.accuracy_score(y_test_tf, predicted))
print("Precision: " + str(metrics.precision_score(y_test_tf, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_tf, predicted, average='macro')))

# Part 3

## Using just Nouns

In [3]:
lemmatizer = WordNetLemmatizer()


In [4]:
def lemmatize(p : list):
	token = nltk.word_tokenize(p)
	pos = nltk.pos_tag(token)
	lem = [(word[1], lemmatizer.lemmatize(word[0])) for word in pos if word[1] == "NN"]
	return lem

In [5]:
with Pool(processes=8) as pool:
	pos_tag_true = pool.map(lemmatize, true['text'])

In [6]:
with Pool(processes=8) as pool:
	pos_tag_fake = pool.map(lemmatize, fake['text'])

In [7]:
def join_nouns(p : list):
	return " ".join([word[1] for word in p])

In [8]:
pos_tag_nouns = pos_tag_true + pos_tag_fake

In [9]:
with Pool(processes=8) as pool:
	nouns_joined = pool.map(join_nouns, pos_tag_nouns)

In [11]:
nouns_labels = [0] * len(true) + [1] * len(fake)

In [21]:
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
nouns_tmp = cv.fit_transform(nouns_joined)

In [23]:
# Create dataset split with nouns
X_train_nouns, X_test_nouns, y_train_nouns, y_test_nouns = train_test_split(nouns_tmp, nouns_labels, test_size=0.3, random_state=123)

In [24]:
# Nouns dataset
clf = MultinomialNB().fit(X_train_nouns, y_train_nouns)
predicted= clf.predict(X_test_nouns)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test_nouns, predicted))
print("Precision: " + str(metrics.precision_score(y_test_nouns, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_nouns, predicted, average='macro')))

MultinomialNB Accuracy: 0.9184112843355605
Precision: 0.9177836130165402
Recall: 0.9187189461293814


In [28]:
# POS Tagging dataset
clf = LogisticRegression().fit(X_train_nouns, y_train_nouns)
predicted= clf.predict(X_test_nouns)
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test_nouns, predicted))
print("Precision: " + str(metrics.precision_score(y_test_nouns, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_nouns, predicted, average='macro')))

Logistic Regression Accuracy: 0.9536748329621381
Precision: 0.9541114941525077
Recall: 0.9528774814562144


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
# POS Tagging dataset
clf = SVC().fit(X_train_nouns, y_train_nouns)
predicted= clf.predict(X_test_nouns)
print("SVM Accuracy:",metrics.accuracy_score(y_test_nouns, predicted))
print("Precision: " + str(metrics.precision_score(y_test_nouns, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_nouns, predicted, average='macro')))

In [None]:
# POS Tagging dataset
clf = RandomForestClassifier().fit(X_train_nouns, y_train_nouns)
predicted= clf.predict(X_test_nouns)
print("Random Forest Accuracy:",metrics.accuracy_score(y_test_nouns, predicted))
print("Precision: " + str(metrics.precision_score(y_test_nouns, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_nouns, predicted, average='macro')))

## Using Nouns and Verbs