In [None]:
import nltk
import string
import scipy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


from nltk.tokenize import RegexpTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Fetch the NLTK resources used by this notebook:
#   stopwords                  -> stopwords.words('english')
#   wordnet                    -> WordNetLemmatizer
#   punkt                      -> nltk.word_tokenize
#   averaged_perceptron_tagger -> nltk.pos_tag (Part 3). This one was missing, so
#                                 POS tagging failed on a machine without a local copy.
# NOTE(review): newer NLTK releases may expect differently named resources
# (e.g. 'punkt_tab') -- confirm against the installed version.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Part 1

In [None]:
# Load both halves of the corpus; the paths are relative to the working directory
fake, true = (pd.read_csv(csv_path) for csv_path in ('Fake.csv', 'True.csv'))

In [None]:
# Combine all of the true articles into one string and tokenize it.
# Articles are joined with a space: joining on '' glued the last word of one
# article onto the first word of the next, creating bogus merged tokens.
true_string = ' '.join(true["text"])
true_tokenized = nltk.word_tokenize(true_string)

In [None]:
# Combine all of the fake articles into one string and tokenize it.
# Articles are joined with a space: joining on '' glued the last word of one
# article onto the first word of the next, creating bogus merged tokens.
fake_string = ' '.join(fake["text"])
fake_tokenized = nltk.word_tokenize(fake_string)

In [None]:
# Lowercase every token and keep only purely alphanumeric ones, which drops punctuation.
# The old filter also had `or (not "." and not "," ...)`; `not` applied to a
# non-empty string is always False, so that clause never admitted anything
# and has been removed.
true_tokenized = [word.lower() for word in true_tokenized if word.isalnum()]

In [None]:
# Lowercase every token and keep only purely alphanumeric ones, which drops punctuation.
# The old filter also had `or (not "." and not "," ...)`; `not` applied to a
# non-empty string is always False, so that clause never admitted anything
# and has been removed.
fake_tokenized = [word.lower() for word in fake_tokenized if word.isalnum()]

In [None]:
# Frequency table over the cleaned true-news tokens; show its 100 most frequent entries
fdist_true = FreqDist(true_tokenized)
print(f"True: {fdist_true.most_common(100)}")

In [None]:
# Frequency table over the cleaned fake-news tokens; show its 100 most frequent entries
fdist_fake = FreqDist(fake_tokenized)
print(f"Fake: {fdist_fake.most_common(100)}")

In [None]:
# Non-cumulative frequency plot of the 100 most common true-news tokens,
# drawn on a 100x20 inch figure and written to true_freq.png
fig = plt.figure(figsize=(100, 20))
fdist_true.plot(100, cumulative=False)
fig.savefig('true_freq.png', bbox_inches="tight")
plt.show()

In [None]:
# Non-cumulative frequency plot of the 100 most common fake-news tokens,
# drawn on a 100x20 inch figure and written to fake_freq.png
fig = plt.figure(figsize=(100, 20))
fdist_fake.plot(100, cumulative=False)
fig.savefig('fake_freq.png', bbox_inches="tight")
plt.show()

In [None]:
# Remove the stop words from the true data.
# `stop_words` stays a list because other cells read it; membership is tested
# against a set so each lookup is O(1) instead of a scan of the whole list.
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)
true_tokenized = [word for word in true_tokenized if word not in stop_word_set]

In [None]:
# Remove the stop words from the fake data.
# `stop_words` is the list built when the true data was filtered; it is turned
# into a set here so each membership test is O(1) instead of a list scan.
stop_word_set = set(stop_words)
fake_tokenized = [word for word in fake_tokenized if word not in stop_word_set]

In [None]:
# Recount the true-news tokens now that stop words are gone and show the top 100
fdist_true = FreqDist(true_tokenized)
print(f"True: {fdist_true.most_common(100)}")

In [None]:
# Recount the fake-news tokens now that stop words are gone and show the top 100
fdist_fake = FreqDist(fake_tokenized)
print(f"Fake: {fdist_fake.most_common(100)}")

In [None]:
# Replace each true-news token with its WordNet lemma
# (lemmatize() is called with the word only, no POS argument)
lemmatizer = WordNetLemmatizer()
true_tokenized = list(map(lemmatizer.lemmatize, true_tokenized))

In [None]:
# Replace each fake-news token with its WordNet lemma, reusing the existing lemmatizer
# (lemmatize() is called with the word only, no POS argument)
fake_tokenized = list(map(lemmatizer.lemmatize, fake_tokenized))

In [None]:
# Recount the true-news tokens after lemmatization and show the top 100
fdist_true = FreqDist(true_tokenized)
print(f"True: {fdist_true.most_common(100)}")

In [None]:
# Recount the fake-news tokens after lemmatization and show the top 100
fdist_fake = FreqDist(fake_tokenized)
print(f"Fake: {fdist_fake.most_common(100)}")

# Part 2

In [None]:
# Label the articles: 0 = true, 1 = fake (adds a 'label' column to each frame in place)
true['label'], fake['label'] = 0, 1
# Round-trip each frame through NumPy: the rebuilt frames carry positional
# integer column names (0, 1, 2, ...) instead of the CSV headers
true_pandas, fake_pandas = (pd.DataFrame(frame.to_numpy()) for frame in (true, fake))
# Stack the true rows on top of the fake rows (each part keeps its own row index)
data = pd.concat([true_pandas, fake_pandas])
# Column 4 is what later cells use as the label
# (presumably 'label' is the fifth column -- TODO confirm the CSVs have four columns)
print(data[4])

In [None]:

# Bag-of-words counts over column 1 of `data` (presumably the article text -- TODO confirm).
# A token is any run of ASCII letters/digits, so symbols and whitespace act as
# separators; note that digits are kept, not removed.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(
	tokenizer=token.tokenize,
	lowercase=True,
	stop_words='english',
	ngram_range=(1, 1),
)
data_clean = cv.fit_transform(data[1])

In [None]:
# Split the count features into train and test sets.
# Uses test_size=0.3 / random_state=123 like every other split in this notebook;
# the previous 0.2 / 0 made this the only split with a different protocol, so
# scores obtained on it were not comparable with the TF-IDF or Part 3 results.
X_train, X_test, y_train, y_test = train_test_split(
	data_clean, data[4].astype("int"), test_size=0.3, random_state=123
)

In [None]:
# TF-IDF features over the same column, using TfidfVectorizer's default settings
# NOTE(review): unlike the CountVectorizer cell, no tokenizer or stop-word list is
# passed here, so the two feature sets differ in more than just the weighting.
tf = TfidfVectorizer()
text_tf = tf.fit_transform(data[1])

In [None]:
# Train/test split of the TF-IDF matrix: 30% held out, fixed seed
X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(
	text_tf,
	data[4].astype("int"),
	test_size=0.3,
	random_state=123,
)

In [None]:
# Multinomial naive Bayes on the raw count features
clf = MultinomialNB()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
# Score on the held-out split; precision and recall are macro-averaged over both classes
accuracy = metrics.accuracy_score(y_test, predicted)
precision = metrics.precision_score(y_test, predicted, average='macro')
recall = metrics.recall_score(y_test, predicted, average='macro')
print("MultinomialNB Accuracy:", accuracy)
print("Precision: " + str(precision))
print("Recall: " + str(recall))

In [None]:
# Multinomial naive Bayes on the TF-IDF features
clf = MultinomialNB()
clf.fit(X_train_tf, y_train_tf)
predicted = clf.predict(X_test_tf)
# Score on the held-out split; precision and recall are macro-averaged over both classes
accuracy = metrics.accuracy_score(y_test_tf, predicted)
precision = metrics.precision_score(y_test_tf, predicted, average='macro')
recall = metrics.recall_score(y_test_tf, predicted, average='macro')
print("MultinomialNB Accuracy:", accuracy)
print("Precision: " + str(precision))
print("Recall: " + str(recall))

In [None]:
# Logistic regression on the raw count features.
# max_iter is raised from the default of 100: on high-dimensional, unscaled count
# features the solver can hit that limit before converging, in which case the
# reported scores come from a model that was still being optimized.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
predicted = clf.predict(X_test)
# Precision and recall are macro-averaged over both classes
print("Logistic Regression Accuracy:", metrics.accuracy_score(y_test, predicted))
print("Precision: " + str(metrics.precision_score(y_test, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test, predicted, average='macro')))

In [None]:
# Logistic regression on the TF-IDF features.
# max_iter is raised from the default of 100 so the solver is not cut off before
# converging on this high-dimensional input; if it already converges earlier the
# fitted model is unchanged.
clf = LogisticRegression(max_iter=1000).fit(X_train_tf, y_train_tf)
predicted = clf.predict(X_test_tf)
# Precision and recall are macro-averaged over both classes
print("Logistic Regression Accuracy:", metrics.accuracy_score(y_test_tf, predicted))
print("Precision: " + str(metrics.precision_score(y_test_tf, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_tf, predicted, average='macro')))

In [None]:
# Support vector classifier (default SVC settings) on the raw count features
clf = SVC()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
# Score on the held-out split; precision and recall are macro-averaged over both classes
accuracy = metrics.accuracy_score(y_test, predicted)
precision = metrics.precision_score(y_test, predicted, average='macro')
recall = metrics.recall_score(y_test, predicted, average='macro')
print("SVM Accuracy:", accuracy)
print("Precision: " + str(precision))
print("Recall: " + str(recall))

In [None]:
# Support vector classifier (default SVC settings) on the TF-IDF features
clf = SVC()
clf.fit(X_train_tf, y_train_tf)
predicted = clf.predict(X_test_tf)
# Score on the held-out split; precision and recall are macro-averaged over both classes
accuracy = metrics.accuracy_score(y_test_tf, predicted)
precision = metrics.precision_score(y_test_tf, predicted, average='macro')
recall = metrics.recall_score(y_test_tf, predicted, average='macro')
print("SVM Accuracy:", accuracy)
print("Precision: " + str(precision))
print("Recall: " + str(recall))

In [None]:
# Random forest on the raw count features.
# random_state is pinned: the forest is built with randomness, so without a seed
# the reported scores change from one run of this cell to the next.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
predicted = clf.predict(X_test)
# Precision and recall are macro-averaged over both classes
print("Random Forest Accuracy:", metrics.accuracy_score(y_test, predicted))
print("Precision: " + str(metrics.precision_score(y_test, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test, predicted, average='macro')))


In [None]:
# Random forest on the TF-IDF features.
# random_state is pinned: the forest is built with randomness, so without a seed
# the reported scores change from one run of this cell to the next.
clf = RandomForestClassifier(random_state=0).fit(X_train_tf, y_train_tf)
predicted = clf.predict(X_test_tf)
# Precision and recall are macro-averaged over both classes
print("Random Forest Accuracy:", metrics.accuracy_score(y_test_tf, predicted))
print("Precision: " + str(metrics.precision_score(y_test_tf, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_tf, predicted, average='macro')))

# Part 3

In [None]:
# After applying "POS Tagging", you can locate specific kinds of words in the collection, e.g., nouns and verbs.
# Please build additional classifier(s) to classify the news by leveraging POS information, e.g., only use the "nouns" or "adj" + "noun" as features.
# Did you witness the performance improvement (compared with the result from task 2)? Why?

# Pos tag then do lemmatization 

In [None]:
# POS-tag every article: each entry of pos_tag_true / pos_tag_fake is the list of
# (token, tag) pairs for one article.
# The previous loop iterated over pos_tag_true before it had been assigned
# (NameError) and called nltk.pos_tag() without any tokens (TypeError).
# pos_tag_sents tags a whole batch of token lists in one call.
# NOTE: this tokenizes and tags every article, which can take a long time on a large corpus.
pos_tag_true = nltk.pos_tag_sents([nltk.word_tokenize(text) for text in true['text']])
pos_tag_fake = nltk.pos_tag_sents([nltk.word_tokenize(text) for text in fake['text']])

In [None]:
# WordNet lemmatizer instance used by the per-article POS-tagging cells below
lemmatizer = WordNetLemmatizer()

In [15]:
# For each true article: tokenize, POS-tag, keep only tokens tagged "NN" and
# lemmatize them, giving one list of (lemma, tag) pairs per article.
# The loop temporaries are deliberately not called `token`: that name holds the
# RegexpTokenizer whose .tokenize is passed to the CountVectorizer in Part 2,
# and rebinding it to a plain list here clobbered it for any later re-run.
pos_tag_tokens_true = []
for article in true['text']:
	article_tokens = nltk.word_tokenize(article)
	tagged_tokens = nltk.pos_tag(article_tokens)
	pos_tag_tokens_true.append(
		[(lemmatizer.lemmatize(word), tag) for word, tag in tagged_tokens if tag == "NN"]
	)

In [None]:
# For each fake article: tokenize, POS-tag, keep only tokens tagged "NN" and
# lemmatize them, giving one list of (lemma, tag) pairs per article.
# The loop temporaries are deliberately not called `token`: that name holds the
# RegexpTokenizer whose .tokenize is passed to the CountVectorizer in Part 2,
# and rebinding it to a plain list here clobbered it for any later re-run.
pos_tag_tokens_fake = []
for article in fake['text']:
	article_tokens = nltk.word_tokenize(article)
	tagged_tokens = nltk.pos_tag(article_tokens)
	pos_tag_tokens_fake.append(
		[(lemmatizer.lemmatize(word), tag) for word, tag in tagged_tokens if tag == "NN"]
	)

In [None]:
# Flatten the per-article (lemma, tag) lists into one flat list per class.
# The words in pos_tag_tokens_true / pos_tag_tokens_fake were lemmatized when
# those lists were built, so no second lemmatization pass is needed here.
# Previously the comprehension's `for` clauses were in the wrong order (`sentence`
# was read before being bound), the same list was rebuilt once per article, and
# fake_pos_tokenized was never assigned even though the next cell prints its length.
lemmatizer = WordNetLemmatizer()
true_pos_tokenized = [pair for article in pos_tag_tokens_true for pair in article]
fake_pos_tokenized = [pair for article in pos_tag_tokens_fake for pair in article]

In [None]:
# Sanity check: first ten entries of the true list and the size of the fake list.
# Both names must have been assigned by an earlier cell for this to run.
print(true_pos_tokenized[:10])
print(len(fake_pos_tokenized))

In [None]:
# POS-tag the concatenated Part 1 token lists (true tokens first, then fake)
# NOTE(review): the input is one flat list of tokens, so `pos_tag` holds one
# (token, tag) pair per token rather than one entry per article.
full_tokenized = [*true_tokenized, *fake_tokenized]
pos_tag = nltk.pos_tag(full_tokenized)

In [None]:
# One list of noun strings per true article: keep the word of every
# (word, tag) pair in pos_tag_tokens_true whose tag is 'NN'
nouns_true = [
	[word for word, tag in article if tag == 'NN']
	for article in pos_tag_tokens_true
]

In [None]:
# One list of noun strings per fake article: keep the word of every
# (word, tag) pair in pos_tag_tokens_fake whose tag is 'NN'
nouns_fake = [
	[word for word, tag in article if tag == 'NN']
	for article in pos_tag_tokens_fake
]

In [None]:
# All per-article noun lists in one sequence: true articles first, then fake
nouns = [*nouns_true, *nouns_fake]

In [None]:
# Peek at the noun lists of the last ten articles
print(nouns[-10:])

In [None]:
# One label per article, in the same order as `nouns`: 0 for each true article, then 1 for each fake one
nouns_labels = [0 for _ in nouns_true] + [1 for _ in nouns_fake]

In [None]:
# Create the train/test split for the noun-only features.
# Each article in `nouns` is a Python list of strings, which the classifiers cannot
# be fitted on directly, so the lists are first turned into a sparse count matrix.
# The identity analyzer makes CountVectorizer treat every list element as one feature.
noun_vectorizer = CountVectorizer(analyzer=lambda article_nouns: article_nouns)
nouns_features = noun_vectorizer.fit_transform(nouns)
X_train_nouns, X_test_nouns, y_train_nouns, y_test_nouns = train_test_split(
	nouns_features, nouns_labels, test_size=0.3, random_state=123
)

In [None]:
# Show the first and last ten labels of the label list
print(nouns_labels[:10])
print(nouns_labels[-10:])

In [None]:
# Train/test split for the POS-tagged features.
# The split used to be taken over `pos_tag`, which has one entry per token, while
# nouns_labels has one entry per article; the lengths did not match, and
# (word, tag) tuples are not numeric features either. The per-article
# (lemma, tag) lists are used instead, and each pair becomes a "lemma_TAG" count feature.
# NOTE(review): pos_tag_tokens_* only contain "NN" pairs, so this carries the same
# information as the noun-only features -- widen the tag filter upstream to change that.
pos_documents = pos_tag_tokens_true + pos_tag_tokens_fake
pos_vectorizer = CountVectorizer(
	analyzer=lambda tagged: [f"{word}_{tag}" for word, tag in tagged]
)
pos_features = pos_vectorizer.fit_transform(pos_documents)
X_train_pos, X_test_pos, y_train_pos, y_test_pos = train_test_split(
	pos_features, nouns_labels, test_size=0.3, random_state=123
)

In [None]:
# Multinomial naive Bayes on the noun-only split
clf = MultinomialNB()
clf.fit(X_train_nouns, y_train_nouns)
predicted = clf.predict(X_test_nouns)
# Score on the held-out split; precision and recall are macro-averaged over both classes
accuracy = metrics.accuracy_score(y_test_nouns, predicted)
precision = metrics.precision_score(y_test_nouns, predicted, average='macro')
recall = metrics.recall_score(y_test_nouns, predicted, average='macro')
print("MultinomialNB Accuracy:", accuracy)
print("Precision: " + str(precision))
print("Recall: " + str(recall))

In [None]:
# Logistic regression on the POS-tagging split.
# max_iter is raised from the default of 100 so the solver is not cut off before
# converging; if it already converges earlier the fitted model is unchanged.
clf = LogisticRegression(max_iter=1000).fit(X_train_pos, y_train_pos)
predicted = clf.predict(X_test_pos)
# Precision and recall are macro-averaged over both classes
print("Logistic Regression Accuracy:", metrics.accuracy_score(y_test_pos, predicted))
print("Precision: " + str(metrics.precision_score(y_test_pos, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_pos, predicted, average='macro')))

In [None]:
# Support vector classifier (default SVC settings) on the POS-tagging split
clf = SVC()
clf.fit(X_train_pos, y_train_pos)
predicted = clf.predict(X_test_pos)
# Score on the held-out split; precision and recall are macro-averaged over both classes
accuracy = metrics.accuracy_score(y_test_pos, predicted)
precision = metrics.precision_score(y_test_pos, predicted, average='macro')
recall = metrics.recall_score(y_test_pos, predicted, average='macro')
print("SVM Accuracy:", accuracy)
print("Precision: " + str(precision))
print("Recall: " + str(recall))

In [None]:
# Random forest on the POS-tagging split.
# random_state is pinned: the forest is built with randomness, so without a seed
# the reported scores change from one run of this cell to the next.
clf = RandomForestClassifier(random_state=0).fit(X_train_pos, y_train_pos)
predicted = clf.predict(X_test_pos)
# Precision and recall are macro-averaged over both classes
print("Random Forest Accuracy:", metrics.accuracy_score(y_test_pos, predicted))
print("Precision: " + str(metrics.precision_score(y_test_pos, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_pos, predicted, average='macro')))