In [4]:
import nltk
import string
import scipy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


from nltk.tokenize import RegexpTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

from multiprocessing import Pool, TimeoutError

[nltk_data] Downloading package stopwords to /home/gabe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gabe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/gabe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Part 1

In [6]:
# import the Fake.csv and True.csv files
true = pd.read_csv('True.csv')
fake = pd.read_csv('Fake.csv')

In [None]:
# Combine all of the true data into one list and tokesnize it
true_string = ''.join(true["text"])
true_tokenized = nltk.word_tokenize(true_string)

In [None]:
# Combine all of the fake data into one list and tokesnize it
fake_string = ''.join(fake["text"])
fake_tokenized = nltk.word_tokenize(fake_string)

In [None]:
# Make all the words lowercase and remove the punctuation
true_tokenized = [word.lower() for word in true_tokenized if word.isalnum() or (not "." and not "," and not "?" and not "!")]

In [None]:
# Make all the words lowercase and remove the punctuation
fake_tokenized = [word.lower() for word in fake_tokenized if word.isalnum() or (not "." and not "," and not "?" and not "!")]

In [None]:
# Print the 100 most common words in the true data
fdist_true = FreqDist(true_tokenized)
print("True: " + str(fdist_true.most_common(100)))

In [None]:
# Print the 100 most common words in the fake data
fdist_fake = FreqDist(fake_tokenized)
print("Fake: " + str(fdist_fake.most_common(100)))

In [None]:
# Plot the 100 most common words in the true data
fig = plt.figure(figsize = (100,20))
fdist_true.plot(100,cumulative=False)
fig.savefig('true_freq.png', bbox_inches = "tight")
plt.show()

In [None]:
# Plot the 100 most common words in the fake data
fig = plt.figure(figsize = (100,20))
fdist_fake.plot(100,cumulative=False)
fig.savefig('fake_freq.png', bbox_inches = "tight")
plt.show()

In [None]:
# Remove the stop words from the true data
stop_words = stopwords.words('english')
true_tokenized = [word for word in true_tokenized if word not in stop_words]

In [None]:
# Remove the stop words from the fake data
fake_tokenized = [word for word in fake_tokenized if word not in stop_words]

In [None]:
# Print the 100 most common words in the true data
fdist_true = FreqDist(true_tokenized)
print("True: " + str(fdist_true.most_common(100)))

In [None]:
# Print the 100 most common words in the fake data
fdist_fake = FreqDist(fake_tokenized)
print("Fake: " + str(fdist_fake.most_common(100)))

In [None]:
# Lemmatize the words in the true data
lemmatizer = WordNetLemmatizer()
true_tokenized = [lemmatizer.lemmatize(word) for word in true_tokenized]

In [None]:
# Lemmatize the words in the fake data
fake_tokenized = [lemmatizer.lemmatize(word) for word in fake_tokenized]

In [None]:
# Print the 100 most common words in the true data
fdist_true = FreqDist(true_tokenized)
print("True: " + str(fdist_true.most_common(100)))

In [None]:
# Print the 100 most common words in the fake data
fdist_fake = FreqDist(fake_tokenized)
print("Fake: " + str(fdist_fake.most_common(100)))

# Part 2

In [22]:
# Set the true and fake labels
true['label']=0
fake['label']=1
# Convert the true and fake data into a dataframe
true_pandas = pd.DataFrame(true.to_numpy())
fake_pandas = pd.DataFrame(fake.to_numpy())
# Combine the true and fake data into one dataframe
data = pd.concat([true_pandas,fake_pandas])
# Print the dataframe
print(data[4])

0        0
1        0
2        0
3        0
4        0
        ..
23476    1
23477    1
23478    1
23479    1
23480    1
Name: 4, Length: 44898, dtype: object


In [None]:

# Remove unwanted elements from out data like symbols and numbers and tokenize the data
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
data_clean = cv.fit_transform(data[1])

In [None]:
# Create a dataset from the tokenized data
X_train, X_test, y_train, y_test = train_test_split(data_clean, data[4].astype("int"), test_size=0.2, random_state=0)

In [None]:
# Create a TF-IDF vectorizer
tf=TfidfVectorizer()
text_tf= tf.fit_transform(data[1])

In [None]:
# Create a dataset from the TF-IDF vectorized data
X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(text_tf, data[4].astype("int"), test_size=0.3, random_state=123)

In [None]:
# Raw Dataset
clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Precision: " + str(metrics.precision_score(y_test, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test, predicted, average='macro')))

In [None]:
# Tfidf dataset
clf = MultinomialNB().fit(X_train_tf, y_train_tf)
predicted= clf.predict(X_test_tf)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test_tf, predicted))
print("Precision: " + str(metrics.precision_score(y_test_tf, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_tf, predicted, average='macro')))

In [None]:
# Raw dataset
clf = LogisticRegression().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Precision: " + str(metrics.precision_score(y_test, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test, predicted, average='macro')))

In [None]:
# Tfidf dataset
clf = LogisticRegression().fit(X_train_tf, y_train_tf)
predicted= clf.predict(X_test_tf)
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test_tf, predicted))
print("Precision: " + str(metrics.precision_score(y_test_tf, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_tf, predicted, average='macro')))

In [None]:
# Classification using SVM
# Raw datset
clf = SVC().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("SVM Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Precision: " + str(metrics.precision_score(y_test, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test, predicted, average='macro')))

In [None]:
# Tfidf dataset
clf = SVC().fit(X_train_tf, y_train_tf)
predicted= clf.predict(X_test_tf)
print("SVM Accuracy:",metrics.accuracy_score(y_test_tf, predicted))
print("Precision: " + str(metrics.precision_score(y_test_tf, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_tf, predicted, average='macro')))

In [None]:
# Raw dataset
clf = RandomForestClassifier().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("Random Forest Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Precision: " + str(metrics.precision_score(y_test, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test, predicted, average='macro')))


In [None]:
# Tfidf dataset
clf = RandomForestClassifier().fit(X_train_tf, y_train_tf)
predicted= clf.predict(X_test_tf)
print("Random Forest Accuracy:",metrics.accuracy_score(y_test_tf, predicted))
print("Precision: " + str(metrics.precision_score(y_test_tf, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_tf, predicted, average='macro')))

# Part 3

In [None]:
# After applying "POS Tagging", you can locate specific kinds of words in the collection, e.g.,nouns and verbs. 
# Please build additional classifier(s) to classify the news by leveraging POSinformation, e.g., only use the "nouns" or "adj" + "noun" as features. 
# Did you witness theperformance improvement (compared with the result from task 2)? why?

# Pos tag then do lemmatization 

In [7]:
lemmatizer = WordNetLemmatizer()

In [31]:
data[1].shape

(44898,)

In [34]:
def lemmatize(p : list):
	token = nltk.word_tokenize(p)
	pos = nltk.pos_tag(token)
	lem = [lemmatizer.lemmatize(word[0]) for word in pos if word[1] == "NN"]
	return lem

In [36]:
with Pool(processes=8) as pool:
	nouns = pool.map(lemmatize, data[1])

In [None]:
with Pool(processes=8) as pool:
	pos_tag_tokens_false = pool.map(lemmatize, data[1])

In [18]:
nouns = pos_tag_tokens_true + pos_tag_tokens_fake

In [19]:
# Create a list of labels for the nouns
nouns_labels = [0] * len(pos_tag_tokens_true) + ([1] * len(pos_tag_tokens_fake))

In [20]:
# Create dataset split with nouns
X_train_nouns, X_test_nouns, y_train_nouns, y_test_nouns = train_test_split(nouns, data[4].astype("int"), test_size=0.3, random_state=123)

In [21]:
# Nouns dataset
clf = MultinomialNB().fit(X_train_nouns, y_train_nouns)
predicted= clf.predict(X_test_nouns)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test_nouns, predicted))
print("Precision: " + str(metrics.precision_score(y_test_nouns, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_nouns, predicted, average='macro')))

  return array(a, dtype, copy=False, order=order)


ValueError: Expected 2D array, got 1D array instead:
array=[list(['student', 'art', 'contest', 'painting', 'stir', 'painting', 'symbol', 'symbol', 'islam', 'hijab', 'student', 'hijab', 'congressman', 'office', 'violation', 'separation', 'church', 'state', 'activist', 'group', 'enforcement', 'immigration', 'laws.The', 'group', 'success', 'painting', 'congressman', 'student', 'art', 'competition.Because', 'complaint', 'advice', 'issue', 'protest', 'district', 'office', 'example', 'congressman', 'resident', 'activist', 'anything', 'office'])
 list(['Tune', 'broadcast', 'broadcast', 'talk', 'radio', 'custom-made', 'bar', 'fly', 'street', 'corner', 'media-maniacs', 'rascals.Join', 'contributor', 'contributor', 'episode', 'tune', 'hang', 'boil', 'analysis', 'gnashing', 'teeth', 'reject', 'club.This', 'week', 'show', 'role', 'engineering', 'protest', 'news', 'death', 'share', 'program', 'donate', 'page', 'Reference'])
 list(['favoring', 'dispute', 'neighbor', 'way', 'development', 'drilling', 'multi-billion', 'dollar', 'deepwater', 'oil', 'gas', 'project', 'row', 'development', 'oil', 'oil', 'percent', 'world', 's', 'cocoa', 'd', 'official', 'court', 'tribunal', 'correspond', 'claim', 'party', 'angle', 'line', 'news', 'judgment', 'position', 'television', 'dispute', 'work', 'drilling', 'part', 'plan', 'development', 'potential', 'executive', 'oil', 'company', 'lead', 'operator', 'project', 'statement', 'end', 'year', 'production', 'capacity', 'production', 'storage', 'vessel', 'day', 'bpd', 'project', 'ruling', 'claim', 'line', 'relief', 'oil', 'growth', 'crisis', 'compensation', 'oil', 'field', 'area', 'claim', 'loss', 'contract', 'loss', 'revenue', 'economy', 'debt', 'project', 'gold', 'producer', 'oil', 'neighbor', 'oil', 'claim', 'deal', 'border', 'sea'])
 ...
 list(['Defense', 'military', 'ambush', 'week', 'incident', 'affiliate', 'spotlight', 'counterterrorism', 'mission', 'country', 'incident', 'time', 'ambush', 'place', 'area', 'enemy', 'patrol', 'aircraft', 'delay', 'something', 'stance', 'lot', 'board', 'plane', 'patrol', 'dozen', 'attack', 'dozen', 'security', 'source', 'ambush', 'diplomat', 'knowledge', 'incident', 'intelligence', 'contingency', 'place', 'combat', 'mission', 'assistance', 'army', 'intelligence', 'surveillance', 'reconnaissance'])
 list(['conviction', 'gay', 'man', 'officer', 's', 'conviction', 'judge', 'gay', 'vice', 'squad.The', 'judge', 's', 'client', 's', 'conviction', 'conduct', 'exposure', 'nature', 'conduct', 'community', 's', 'case', 'officer', 'restroom', 'part', 'officer', 'himself.Dhanidina', 'presence', 'decoy', 'judge', 'vice', 'squad', 'conduct', 'police', 'department', 'sex', 'stings.While', 'police', 'department', 'response', 'conduct', 'judge', 'argument', 'evidence', 'police', 'gay', 'men.Jim', 'spokesman', 'police', 'sex', 'councilman', 'expert', 'witness', 'conduct', 'era', 'homosexuality', 'leftover', 'century', 'judge', 's', 'decision', 'step', 'police', 'country.Last', 'year', 'police', 'ban', 'video', 'way', 'racism', 'way', 'homophobia', 'multitude', 'police', 'target', 'entrap', 'criminalize', 'community.Photo'])
 list(['gold', 'trader', 'support', 'name', 'business', 'trader', 'court', 'filing', 'lira', 'percent', 'dollar', 'filing', 'wrongdoing', 'case', 'relationship', 'lawyer', 'request', 'comment', 'government', 'government', 'scheme', 'banker', 'custody', 'trial', 'filing', 's', 'delay', 'amount', 'work', 'difficulty', 'trial', 'preparation', 'jail', 'filing', 'government', 'evidence', 'trial', 'government', 'banking', 'scheme', 'filing', 'phone', 'call', 'defendant', 'bank', 'conduit', 'minister', 'wedding', 'day', 'wedding', 'co-defendant', 'license', 'bank', 'regulator', 'bank', 'filing', 's', 'case', 'defendant', 'trial', 'speculation', 'plea', 'case', 'et', 'al'])].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
# POS Tagging dataset
clf = LogisticRegression().fit(X_train_pos, y_train_pos)
predicted= clf.predict(X_test_pos)
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test_pos, predicted))
print("Precision: " + str(metrics.precision_score(y_test_pos, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_pos, predicted, average='macro')))

In [None]:
# POS Tagging dataset
clf = SVC().fit(X_train_pos, y_train_pos)
predicted= clf.predict(X_test_pos)
print("SVM Accuracy:",metrics.accuracy_score(y_test_pos, predicted))
print("Precision: " + str(metrics.precision_score(y_test_pos, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_pos, predicted, average='macro')))

In [None]:
# POS Tagging dataset
clf = RandomForestClassifier().fit(X_train_pos, y_train_pos)
predicted= clf.predict(X_test_pos)
print("Random Forest Accuracy:",metrics.accuracy_score(y_test_pos, predicted))
print("Precision: " + str(metrics.precision_score(y_test_pos, predicted, average='macro')))
print("Recall: " + str(metrics.recall_score(y_test_pos, predicted, average='macro')))