In [1]:
import pandas as pd
import os

In [2]:
folder = 'fakeNewsDataset'

labels = {'legit': 1, 'fake': 0}

df = pd.DataFrame()

In [3]:
for f in ('fake', 'legit'):
    path = os.path.join(folder, f)
    for file in os.listdir(path):
        with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
            txt = infile.read()
            df = df.append([[txt, labels[f]]], ignore_index = True)

df.columns = ['Text', 'Truth']
print(df)

                                                  Text  Truth
0    Alex Jones Vindicated in "Pizzagate" Controver...      0
1    THE BIG DATA CONSPIRACY\n\nGovernment and Sili...      0
2    California Surprisingly Lenient on Auto Emissi...      0
3    Mexicans Are Chomping at the Bit to Stop NAFTA...      0
4    Breaking News: Snapchat to purchase Twitter fo...      0
5    Brexit talks are seeing success: José Manuel B...      0
6    Robots Taking Over the World \n\nRobots are sl...      0
7    BrewDog under fire for accusations of canine i...      0
8    "Tesco will not pay out any money to settle in...      0
9    Uber to open new headquarters in Denmark despi...      0
10   EU Applauds Deutsche Boerse's $14 Billion Take...      0
11   Toshiba's Westinghouse creating thriving job m...      0
12   Ford has been forced by Donald Trump to pull o...      0
13   Amazon to sell Middle East online retailer Sou...      0
14   Wells Fargo profits spike despite legal costs\...      0
15   Elo

In [4]:
df.to_csv('fake_news_data.csv', index=False, encoding='utf-8')

In [5]:
df.head()

Unnamed: 0,Text,Truth
0,"Alex Jones Vindicated in ""Pizzagate"" Controver...",0
1,THE BIG DATA CONSPIRACY\n\nGovernment and Sili...,0
2,California Surprisingly Lenient on Auto Emissi...,0
3,Mexicans Are Chomping at the Bit to Stop NAFTA...,0
4,Breaking News: Snapchat to purchase Twitter fo...,0


In [6]:
df.tail()

Unnamed: 0,Text,Truth
475,Machine Learning Opens Up New Ways to Help Dis...,1
476,YouTube automates sound effect captions with A...,1
477,Solar-powered 'skin' could make prosthetics mo...,1
478,Uber Self-Driving Car Tests Resume Three Days ...,1
479,Apple's Devices Lose Luster in American Classr...,1


Unnamed: 0,Text,Truth
475,Machine Learning Opens Up New Ways to Help Dis...,1
476,YouTube automates sound effect captions with A...,1
477,Solar-powered 'skin' could make prosthetics mo...,1
478,Uber Self-Driving Car Tests Resume Three Days ...,1
479,Apple's Devices Lose Luster in American Classr...,1


In [6]:
import string
import nltk
from nltk.tokenize import word_tokenize

df['Tokenized'] = df.apply(lambda row: nltk.word_tokenize(row['Text'].lower().translate(str.maketrans('','',string.punctuation))), axis=1)

df.Tokenized[0]

['alex',
 'jones',
 'vindicated',
 'in',
 'pizzagate',
 'controversy',
 'alex',
 'jones',
 'purveyor',
 'of',
 'the',
 'independent',
 'investigative',
 'news',
 'website',
 'infowars',
 'and',
 'host',
 'of',
 'the',
 'alex',
 'jones',
 'show',
 'has',
 'been',
 'vindicated',
 'in',
 'his',
 'claims',
 'regarding',
 'the',
 'socalled',
 'pizzagate',
 'controversy',
 'jones',
 'and',
 'others',
 'uncovered',
 'evidence',
 'last',
 'year',
 'that',
 'top',
 'democratic',
 'party',
 'officials',
 'were',
 'involved',
 'in',
 'a',
 'bizarre',
 'satanic',
 'child',
 'sex',
 'cult',
 'and',
 'pornography',
 'ring',
 'using',
 'the',
 'washington',
 'dc',
 'pizza',
 'parlor',
 'comet',
 'ping',
 'pong',
 'pizza',
 'as',
 'a',
 'front',
 'the',
 'allegations',
 'rocked',
 'the',
 'democratic',
 'party',
 'and',
 'may',
 'have',
 'caused',
 'serious',
 'damage',
 'to',
 'the',
 'hillary',
 'clinton',
 'presidential',
 'campaign',
 'top',
 'us',
 'federal',
 'investigators',
 'have',
 'now',
 '

In [7]:
df.head()

Unnamed: 0,Text,Truth,Tokenized
0,"Alex Jones Vindicated in ""Pizzagate"" Controver...",0,"[alex, jones, vindicated, in, pizzagate, contr..."
1,THE BIG DATA CONSPIRACY\n\nGovernment and Sili...,0,"[the, big, data, conspiracy, government, and, ..."
2,California Surprisingly Lenient on Auto Emissi...,0,"[california, surprisingly, lenient, on, auto, ..."
3,Mexicans Are Chomping at the Bit to Stop NAFTA...,0,"[mexicans, are, chomping, at, the, bit, to, st..."
4,Breaking News: Snapchat to purchase Twitter fo...,0,"[breaking, news, snapchat, to, purchase, twitt..."


In [10]:
df = df.drop('Text', axis=1)
df.head()

Unnamed: 0,Truth,Tokenized
0,0,"[alex, jones, vindicated, in, pizzagate, contr..."
1,0,"[the, big, data, conspiracy, government, and, ..."
2,0,"[california, surprisingly, lenient, on, auto, ..."
3,0,"[mexicans, are, chomping, at, the, bit, to, st..."
4,0,"[breaking, news, snapchat, to, purchase, twitt..."


In [11]:
df = df[['Tokenized','Truth']]
df.head()

Unnamed: 0,Tokenized,Truth
0,"[alex, jones, vindicated, in, pizzagate, contr...",0
1,"[the, big, data, conspiracy, government, and, ...",0
2,"[california, surprisingly, lenient, on, auto, ...",0
3,"[mexicans, are, chomping, at, the, bit, to, st...",0
4,"[breaking, news, snapchat, to, purchase, twitt...",0


In [12]:
df.to_csv('tokenized_data.csv', index=False, encoding='utf-8')

In [80]:
filepath = "NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"
lexicon = (pd.read_csv(filepath,  names=["word", "emotion", "association"], sep='\t'))

for row in df['Tokenized']:
    
    anger = lexicon[(lexicon['association'] == 1) & (lexicon['emotion'] == "anger")]
    anticipation = lexicon[(lexicon['association'] == 1) & (lexicon['emotion'] == "anticipation")]
    disgust = lexicon[(lexicon['association'] == 1) & (lexicon['emotion'] == "disgust")]
    fear = lexicon[(lexicon['association'] == 1) & (lexicon['emotion'] == "fear")]
    joy = lexicon[(lexicon['association'] == 1) & (lexicon['emotion'] == "joy")]
    sadness = lexicon[(lexicon['association'] == 1) & (lexicon['emotion'] == "sadness")]
    surprise = lexicon[(lexicon['association'] == 1) & (lexicon['emotion'] == "surprise")]
    trust = lexicon[(lexicon['association'] == 1) & (lexicon['emotion'] == "trust")]
    
    for word in row:
        anger_score = 0
        df['anger_score'] == anger_score
        anticipation_score = 0
        disgust_score = 0
        fear_score = 0
        joy_score = 0
        sadness_score = 0
        surprise_score = 0
        trust_score = 0
        
        if word in anger.word.values:
            print(word)
#         anger_score += 1
#         if word in anticipation.word.values:
#             anticipation_score += 1
#         if word in disgust.word.values:
#             disgust_score += 1
#         if word in fear.word.values:
#             fear_score += 1
#         if word in joy.word.values:
#             joy_score += 1
#         if word in sadness.word.values:
#             sadness_score += 1
#         if word in surprise.word.values:
#             surprise_score += 1
#         if word in trust.word.values:
#             trust_score += 1
            


satanic
damage
accused
misleading
terrorist
shooting
vote
satanic
damage
accused
misleading
terrorist
shooting
vote
criticism
loss
criticism
loss
involvement
bark
lawsuit
money
fraud
court
involvement
bark
lawsuit
money
fraud
court
dominate
opposition
bankruptcy
frustrated
unfairness
row
dominate
opposition
bankruptcy
frustrated
unfairness
row
court
excite
court
excite
turbulence
sterling
sore
turbulence
sterling
sore
moral
outrage
involvement
spite
moral
outrage
involvement
spite
spat
dispute
challenge
spat
dispute
challenge
fight
campaigning
fight
campaigning
brutal
vote
brutal
vote
scarcity
force
slavery
cash
scarcity
force
slavery
cash
death
death
guilty
fraud
prison
death
death
guilty
fraud
prison
bankruptcy
bankruptcy
fighting
bias
bankruptcy
bankruptcy
fighting
bias
moral
scream
deserve
money
moral
scream
deserve
money
argument
court
ridiculous
court
socialist
argument
court
ridiculous
court
socialist
tumultuous
foe
remove
hostile
fight
violent
force
involvement
tumultuous
foe
r

KeyboardInterrupt: 

KeyboardInterrupt: 

In [78]:
print(df)

                                                  Text  Truth  \
0    Alex Jones Vindicated in "Pizzagate" Controver...      0   
1    THE BIG DATA CONSPIRACY\n\nGovernment and Sili...      0   
2    California Surprisingly Lenient on Auto Emissi...      0   
3    Mexicans Are Chomping at the Bit to Stop NAFTA...      0   
4    Breaking News: Snapchat to purchase Twitter fo...      0   
5    Brexit talks are seeing success: José Manuel B...      0   
6    Robots Taking Over the World \n\nRobots are sl...      0   
7    BrewDog under fire for accusations of canine i...      0   
8    "Tesco will not pay out any money to settle in...      0   
9    Uber to open new headquarters in Denmark despi...      0   
10   EU Applauds Deutsche Boerse's $14 Billion Take...      0   
11   Toshiba's Westinghouse creating thriving job m...      0   
12   Ford has been forced by Donald Trump to pull o...      0   
13   Amazon to sell Middle East online retailer Sou...      0   
14   Wells Fargo profits 

                                                  Text  Truth  \
0    Alex Jones Vindicated in "Pizzagate" Controver...      0   
1    THE BIG DATA CONSPIRACY\n\nGovernment and Sili...      0   
2    California Surprisingly Lenient on Auto Emissi...      0   
3    Mexicans Are Chomping at the Bit to Stop NAFTA...      0   
4    Breaking News: Snapchat to purchase Twitter fo...      0   
5    Brexit talks are seeing success: José Manuel B...      0   
6    Robots Taking Over the World \n\nRobots are sl...      0   
7    BrewDog under fire for accusations of canine i...      0   
8    "Tesco will not pay out any money to settle in...      0   
9    Uber to open new headquarters in Denmark despi...      0   
10   EU Applauds Deutsche Boerse's $14 Billion Take...      0   
11   Toshiba's Westinghouse creating thriving job m...      0   
12   Ford has been forced by Donald Trump to pull o...      0   
13   Amazon to sell Middle East online retailer Sou...      0   
14   Wells Fargo profits 

# Regular Processing

In [None]:
# import string
# import nltk
# from nltk.tokenize import word_tokenize

# complete_text = df.Text.str.cat(sep=' ')

# tokens = complete_text.lower()
# tokens = tokens.translate(str.maketrans('','',string.punctuation))
# tokens = word_tokenize(tokens)

In [None]:
# X_train = df.loc[:478, 'Text'].values
# y_train = df.loc[:478, 'Truth'].values

# X_test = df.loc[479:, 'Text'].values
# y_test = df.loc[479:, 'Truth'].values

In [None]:
# from sklearn.feature_extraction.text import TfidfTransformer
# from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer = TfidfVectorizer()
# train_vectors = vectorizer.fit_transform(X_train)
# test_vectors = vectorizer.transform(X_test)

# print(train_vectors.shape, test_vectors.shape)

In [None]:
# from sklearn.naive_bayes import MultinomialNB
# clf = MultinomialNB().fit(train_vectors, y_train)

In [None]:
# from  sklearn.metrics  import accuracy_score
# predicted = clf.predict(test_vectors)
# print(accuracy_score(y_test,predicted))

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
# count_vect = CountVectorizer()

# X_train_counts = count_vect.fit_transform(df.Text[:479])
# X_train_counts.shape

In [None]:
# from sklearn.feature_extraction.text import TfidfTransformer
# tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
# X_train_tf = tf_transformer.transform(X_train_counts)
# X_train_tf.shape

In [None]:
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# X_train_tfidf.shape

In [None]:
# from sklearn.naive_bayes import MultinomialNB
# clf = MultinomialNB().fit(X_train_tfidf, df.Truth[:479])

In [None]:
# docs_new = df.Text[479]
# docs_new = [docs_new]
# X_new_counts = count_vect.transform(docs_new)
# X_new_tfidf = tfidf_transformer.transform(X_new_counts)

# predicted = clf.predict(X_new_tfidf)

# print('%r => %s' % (docs_new, df.Truth[predicted]))

In [None]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [None]:
text_clf.fit(df.Text[:479], df.Truth[:479])  

In [None]:
import numpy as np

docs_test = df.Text[479]
docs_test = [docs_test]
predicted = text_clf.predict(docs_test)

np.mean(predicted == df.Truth[479])

In [None]:
filepath = "NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"
lexicon = (pd.read_csv(filepath,  names=["word", "emotion", "association"], sep='\t'))

for row in df['Tokenized']:
    emotion_score = {"anger": 0, "fear": 0, "anticipation": 0, "trust": 0, "surprise": 0, "sadness": 0, "joy": 0, "disgust": 0}
    
    for item in row:
        (if item in lexicon.word.values) and (if lexicon.association.values.index(item) == 1) and (if lexicon.emotion.values.index(item) == "anger"):
            emotion_score['anger'] += 1
        
        if item in lexicon.word.values & if lexicon.association.values.index(item) == 1 & if lexicon.emotion.values.index(item) == "fear":
            emotion_score['fear'] += 1

        if item in lexicon.word.values & if lexicon.association.values.index(item) == 1 & if lexicon.emotion.values.index(item) == "anticipation":
            emotion_score['anticipation'] += 1

        if item in lexicon.word.values and if lexicon.association.values.index(item) == 1 and if lexicon.emotion.values.index(item) == "trust":
            emotion_score['trust'] += 1

        if item in lexicon.word.values and if lexicon.association.values.index(item) == 1 and if lexicon.emotion.values.index(item) == "surprise":
            emotion_score['surprise'] += 1

        if item in lexicon.word.values and if lexicon.association.values.index(item) == 1 and if lexicon.emotion.values.index(item) == "sadness":
            emotion_score['sadness'] += 1  

        if item in lexicon.word.values and if lexicon.association.values.index(item) == 1 and if lexicon.emotion.values.index(item) == "joy":
            emotion_score['joy'] += 1

        if item in lexicon.word.values and if lexicon.association.values.index(item) == 1 and if lexicon.emotion.values.index(item) == "disgust":
            emotion_score['disgust'] += 1
        
    return emotion_score

# df['Emotion'] = emotion_score
    
print(df)

In [None]:
import numpy as np
filepath = "NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"
lexicon = (pd.read_csv(filepath,  names=["word", "emotion", "association"], sep='\t'))

for row in df['Tokenized']:
    for item in row:
        if (item in lexicon.word.values):
            index_array = list([np.where(lexicon.word.values == item)])
            for i in index_array:
                if lexicon.association.values[i].any() == 1:
                    string = lexicon.emotion.values[i]
                    print(string)