In [None]:
import sys
sys.path.append('..')

In [None]:
import pandas as pd
import numpy as np
import re
import json
from nltk.corpus import stopwords
import Utils.plot as plot_helper
import Utils.dataframe as dataframe_helper
import Utils.dict as dict_helper
import Utils.calculate as calculate

In [None]:
# How many words with the largest |delta| are kept when ranking (slice bound for `top_words`).
TOP_WORDS = 1000
# How many stop-word-free words per class are written to the `top{N}_*_word.txt` exports.
N = 10
# Slice bound applied to the shared-vocabulary lists before the KL computation;
# None keeps every shared word.
NUMBER_OF_SELECTED_WORDS = None

In [None]:
# Lift pandas' display truncation so every column and the full cell text are shown.
for option_name in ('display.max_columns', 'max_colwidth'):
    pd.set_option(option_name, None)

In [None]:
# Load the raw reviews and run the project's cleaning step (both helpers live in Utils.dataframe).
df = dataframe_helper.load_data()
df = dataframe_helper.data_cleaning(df)

# Keep only the review text and its label. `.copy()` makes this an independent frame,
# so the column assignment below cannot trigger SettingWithCopyWarning.
df = df[['reviewContent', 'flagged']].copy()

# Map the raw flags to readable labels. The result is assigned back instead of calling
# `replace(..., inplace=True)` on `df['flagged']`: that chained in-place form does not
# update `df` under pandas Copy-on-Write, which would leave both splits below empty.
df['flagged'] = df['flagged'].replace({'Y': 'Fake', 'N': 'Genuine'})

# One frame per class, each re-indexed from 0, plus their concatenation (fake rows first).
fake_df = df[df['flagged'] == 'Fake'].reset_index(drop=True)
genuine_df = df[df['flagged'] == 'Genuine'].reset_index(drop=True)
all_df = pd.concat([fake_df, genuine_df]).reset_index(drop=True)

In [None]:
# df = pd.read_csv('../Data/BBC/BBC_News_Train.csv')

# sport_df = df[df['Category'] == 'sport']
# sport_df.reset_index(drop=True, inplace=True)

# tech_df = df[df['Category'] == 'tech']
# tech_df.reset_index(drop=True, inplace=True)

# all_df = pd.concat([sport_df, tech_df])
# all_df.reset_index(drop=True, inplace=True)


In [None]:
# Build one text string per class from the 'reviewContent' column.
# The second positional argument (10) is forwarded unchanged; its meaning is defined
# by get_str_from_df in Utils.dataframe — TODO confirm and promote to a named constant.
text_string_fake, text_string_genuine = (
    dataframe_helper.get_str_from_df(class_df, 10, 'reviewContent')
    for class_df in (fake_df, genuine_df)
)

In [None]:
# with open ('../Data/book/fake_book.txt', 'r', encoding='utf-8' ) as f:
#      text_string_fake = f.read().lower()

# with open ('../Data/book/genuine_book.txt', 'r', encoding='utf-8' ) as f:
#      text_string_genuine = f.read().lower()

# Zipf distribution
- By Zipf's law, the most frequent word occurs approximately twice as often as the second most frequent word, three times as often as the third most frequent word, and so on.

    ![Zipf_distribution_CMF](../Picture/Zipf_distribution_CMF.png "Zipf_distribution_CMF") 
    
    Zipf_distribution_CMF

    ![Zipf_distribution_PMF](../Picture/Zipf_distribution_PMF.png "Zipf_distribution_PMF") 
    
    Zipf_distribution_PMF

In [None]:
# Word-frequency mapping for the fake-review text, with stop words kept
# (remove_stop_word=False). Later cells read it as word -> count.
dict_fake = dict_helper.get_most_frequent(text_string_fake, remove_stop_word=False)
# Print the most frequent entries (format and cut-off are defined in Utils.dict).
dict_helper.print_most_frequent(dict_fake)

In [None]:
# Word-frequency mapping for the genuine-review text, with stop words kept
# (remove_stop_word=False). Later cells read it as word -> count.
dict_genuine = dict_helper.get_most_frequent(text_string_genuine, remove_stop_word=False)
# Print the most frequent entries (format and cut-off are defined in Utils.dict).
dict_helper.print_most_frequent(dict_genuine)

In [None]:
# One frequency dict per corpus. The keys are the only labels handed to the plot helper
# (presumably shown in the legend — confirm in Utils.plot), so the "copus" typo is fixed here.
combined_dict = {'fake corpus': dict_fake, 'genuine corpus': dict_genuine}
# The positional arguments 50, 20, 8 are forwarded unchanged; their meaning is defined
# by zipf_plot in Utils.plot — TODO confirm and name them.
plot_helper.zipf_plot(combined_dict, 50, 20, 8)

In [None]:
# Shared vocabulary (stop words included): only words that occur in BOTH corpora are kept,
# so every word has a non-missing count on each side.
common_keys = dict_fake.keys() & dict_genuine.keys()

# Order by fake-corpus count, descending. The word itself is the tie-breaker: without it,
# ties follow set iteration order, which changes between kernel restarts and makes
# `words_list` (and any truncation of it) non-reproducible.
common_dict_fake = dict(
    sorted(
        ((key, dict_fake[key]) for key in common_keys),
        key=lambda item: (-item[1], item[0]),
    )
)
# Same words, same order, with genuine-corpus counts.
common_dict_genuine = {key: dict_genuine[key] for key in common_dict_fake}

# Select the words first (None keeps all of them) ...
words_list = list(common_dict_fake)[:NUMBER_OF_SELECTED_WORDS]
selected_counts_fake = [common_dict_fake[word] for word in words_list]
selected_counts_genuine = [common_dict_genuine[word] for word in words_list]

# ... then normalise over exactly the selected words. Normalising by the total of the whole
# shared vocabulary and truncating afterwards would give lists that no longer sum to 1
# whenever NUMBER_OF_SELECTED_WORDS is not None, so they would not be probability
# distributions for the KL computation. With None the result is unchanged.
sum_fake = sum(selected_counts_fake)
sum_genuine = sum(selected_counts_genuine)

prob_common_dict_fake = [count / sum_fake for count in selected_counts_fake]
prob_common_dict_genuine = [count / sum_genuine for count in selected_counts_genuine]

In [None]:
# KL divergence in both directions between the fake (P) and genuine (Q) word distributions.
kl_pq = calculate.kl_divergence(prob_common_dict_fake, prob_common_dict_genuine)
print('KL(P || Q): %.3f bits' % kl_pq)
kl_qp = calculate.kl_divergence(prob_common_dict_genuine, prob_common_dict_fake)
print('KL(Q || P): %.3f bits' % kl_qp)

# One delta per word, aligned position-by-position with `words_list`.
delta_words = calculate.delta_kl_divergence_list(prob_common_dict_fake, prob_common_dict_genuine)

# Split by sign: zero counts as positive; values that are neither >= 0 nor < 0 land in neither group.
positive_pairs = [(idx, delta) for idx, delta in enumerate(delta_words) if delta >= 0]
positive_indexs = [idx for idx, _ in positive_pairs]
positive_deltas = [delta for _, delta in positive_pairs]
positive_words = [words_list[idx] for idx in positive_indexs]

negative_pairs = [(idx, delta) for idx, delta in enumerate(delta_words) if delta < 0]
negative_indexs = [idx for idx, _ in negative_pairs]
negative_deltas = [delta for _, delta in negative_pairs]
negative_words = [words_list[idx] for idx in negative_indexs]

In [None]:
# Print the words whose delta is >= 0 together with their delta values.
dict_helper.print_word_and_delta(positive_words, positive_deltas)

In [None]:
# Print the non-negative-delta words against both shared-vocabulary count dicts.
# (The helper really is spelled `print_comporison` in Utils.dict.)
dict_helper.print_comporison(positive_words, common_dict_fake, common_dict_genuine)

In [None]:
# Print the words whose delta is < 0 together with their delta values.
dict_helper.print_word_and_delta(negative_words, negative_deltas)

In [None]:
# Print the negative-delta words against both shared-vocabulary count dicts.
# (The helper really is spelled `print_comporison` in Utils.dict.)
dict_helper.print_comporison(negative_words, common_dict_fake, common_dict_genuine)

In [None]:
# Map each word to its signed delta, then order the mapping from largest to smallest delta.
word_delta_pairs = zip(words_list, delta_words)
delta_words_dict = dict(
    sorted(dict(word_delta_pairs).items(), key=lambda pair: pair[1], reverse=True)
)

In [None]:
# Display the word -> signed delta mapping, ordered from largest to smallest delta.
delta_words_dict

In [None]:
# Plot the two per-word probability lists against `words_list`.
# The trailing 30, 15, 50 are forwarded unchanged; their meaning is defined by
# plot_words_distribution in Utils.plot — TODO confirm and name them.
plot_helper.plot_words_distribution(prob_common_dict_fake, prob_common_dict_genuine, words_list, 30, 15, 50)

In [None]:
# Rank words by the magnitude of their delta, regardless of sign.
abs_delta_words_dict = dict(zip(words_list, (abs(delta) for delta in delta_words)))
ranked_by_magnitude = sorted(
    abs_delta_words_dict.items(), key=lambda pair: pair[1], reverse=True
)
abs_delta_words_dict = dict(ranked_by_magnitude)

# Keep the TOP_WORDS words with the largest |delta|.
top_words = [word for word, _ in ranked_by_magnitude[:TOP_WORDS]]

In [None]:
# List the top words in |delta| order, showing each word's signed delta.
for word in top_words:
    signed_delta = delta_words_dict[word]
    print(f'{word}: {signed_delta}')

In [None]:
# Split the top words by the sign of their delta, keeping the |delta| ordering.
# Words with a delta of exactly 0 go into neither dict.
top_words_fake, top_words_genuine = {}, {}
for word in top_words:
    signed_delta = delta_words_dict[word]
    if signed_delta > 0:
        top_words_fake[word] = signed_delta
    elif signed_delta < 0:
        top_words_genuine[word] = signed_delta

In [None]:
# Display the top words with a positive delta (stop words still included).
top_words_fake

In [None]:
# Display the top words with a negative delta (stop words still included).
top_words_genuine

In [None]:
# English stop-word list from NLTK, used below to filter the top words.
# NOTE(review): needs the NLTK 'stopwords' corpus to be available locally — confirm the setup steps.
stop = stopwords.words('english')

In [None]:
# Drop English stop words from the positive-delta words, then display what is left.
pure_top_words_fake = {
    word: signed_delta
    for word, signed_delta in top_words_fake.items()
    if word not in stop
}
pure_top_words_fake

In [None]:
# Drop English stop words from the negative-delta words, then display what is left.
pure_top_words_genuine = {
    word: signed_delta
    for word, signed_delta in top_words_genuine.items()
    if word not in stop
}
pure_top_words_genuine

In [None]:
# First N stop-word-free words of each class, in |delta| order (iterating a dict yields its keys).
pure_top_genuine_words_fake = list(pure_top_words_fake)[:N]
pure_top_genuine_words_genuine = list(pure_top_words_genuine)[:N]

In [None]:
# Persist the top-N positive-delta words as the text repr of a Python list.
# The explicit UTF-8 encoding keeps the file identical on every platform instead of
# depending on the locale's default encoding.
with open(f"../Data/top_word/top{N}_fake_word.txt", "w", encoding="utf-8") as output:
    output.write(str(pure_top_genuine_words_fake))

In [None]:
# Persist the top-N negative-delta words as the text repr of a Python list.
# The explicit UTF-8 encoding keeps the file identical on every platform instead of
# depending on the locale's default encoding.
with open(f"../Data/top_word/top{N}_genuine_word.txt", "w", encoding="utf-8") as output:
    output.write(str(pure_top_genuine_words_genuine))

In [None]:
# How many stop-word-free words remain on each side (fake, genuine).
print(len(pure_top_words_fake), len(pure_top_words_genuine))

In [None]:
# Persist every stop-word-free positive-delta word as the text repr of a Python list.
# The path has no placeholders, so a plain string replaces the f-string; the explicit
# UTF-8 encoding avoids depending on the locale's default encoding.
with open("../Data/top_word/top_fake_word.txt", "w", encoding="utf-8") as output:
    output.write(str(list(pure_top_words_fake)))

In [None]:
# Persist every stop-word-free negative-delta word as the text repr of a Python list.
# The path has no placeholders, so a plain string replaces the f-string; the explicit
# UTF-8 encoding avoids depending on the locale's default encoding.
with open("../Data/top_word/top_genuine_word.txt", "w", encoding="utf-8") as output:
    output.write(str(list(pure_top_words_genuine)))

In [None]:
# Re-display the stop-word-free positive-delta words with their deltas.
pure_top_words_fake

In [None]:
# Re-display the stop-word-free negative-delta words with their deltas.
pure_top_words_genuine

In [None]:
# Spot-check a single word ('cup') against both shared-vocabulary count dicts.
# NOTE(review): the printed content is defined by print_summary_kl in Utils.calculate.
calculate.print_summary_kl('cup', common_dict_fake, common_dict_genuine)

TF-IDF Approach

In [None]:
# Tokenise both corpora with one shared pattern: a letter (either case) followed by
# 2-9 lowercase letters, bounded by word boundaries — i.e. tokens of 3 to 10 letters.
word_pattern = re.compile(r'\b[A-Za-z][a-z]{2,9}\b')
bagOfWordsA = word_pattern.findall(text_string_fake)
bagOfWordsB = word_pattern.findall(text_string_genuine)

In [None]:
# Vocabulary shared by the TF-IDF tables: every token seen in either corpus.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)

In [None]:
def _zero_filled_counts(vocabulary, tokens):
    """Count `tokens`, reporting 0 for every vocabulary word that never occurs."""
    counts = dict.fromkeys(vocabulary, 0)
    for token in tokens:
        counts[token] += 1
    return counts


# Per-corpus counts over the full vocabulary (A = fake, B = genuine).
numOfWordsA = _zero_filled_counts(uniqueWords, bagOfWordsA)
numOfWordsB = _zero_filled_counts(uniqueWords, bagOfWordsB)

In [None]:
# Term frequencies per corpus (A = fake, B = genuine), computed from the count dict
# and the token list. NOTE(review): presumably count / number of tokens — confirm
# against computeTF in Utils.calculate.
tfA = calculate.computeTF(numOfWordsA, bagOfWordsA)
tfB = calculate.computeTF(numOfWordsB, bagOfWordsB)

In [None]:
# Sanity check: both count dicts were keyed by `uniqueWords`, so the two sizes should
# match if computeTF keeps every key — TODO confirm.
len(tfA), len(tfB)

In [None]:
# Number of fake vs genuine reviews.
len(fake_df), len(genuine_df)

In [None]:
# idfs = calculate.computeIDF(uniqueWords, all_df, 'reviewContent')
# idfs

In [None]:
# Load the cached IDF values from JSON rather than recomputing them.
with open("../Data/tfidf/idfs.json", "r") as idfs_file:
    idfs = json.loads(idfs_file.read())

In [None]:
# Combine each corpus' term frequencies with the shared IDF values (A = fake, B = genuine).
tfidfA = calculate.computeTFIDF(tfA, idfs)
tfidfB = calculate.computeTFIDF(tfB, idfs)
# One row per corpus. NOTE(review): this rebinds `df`, which held the labelled reviews
# frame earlier in the notebook; a distinct name would avoid stale state on re-runs.
df = pd.DataFrame([tfidfA, tfidfB])

In [None]:
# Label the two rows (assigned in place on `df`), then transpose so that each row is a
# word and the columns are 'fake' and 'genuine'.
df.index = ['fake', 'genuine']
tfidf = df.T

In [None]:
# Words with a non-zero score in at least one corpus: top 10 by fake-corpus score.
has_any_score = (tfidf['fake'] != 0) | (tfidf['genuine'] != 0)
tfidf[has_any_score].sort_values(by='fake', ascending=False).head(10)

In [None]:
# Words with a non-zero score in at least one corpus: top 10 by genuine-corpus score.
has_any_score = (tfidf['fake'] != 0) | (tfidf['genuine'] != 0)
tfidf[has_any_score].sort_values(by='genuine', ascending=False).head(10)

In [None]:
# Spot-check a single word ('married') using both TF dicts and the IDF values.
# NOTE(review): the printed content is defined by print_summary_tf_idf in Utils.calculate.
calculate.print_summary_tf_idf('married', tfA, tfB, idfs)

Summary

In [None]:
# Combined spot-check for one word ('extra'): shared-vocabulary counts plus TF and IDF inputs.
# NOTE(review): the printed content is defined by print_summary in Utils.calculate.
calculate.print_summary('extra', common_dict_fake, common_dict_genuine, tfA, tfB, idfs)

In [None]:
# Top 10 words by fake-corpus score, with no non-zero filter applied.
tfidf.sort_values(by='fake', ascending=False).head(10)

In [None]:
# Top 10 words by genuine-corpus score, with no non-zero filter applied.
tfidf.sort_values(by='genuine', ascending=False).head(10)

In [None]:
# Words with a non-zero score in BOTH corpora, ordered by fake-corpus score.
# NOTE(review): there is no .head() here, so the whole filtered frame is rendered.
tfidf.loc[(tfidf['fake'] != 0) & (tfidf['genuine'] != 0)].sort_values(by='fake', ascending=False)

In [None]:
# Re-display the word -> signed delta mapping for comparison with the TF-IDF tables above.
delta_words_dict

In [None]:
# tfidf.to_csv('../Data/tfidf/tfidf.csv', index=True)

In [None]:
# with open("../Data/tfidf/idfs.json", "w") as outfile:
#     json.dump(idfs, outfile)