#### Rating Distribution Pie Charts

In [None]:
# Pie charts of the star-rating distribution: one pie per predicted label
# (False / True), each sliced by rating.
data = pd.read_csv('Predict_SVM.csv') # the prediction results using SVM
# Take an explicit copy of the two columns so the assignment below writes to
# an independent frame instead of a slice of the full table (which raises
# SettingWithCopyWarning).
data = data[['rating', 'predict_SVM']].copy()
# True where the SVM output equals 1, False for every other value.
data['predict_SVM'] = data['predict_SVM'] == 1
data.reset_index(level = 0, inplace = True)
# Rows: rating, columns: predicted label, values: number of reviews.
label_counts = data.groupby(['rating'])['predict_SVM'].value_counts().unstack()
d = label_counts.plot.pie(subplots = True, autopct = '%.2f%%', figsize = (10,4.6))

#### Reviews Text Length Histograms

In [None]:
# Load the SVM prediction results again; the 'review', 'predict_SVM' and
# 'rating' columns of this frame are used by the text-length analysis below.
df = pd.read_csv('Predict_SVM.csv')

def fix_review(input_review): # fix review content
    """Normalise a raw review into a list of lowercase word tokens.

    Every character that is neither a word character nor whitespace is
    removed, the text is lower-cased, and the result is split on runs of
    whitespace.

    Args:
        input_review: Raw review text. Any non-string value (for example
            the NaN that pandas uses for an empty CSV cell) is treated as
            an empty review.

    Returns:
        list[str]: The words of the review in their original order; an
        empty list when no word is left.
    """
    if not isinstance(input_review, str):
        return []
    without_punctuation = re.sub(r'[^\w\s]', '', input_review)
    # split() with no argument breaks on every kind of whitespace (spaces,
    # tabs, newlines, non-breaking spaces) and never yields empty strings,
    # so words separated by '\xa0' or '\t' stay separate tokens instead of
    # being glued together.
    return without_punctuation.lower().split()
# Replace each raw review with its list of word tokens.
df['review'] = df['review'].apply(fix_review)

# Number of tokens per review. The whole column is assigned in one step:
# writing element by element through df['text_length'].loc[i] is chained
# assignment, which raises SettingWithCopyWarning and does not update df at
# all when pandas Copy-on-Write is enabled.
df['text_length'] = df['review'].apply(len)

# Split the reviews by SVM output: 1 goes to df_true, 0 goes to df_fake.
df_true = df[df['predict_SVM'] == 1]
df_fake = df[df['predict_SVM'] == 0]

bins = list(range(1, 40)) # set histogram bins (edges 1, 2, ..., 39)

COL_NUM = 5
ROW_NUM = 1


def _plot_length_histograms(label_df, bins):
    """Draw one histogram per rating in a single row of subplots.

    Args:
        label_df: DataFrame with 'text_length' and 'rating' columns.
        bins: Bin edges passed to every histogram.

    Returns:
        tuple: (length_rating, fig, axes), where length_rating is the table
        of review counts indexed by text length with one column per rating.
        Only the first COL_NUM rating columns are drawn.
    """
    length_rating = label_df.groupby(['text_length', 'rating']).size().unstack()
    fig, axes = plt.subplots(ROW_NUM, COL_NUM, figsize=(25,5))
    for ax, (rating, counts) in zip(axes, length_rating.items()):
        # NOTE(review): `counts` holds the number of reviews for each text
        # length, so this is a histogram of those counts. If the intent is
        # the distribution of text lengths themselves, histogram the
        # 'text_length' column per rating instead - confirm.
        counts.plot.hist(grid=True, bins=bins, rwidth=1, ax=ax)
        # Style the subplot being drawn; plt.grid() would only touch the
        # current axes, which is the last subplot of the row.
        ax.grid(axis='y', alpha=0.75)
        ax.set_title(f"Stars: {rating}")
        ax.set_ylim([0, 130])
    fig.tight_layout()
    return length_rating, fig, axes


# Reviews predicted as true.
length_rating, fig, axes = _plot_length_histograms(df_true, bins)

# Reviews predicted as fake.
length_rating, fig, axes = _plot_length_histograms(df_fake, bins)

#### Review Text Word Cloud

In [None]:
def _join_reviews(token_lists):
    """Join tokenised reviews into one space-separated string.

    A space is also placed between consecutive reviews; without it the last
    word of one review fuses with the first word of the next and shows up
    in the cloud as a bogus word.
    """
    return ' '.join(' '.join(tokens) for tokens in token_lists)


def _show_wordcloud(text):
    """Render and display a 40-word cloud of `text`; return the WordCloud."""
    plt.figure(figsize=(12,12))
    cloud = WordCloud(stopwords=STOPWORDS,background_color='white',width=1200,height=800,max_words=40).generate(text)
    plt.imshow(cloud)
    plt.axis('off')
    plt.show()
    return cloud


true_text = _join_reviews(df_true['review'])
fake_text = _join_reviews(df_fake['review'])
true_string = true_text.replace('\n\n', ' ').replace('\n', ' ')
fake_string = fake_text.replace('\n\n', ' ').replace('\n', ' ')

# Word cloud of the reviews predicted as true, then of those predicted as fake.
wordcloud = _show_wordcloud(true_string)
wordcloud = _show_wordcloud(fake_string)

#### Weighted Sentiment Analysis

In [None]:
def vader_comparison(texts):
    """Print a table of sentence-averaged sentiment scores, one row per text.

    Each text is split into sentences with sent_tokenize, every sentence is
    scored with SentimentIntensityAnalyzer.polarity_scores, and the 'pos',
    'neg', 'neu' and 'compound' scores are averaged over the sentences.

    Args:
        texts: Sequence of (name, text) pairs; name labels the printed row.

    Returns:
        None. The table is written to standard output.
    """
    print("Name\t",'  pos\t','neg\t','neu\t','compound')
    analyzer = SentimentIntensityAnalyzer()
    for entry in texts:
        name = entry[0]
        sentences = sent_tokenize(entry[1])
        sentence_count = len(sentences)
        # Running means: each sentence contributes score / sentence_count.
        means = {'pos': 0, 'neg': 0, 'neu': 0, 'compound': 0}
        for sentence in sentences:
            scores = analyzer.polarity_scores(sentence)
            for key in means:
                means[key] += scores[key]/sentence_count
        print(
            '%-10s'%name,
            '%1.2f\t'%means['pos'],
            '%1.2f\t'%means['neg'],
            '%1.2f\t'%means['neu'],
            '%1.2f\t'%means['compound'],
        )

def _concat_reviews(reviews):
    """Strip each review, remove its newlines and backslashes, and concatenate.

    NOTE(review): the reviews are joined with no separator, so the end of
    one review runs straight into the start of the next before sentence
    tokenisation - confirm this is intended.
    """
    return ''.join(
        review.strip().replace('\n\n', '').replace('\n', '').replace("\\", '')
        for review in reviews
    )


# Fresh copy of the predictions with the raw (untokenised) review text.
df1 = pd.read_csv('Predict_SVM.csv')
df1_true = df1[df1['predict_SVM'] == 1]
df1_fake = df1[df1['predict_SVM'] == 0]

true_text = _concat_reviews(df1_true['review'])
fake_text = _concat_reviews(df1_fake['review'])
texts = [('true', true_text), ('fake', fake_text)]

vader_comparison(texts)

# NOTE(review): y1 / y2 are hard-coded; presumably they were copied from the
# table printed by vader_comparison above - confirm, and update them whenever
# the data changes.
x = ['pos', 'neg', 'neu', 'compound']
y1 = [0.20, 0.03, 0.77, 0.34]
y2 = [0.16, 0.08, 0.76, 0.14]
# Line plot comparing the four scores of the two groups of reviews.
plt.figure(figsize=(8,5))
for weights, line_color, line_label in (
    (y1, 'blue', 'true reviews'),
    (y2, 'red', 'fake reviews'),
):
    plt.plot(x, weights, color=line_color, label=line_label)
plt.legend()
plt.xlabel('sentiments')
plt.ylabel('weights')