In [1]:
import os
import seaborn as sns
# Directory that the plotting cells below join with a file name when saving PDFs.
# NOTE(review): absolute, machine-specific path -- consider deriving it from a
# configurable base directory so the notebook runs on other machines.
path = '/Users/connormcdonald/Desktop/Masters/MIT807/Gartner Repository/Analysis/Figures'
import sys
# Insert the "Data Collection" folder at position 1 of sys.path before the
# `configs` star-import that follows; presumably that is where configs.py lives
# -- TODO confirm.
sys.path.insert(1, '/Users/connormcdonald/Desktop/Masters/MIT807/Gartner Repository/Data Collection')
from configs import *
import numpy as np
import pandas as pd
import matplotlib
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from mpl_toolkits.axisartist.axislines import SubplotZero
from pylab import text
# Select the PGF backend and configure it key by key (same keys, values and
# order as a single rcParams.update call).
matplotlib.use("pgf")
matplotlib.rcParams["pgf.texsystem"] = "lualatex"
matplotlib.rcParams['font.family'] = 'serif'
matplotlib.rcParams['text.usetex'] = True
matplotlib.rcParams['pgf.rcfonts'] = False

from sqlalchemy import create_engine

# Assemble the MySQL connection URL from the values star-imported from `configs`.
# NOTE(review): the credentials are inserted verbatim; if they can contain
# URL-reserved characters (e.g. '@', ':', '/') they need percent-encoding -- confirm.
engine = create_engine(
    "".join(('mysql+mysqlconnector://', user, ':', passwd, '@', ip, ':3306/', schema1))
)


In [2]:
# matplotlib.rcParams["text.usetex"] = True
# plt.rcParams["figure.figsize"] = [8, 4]
# plt.xlabel('Time', size = 12)
# plt.ylabel('Expectations', size = 12)
# plt.rcParams['figure.dpi'] = 300
# ax = plt.gca()

In [3]:
# Pull id, text, engagement counts and date from `machine_learning_only`,
# LEFT JOINed to `base_sentiment` on id so each row also carries the sentiment
# code as column `s`.
# NOTE(review): in SQL, DISTINCT applies to the whole select list, not only to
# M.id -- the parentheses do not restrict it. Because of the LEFT JOIN, rows
# with no match in base_sentiment come back with a NULL `s`.
stmt = 'SELECT DISTINCT(M.id), M.text, M.like_count, M.retweet_count, M.quote_count, M.date, B.sentiment AS s FROM machine_learning_only M LEFT JOIN base_sentiment B ON M.id = B.id'
# Load the query result into a DataFrame through the SQLAlchemy engine.
df = pd.read_sql(stmt, con=engine)

In [4]:
from datetime import datetime
def sentiment(sentiment):
    """Translate a numeric sentiment code into its text label.

    0 -> 'negative', 1 -> 'neutral', 2 -> 'positive'. Any other value
    (including NaN and None) compares unequal to every code and yields None.
    """
    code_labels = (
        (0, 'negative'),
        (1, 'neutral'),
        (2, 'positive'),
    )
    # Compare with == (not a dict lookup) so unhashable or float inputs behave
    # exactly as in a plain if-chain.
    for code, label in code_labels:
        if sentiment == code:
            return label
    return None

# def dformat(d):
#     return datetime.strptime(d, '%Y-%m-%d').strftime('%Y')
    
# Map every numeric code in `s` to its text label, element by element.
df['sentiment'] = df['s'].map(sentiment)
# df.date = df.date.apply(dformat)

In [5]:
# Learn the vocabulary from every document's text; fit() returns the fitted
# vectorizer itself, so it can be bound directly.
cvec = CountVectorizer().fit(df['text'])

CountVectorizer()

In [6]:
# Document-term matrix for each sentiment class, built with the shared vocabulary.
neg_doc_matrix, neu_doc_matrix, pos_doc_matrix = (
    cvec.transform(df.loc[df['sentiment'] == label, 'text'])
    for label in ('negative', 'neutral', 'positive')
)
# Column sums: how often each vocabulary term occurs within a class.
neg_tf, neu_tf, pos_tf = (
    np.sum(doc_matrix, axis=0)
    for doc_matrix in (neg_doc_matrix, neu_doc_matrix, pos_doc_matrix)
)
# Flatten each 1 x vocabulary result into a plain 1-D array.
neg, neu, pos = (
    np.squeeze(np.asarray(term_totals))
    for term_totals in (neg_tf, neu_tf, pos_tf)
)
# One row per term, one column per class (in negative, neutral, positive order).
term_freq_df = pd.DataFrame(
    [neg, neu, pos],
    columns=cvec.get_feature_names_out(),
).T

In [7]:
# Name the three per-class count columns, add their row-wise sum, and order the
# terms from most to least frequent overall.
term_freq_df.columns = ['negative', 'neutral', 'positive']
term_freq_df['total'] = term_freq_df[['negative', 'neutral', 'positive']].sum(axis=1)
term_freq_df = term_freq_df.sort_values('total', ascending=False)

In [8]:
# Zipf plot: absolute token frequency against frequency rank on log-log axes,
# with a reference line and a handful of labelled tokens.
counts = term_freq_df['total']
tokens = term_freq_df.index
ranks = np.arange(1, len(counts) + 1)
# Rank on plain NumPy arrays. `counts` is indexed by token strings, so integer
# keys such as counts[indices] or frequencies[0] would rely on pandas'
# positional fallback, which is deprecated and removed in newer pandas.
count_values = counts.to_numpy()
indices = np.argsort(-count_values)
frequencies = count_values[indices]
plt.figure(figsize=(8,4))
plt.ylim(1,10**7)
plt.xlim(1,10**7)
plt.loglog(ranks, frequencies, marker=".", color = '#003366')

# Reference line from (rank 1, top frequency) to (rank = top frequency, 1).
plt.plot([1,frequencies[0]],[frequencies[0],1],color='#E31B23')
plt.xlabel("Frequency rank of token", size=12)
plt.ylabel("Absolute frequency of token", size=12)
ax = plt.gca()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# Label tokens at roughly log-spaced positions. Truncating to int repeats the
# low positions (0, 0, 1, 1, ...), so de-duplicate to draw each label once.
label_positions = np.unique(
    np.logspace(-0.5, np.log10(len(counts)-2), 25).astype(int)
)
for n in label_positions:
    ax.text(ranks[n], frequencies[n], " " + tokens[indices[n]],
            verticalalignment="bottom",
            horizontalalignment="left")

plt.savefig(os.path.join(path, 'zipf_plot.pdf'), format='pdf',bbox_inches='tight',pad_inches = 0.1)
plt.close()

In [9]:
# Scatter of each term's negative count (x) against its positive count (y).
fig, ax = plt.subplots(figsize=(8, 4))
sns.regplot(
    x="negative",
    y="positive",
    data=term_freq_df,
    fit_reg=False,
    scatter_kws={'alpha':0.5},
    color='#003366',
    ax=ax,
)
ax.set_ylabel('Positive Frequency', size=12)
ax.set_xlabel('Negative Frequency', size=12)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
fig.savefig(
    os.path.join(path, 'neg_vs_positive.pdf'),
    format='pdf',
    bbox_inches='tight',
    pad_inches=0,
)
plt.close(fig)

In [10]:
from scipy.stats import hmean
from scipy.stats import norm
# Share of each term's occurrences that fall in positive documents, then order
# the terms by that share (highest first).
term_freq_df['pos_rate'] = term_freq_df['positive'].mul(1.).div(term_freq_df['total'])
term_freq_df = term_freq_df.sort_values('pos_rate', ascending=False)

In [11]:
# Each term's positive count as a fraction of all positive term occurrences.
term_freq_df['pos_freq_pct'] = term_freq_df['positive'].mul(1.).div(
    term_freq_df['positive'].sum()
)


In [12]:
# Harmonic mean of pos_rate and pos_freq_pct for each term, or 0 where either
# input is not strictly positive (this includes NaN).
# Computed with one vectorised hmean call over the qualifying rows instead of
# a row-wise DataFrame.apply that invokes scipy once per token.
_pos_inputs = term_freq_df[['pos_rate', 'pos_freq_pct']]
_pos_defined = (_pos_inputs > 0).all(axis=1)
term_freq_df['pos_hmean'] = 0.0
if _pos_defined.any():
    term_freq_df.loc[_pos_defined, 'pos_hmean'] = hmean(
        _pos_inputs[_pos_defined].to_numpy(), axis=1
    )

In [13]:
def normcdf(x):
    """Evaluate a normal CDF at every value of ``x``.

    The distribution's location and scale are ``x.mean()`` and ``x.std()``,
    i.e. they are estimated from ``x`` itself.
    """
    center = x.mean()
    spread = x.std()
    return norm.cdf(x, loc=center, scale=spread)

# Map both positive metrics through normcdf, then combine the two CDF values of
# each term with a harmonic mean.
term_freq_df['pos_rate_normcdf'] = normcdf(term_freq_df.pos_rate)
term_freq_df['pos_freq_pct_normcdf'] = normcdf(term_freq_df.pos_freq_pct)
term_freq_df['pos_normcdf_hmean'] = hmean(
    term_freq_df[['pos_rate_normcdf', 'pos_freq_pct_normcdf']].to_numpy(),
    axis=1,
)


In [14]:
# Negative-class counterparts of the positive metrics.
# neg_rate: share of a term's occurrences that fall in negative documents.
term_freq_df['neg_rate'] = term_freq_df['negative'] * 1./term_freq_df['total']
# neg_freq_pct: the term's negative count as a fraction of all negative counts.
term_freq_df['neg_freq_pct'] = term_freq_df['negative'] * 1./term_freq_df['negative'].sum()

# Harmonic mean of the two, or 0 where either input is not strictly positive
# (this includes NaN). One vectorised hmean call over the qualifying rows
# replaces a row-wise DataFrame.apply that invoked scipy once per token.
_neg_inputs = term_freq_df[['neg_rate', 'neg_freq_pct']]
_neg_defined = (_neg_inputs > 0).all(axis=1)
term_freq_df['neg_hmean'] = 0.0
if _neg_defined.any():
    term_freq_df.loc[_neg_defined, 'neg_hmean'] = hmean(
        _neg_inputs[_neg_defined].to_numpy(), axis=1
    )

# CDF-transformed versions of both metrics and their element-wise harmonic mean.
term_freq_df['neg_rate_normcdf'] = normcdf(term_freq_df['neg_rate'])
term_freq_df['neg_freq_pct_normcdf'] = normcdf(term_freq_df['neg_freq_pct'])
term_freq_df['neg_normcdf_hmean'] = hmean([term_freq_df['neg_rate_normcdf'], term_freq_df['neg_freq_pct_normcdf']])

In [15]:
# Scatter of each term's negative harmonic-mean score (x) against its positive one (y).
fig, ax = plt.subplots(figsize=(8, 4))
sns.regplot(
    x="neg_hmean",
    y="pos_hmean",
    data=term_freq_df,
    fit_reg=False,
    scatter_kws={'alpha':0.5},
    color='#003366',
    ax=ax,
)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylabel('Positive Rate and Frequency Harmonic Mean', size=12)
ax.set_xlabel('Negative Rate and Frequency Harmonic Mean', size=12)
fig.savefig(
    os.path.join(path, 'neghmean_vs_poshmean.pdf'),
    format='pdf',
    bbox_inches='tight',
    pad_inches=0,
)
plt.close(fig)


In [16]:
# Scatter of each term's negative CDF harmonic-mean score (x) against its positive one (y).
fig, ax = plt.subplots(figsize=(8, 4))
sns.regplot(
    x="neg_normcdf_hmean",
    y="pos_normcdf_hmean",
    data=term_freq_df,
    fit_reg=False,
    scatter_kws={'alpha':0.5},
    color='#003366',
    ax=ax,
)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylabel('Positive Rate and Frequency CDF Harmonic Mean', size=12)
ax.set_xlabel('Negative Rate and Frequency CDF Harmonic Mean', size=12)
fig.savefig(
    os.path.join(path, 'normcdf_hmean.pdf'),
    format='pdf',
    bbox_inches='tight',
    pad_inches=0,
)
plt.close(fig)