In [12]:
# Notebook setup: imports, figure output directory, matplotlib/LaTeX backend
# configuration and the database engine used by the cells below.
import os
import seaborn as sns
# Directory every figure in this notebook is saved into.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere without editing this line.
path = '/Users/connormcdonald/Desktop/Masters/MIT807/Gartner Repository/Analysis/Figures'
import sys
# Make the "Data Collection" folder importable so that `configs` can be found.
sys.path.insert(1, '/Users/connormcdonald/Desktop/Masters/MIT807/Gartner Repository/Data Collection')
# NOTE(review): star import — `user`, `passwd`, `ip` and `schema1` used below
# presumably come from here; confirm against configs.py.
from configs import *
import numpy as np
import pandas as pd
import matplotlib
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from mpl_toolkits.axisartist.axislines import SubplotZero
# Select the PGF backend and configure LaTeX-based text rendering.
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "lualatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
})

from sqlalchemy import create_engine

# MySQL connection URL assembled by plain string concatenation.
# NOTE(review): the credentials are not URL-escaped, so special characters in
# them would presumably corrupt the URL — verify.
engine = create_engine('mysql+mysqlconnector://'+user+':'+passwd+'@'+ip+':3306/'+schema1)


In [13]:
# matplotlib.rcParams["text.usetex"] = True
# plt.rcParams["figure.figsize"] = [8, 4]
# plt.xlabel('Time', size = 12)
# plt.ylabel('Expectations', size = 12)
# plt.rcParams['figure.dpi'] = 300
# ax = plt.gca()

In [14]:
# Load every machine-learning tweet together with its sentiment score (column
# `s`). The LEFT JOIN keeps tweets that have no row in base_sentiment.
# NOTE: in SQL, DISTINCT applies to the whole selected row, not just M.id,
# despite the parentheses. The adjacent literals below concatenate to exactly
# the original single-line statement.
stmt = (
    'SELECT DISTINCT(M.id), M.text, M.like_count, M.retweet_count, M.quote_count, M.date, '
    'B.sentiment AS s '
    'FROM machine_learning_only M '
    'LEFT JOIN base_sentiment B ON M.id = B.id'
)
df = pd.read_sql(stmt, con=engine)

In [15]:
from datetime import datetime
def sentiment(sentiment):
    """Translate a numeric sentiment code into its text label.

    0 -> 'negative', 1 -> 'neutral', 2 -> 'positive'. Any other value
    (NaN included, since it compares unequal to every code) yields None.
    """
    code_labels = ((0, 'negative'), (1, 'neutral'), (2, 'positive'))
    for code, label in code_labels:
        if sentiment == code:
            return label
    return None

def dformat(d):
    """Return the four-digit year of a date as a string, e.g. '2010'.

    Parameters
    ----------
    d : str or date-like
        Either a 'YYYY-MM-DD' string, or any object exposing ``strftime``
        (``datetime.date``, ``datetime.datetime``, ``pandas.Timestamp``).

    Returns
    -------
    str
        The year component formatted with '%Y'.

    Raises
    ------
    ValueError
        If ``d`` is a string that does not match '%Y-%m-%d'.
    """
    # Date-like objects need no parsing; passing one to strptime would raise
    # a TypeError, so they are handled first.
    if hasattr(d, 'strftime'):
        return d.strftime('%Y')
    return datetime.strptime(d, '%Y-%m-%d').strftime('%Y')
    
# Attach a readable sentiment label, then collapse each date to its year.
# NOTE: 'date' is overwritten in place, so re-running this cell without
# reloading `df` fails (a year-only string no longer matches '%Y-%m-%d').
df['sentiment'] = df['s'].apply(sentiment)
df['date'] = df['date'].apply(dformat)

In [4]:
# Grouped bar chart: number of tweets per year, split by sentiment label.
# Figure defaults are set before the figure is created so they apply to it.
matplotlib.rcParams["text.usetex"] = True
plt.rcParams["figure.figsize"] = [8, 4]
plt.rcParams['figure.dpi'] = 300
sns.countplot(
    x="date",
    hue="sentiment",
    data=df,
    # Without explicit orders seaborn uses order of first appearance, which
    # depends on the row order of an SQL query that has no ORDER BY. Pinning
    # both keeps the years chronological and ties each colour to one sentiment.
    order=sorted(df['date'].unique()),
    hue_order=['negative', 'neutral', 'positive'],
    palette=['#abd0e6', "#6aaed6", '#105ba4'],
)
ax = plt.gca()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.xlabel('Year', size = 12)
plt.ylabel('Number of Tweets', size = 12)
plt.savefig(os.path.join(path, 'sentiment_count.pdf'), format='pdf',bbox_inches='tight',pad_inches = 0)
# Close the figure so later cells start from a clean pyplot state.
plt.close()

In [18]:
# Preview the first rows of the labelled tweet frame.
df.head()

Unnamed: 0,id,text,like_count,retweet_count,quote_count,date,s,sentiment
0,8424527990,prediction of sporting results using neural ne...,0,0,0,2010,1,neutral
1,8419386549,encyclopedia of machine learning springerverl...,0,0,0,2010,2,positive
2,8415166607,junior researchermathematical modeller machine...,0,0,0,2010,1,neutral
3,8412898532,1720 in my second machine learning assessment ...,0,0,0,2010,1,neutral
4,8409972932,interesting things at nips 2010 machine learn...,0,0,0,2010,2,positive


In [19]:
# Keep only tweets whose raw sentiment code is 0 (labelled 'negative').
neg_df = df[df['s'] == 0]

In [20]:
# Preview the first rows of the negative-only subset.
neg_df.head()

Unnamed: 0,id,text,like_count,retweet_count,quote_count,date,s,sentiment
10,8389332083,fully prepared to fail machine learning sulks ...,0,0,0,2010,0,negative
20,8372169495,machine learning methods for common sense reas...,0,0,0,2010,0,negative
35,8331956264,year 2 ndash machine learning ndash naive baye...,0,0,0,2010,0,negative
46,8292441872,exceedingly frustrated with machine learning,0,0,0,2010,0,negative
67,8251059442,spark provides a simple interface and can outp...,2,2,0,2010,0,negative


In [22]:
# Bar chart: number of negative tweets per year.
# Figure defaults are set before the figure is created so they apply to it.
matplotlib.rcParams["text.usetex"] = True
plt.rcParams["figure.figsize"] = [8, 4]
plt.rcParams['figure.dpi'] = 300
sns.countplot(
    x="date",
    data=neg_df,
    # Pin chronological order; otherwise the x axis follows the order in which
    # years first appear in the (unordered) query result.
    order=sorted(neg_df['date'].unique()),
    color='#105ba4',
)
ax = plt.gca()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.xlabel('Year', size = 12)
plt.ylabel('Number of Tweets', size = 12)
plt.savefig(os.path.join(path, 'neg_sentiment_count.pdf'), format='pdf',bbox_inches='tight',pad_inches = 0)
# Close the figure so later cells start from a clean pyplot state.
plt.close()

In [5]:
# Character count of each tweet. The vectorized string accessor replaces the
# Python-level loop and yields NaN (instead of raising) for missing text.
df['tweet_length'] = df['text'].str.len()

In [6]:
# Box plot: distribution of tweet length per year, split by sentiment label.
matplotlib.rcParams["text.usetex"] = True
# Size/DPI must be set BEFORE plt.gca() creates the figure. Setting them
# afterwards (as this cell used to) has no effect on the figure being drawn
# and only worked through state left behind by earlier cells.
plt.rcParams["figure.figsize"] = [8, 4]
plt.rcParams['figure.dpi'] = 300
ax = plt.gca()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
sns.boxplot(y='tweet_length', x='date',
                 data=df,
                 width=0.5,
                 palette=['#d6e6f4', '#6aaed6', '#105ba4'],
                 hue="sentiment",
                 # Pin category orders so the axis is chronological and each
                 # colour always maps to the same sentiment, independent of
                 # the row order returned by the database.
                 order=sorted(df['date'].unique()),
                 hue_order=['negative', 'neutral', 'positive'],
                 linewidth=1,
                 fliersize=1,
                 ax=ax,)
plt.xlabel('Year', size = 12)
plt.ylabel('Number of Characters per Tweet', size = 12)
plt.savefig(os.path.join(path, 'tweet_length_year.pdf'), format='pdf',bbox_inches='tight',pad_inches = 0)
# Close the figure so later cells start from a clean pyplot state.
plt.close()

In [8]:
# Learn the vocabulary from every tweet with a default-configured vectorizer.
cvec = CountVectorizer()
cvec.fit(df['text'])

CountVectorizer()

In [9]:
# Document-term matrices per sentiment class, using the shared vocabulary.
neg_doc_matrix = cvec.transform(df.loc[df['sentiment'] == 'negative', 'text'])
neu_doc_matrix = cvec.transform(df.loc[df['sentiment'] == 'neutral', 'text'])
pos_doc_matrix = cvec.transform(df.loc[df['sentiment'] == 'positive', 'text'])

# Column sums: how often each vocabulary term occurs within each class.
neg_tf = neg_doc_matrix.sum(axis=0)
neu_tf = neu_doc_matrix.sum(axis=0)
pos_tf = pos_doc_matrix.sum(axis=0)

# Flatten the summed results into plain 1-D arrays.
neg = np.asarray(neg_tf).squeeze()
neu = np.asarray(neu_tf).squeeze()
pos = np.asarray(pos_tf).squeeze()

# One row per term, one (still unnamed) column per class: negative, neutral, positive.
term_freq_df = pd.DataFrame(
    [neg, neu, pos],
    columns=cvec.get_feature_names_out(),
).T

In [10]:
# Name the per-class columns, add the overall count and keep the 2000 most
# frequent terms.
term_freq_df.columns = ['negative', 'neutral', 'positive']
term_freq_df['total'] = term_freq_df[['negative', 'neutral', 'positive']].sum(axis=1)
term_freq_df = term_freq_df.sort_values(by='total', ascending=False).head(2000)


In [11]:
# Zipf's-law check: observed frequency of the top-ranked terms (bars) against
# the ideal curve f(rank) = f(1) / rank**s (dashed line).
# Cap the rank axis at the number of available terms so the bar heights and
# positions always have matching lengths.
y_pos = np.arange(min(500, len(term_freq_df)))
plt.figure(figsize=(8,4))
s = 1 #exponential term
# term_freq_df is indexed by token strings, so integer access must be
# explicitly positional (.iloc); plain [0] relies on a deprecated fallback.
# Looked up once instead of once per rank.
top_count = term_freq_df['total'].iloc[0]
expected_zipf = [top_count/(i+1)**s for i in y_pos]

plt.bar(y_pos, term_freq_df['total'].iloc[:len(y_pos)], align='center', color = '#003366')
plt.plot(y_pos, expected_zipf, color='#E31B23', linestyle='--',linewidth=2)
plt.ylabel('Frequency', size=12)
ax = plt.gca()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.savefig(os.path.join(path, 'zipfs.pdf'), format='pdf',bbox_inches='tight',pad_inches = 0)
# Close the figure so later cells start from a clean pyplot state.
plt.close()

In [12]:
# Bar chart of the 50 tokens that occur most often in negative tweets.
# Rank once and reuse the result for both bar heights and tick labels.
top_negative = term_freq_df.sort_values(by='negative', ascending=False)['negative'].iloc[:50]
y_pos = np.arange(50)
plt.figure(figsize=(8,4))
plt.bar(y_pos, top_negative, align='center', color='#003366')
plt.xticks(y_pos, top_negative.index, rotation='vertical')
ax = plt.gca()
for spine_name in ('right', 'top'):
    ax.spines[spine_name].set_visible(False)
plt.ylabel('Frequency', size=12)
plt.xlabel('Top 50 negative tokens', size=12)
plt.savefig(
    os.path.join(path, 'zipfs_negative.pdf'),
    format='pdf',
    bbox_inches='tight',
    pad_inches=0,
)
plt.close()


In [13]:
# Bar chart of the 50 tokens that occur most often in positive tweets.
# Rank once and reuse the result for both bar heights and tick labels.
top_positive = term_freq_df.sort_values(by='positive', ascending=False)['positive'].iloc[:50]
y_pos = np.arange(50)
plt.figure(figsize=(8,4))
plt.bar(y_pos, top_positive, align='center', color='#003366')
ax = plt.gca()
for spine_name in ('right', 'top'):
    ax.spines[spine_name].set_visible(False)
plt.xticks(y_pos, top_positive.index, rotation='vertical')
plt.ylabel('Frequency', size=12)
plt.xlabel('Top 50 positive tokens', size=12)
plt.savefig(
    os.path.join(path, 'zipfs_positive.pdf'),
    format='pdf',
    bbox_inches='tight',
    pad_inches=0,
)
plt.close()
