## Set up local environment
Import libraries

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
from matplotlib import pyplot as plt
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.linear_model import LinearRegression, Lasso

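Plot an illustrative graph for the tabular example: a black-box function `f`, perturbed samples around a point of interest, and the locally weighted linear surrogate `g`.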

In [None]:
# Black-box model
@np.vectorize
def f(x):
    """Black-box function to be explained: x^2 - x^3 / 9, applied element-wise."""
    square = x**2
    cube = x**3
    return square - cube / 9

# Point of interest and standard deviation of the perturbations around it
x0, sx = 4, 2

# Normal sampler centred on the point of interest
sampler = stats.norm(loc=x0, scale=sx)

# Perturbed data set: 30 seeded draws, each evaluated by the black box
x = sampler.rvs(size=30, random_state=42)
d = pd.DataFrame({'x': x, 'f': f(x)})

# Exponential weights: exp(-(x - x0)^2 / 2), so samples closer to x0 weigh more
w = np.exp(np.power(d['x'] - x0, 2) / -2)
d = d.assign(w=w)

# Fit local model: weighted least squares of f on x, using the proximity
# weights in d['w'] so that samples near the point of interest dominate
m = LinearRegression()
m.fit(
    X=d[['x']],
    y=d['f'],
    sample_weight=d['w']
)

# Freeze the fitted parameters now. `m` is rebound to other models further
# down the notebook, and a surrogate that looked `m` up at call time would
# break (or silently change) once that happens.
g_intercept = m.intercept_
g_slope = m.coef_.item(0)

# Surrogate model
def g(x):
    """Local linear surrogate of the black box: g_intercept + g_slope * x."""
    return g_intercept + g_slope * x
g = np.vectorize(g)

# Evaluation grid for the two curves
x = np.arange(0, 10, 0.01)

fig, ax = plt.subplots()

# Black box, perturbed samples (opacity = weight) and local surrogate
ax.plot(x, f(x), label='f')
ax.scatter(d['x'], d['f'], color='C1', alpha=d['w'])
ax.plot(x, g(x), color='C1', label='g')

# Mark the point of interest and zoom in around it
ax.axvline(x0, ls='--', color='C0')
ax.set_xlim(2, 7)
ax.set_xlabel('x')
ax.set_ylabel('f(x)')
ax.legend()

# Save & show
fig.savefig('../../fig/tabular.png', bbox_inches='tight', dpi=200)
plt.show()

Read comments as data frame

In [None]:
# Load the scored comments and discard every row with a missing value
df = (
    pd.read_csv('../../dat/scored_comments.csv')
    .dropna(how='any')
    .reset_index(drop=True)
)

# Report how many comments remain and the date range they span
print(f'Number of comments:{len(df)}')
print(f"Within {df['date'].min()} and {df['date'].max()}")

# Preview the first rows
df.head()

Count how many comments have _some_ negative tone.

In [None]:
# Flag comments with a strictly positive score as having some negative tone
# (presumably `score` is a negativity score -- confirm against the data file)
df['neg'] = df['score'].gt(0).astype(int)

# Share and count of flagged comments
neg = df['neg'].mean()
print(f"{round(neg * 100, 1)}% of comments have some degree of negativity ({df['neg'].sum()}).")

# From here on, work only with the flagged comments
df = df.loc[df['neg'] > 0].reset_index(drop=True)

Short list of negative comments

In [None]:
# Six random comments (fixed seed, so the same six are drawn on every run)
t = df[['comment', 'score']].sample(n=6, random_state=42)

# To latex
# print(t.to_latex(index=False))

# Print each sampled comment on its own line
for comment in t['comment'].tolist():
    print(comment)

Initialize VADER

In [None]:
# VADER sentiment analyzer, used below to score the perturbed comments.
# Note that this rebinds `m`, which held the linear model of the tabular example.
m = SentimentIntensityAnalyzer()

## Example for a single comment
Extract a single comment from the entire comment section and manually repeat the process carried out by LIME.

In [None]:
# Take the second row of `df` as the instance to explain
c = df.iloc[1]
comment = c['comment']
print('Selected comment:', comment, sep='\n  ')

# Split the comment on single spaces to get its words
words = comment.split(' ')
print('Comment as list of words:', words, sep='\n  ')

Randomly take out words from comment

In [None]:
# Bernoulli distribution: each word is kept with probability 0.8
B = stats.bernoulli(p=0.8)

# 100 keep/drop masks over the words, each drawn with its own seed
d = [list(B.rvs(len(words), random_state=42 + i)) for i in range(100)]

# One row per distinct mask, one column per word
t = pd.DataFrame(data=d, columns=words).drop_duplicates()

# Weight = share of words kept, i.e. closeness to the original text
t['weight'] = t.sum(axis=1) / len(words)

# Discard masks that keep every word (they reproduce the original comment)
t = t[t['weight'] < 1].reset_index(drop=True)

# Score of the original, unperturbed comment
t['score0'] = c['score']

Score new observations

In [None]:
# Only the first len(words) columns of `t` are word indicators; the columns
# after them ('weight', 'score0', ...) are bookkeeping. Selecting by word count
# keeps those labels out of the rebuilt text: a `:-1` slice would leave
# 'weight' in (its value is > 0), appending the token "weight" to every
# perturbed comment, and would pick up more columns if the cell were re-run.
n_words = len(words)

# Initialize empty list
scores = []

for i in range(len(t)):
    # Rebuild the perturbed text from the words whose indicator is switched on
    kept = t.iloc[i, :n_words]
    mod = ' '.join(list(kept[kept > 0].index.values))

    # Append the VADER negativity score of the perturbed text
    scores.append(m.polarity_scores(mod)['neg'])

# Add new scores to `t`
t['score1'] = scores

# Visualize
t.head()

# To latex
# print(t.head().to_latex(float_format='%.3f', index=False))

Fit lasso

In [None]:
# Grid of lasso penalties to explore
penalties = np.arange(0.01, 1 + 0.01, 0.01)

# The word indicators are the first len(words) columns of `t`. Selecting them
# by position avoids hard-coding the first and last word of one particular
# comment, and does not depend on how many helper columns follow them.
n_words = len(words)
word_cols = t.iloc[:, :n_words]

# NOTE(review): each word indicator is regressed on the score (X = score1,
# y = word columns). LIME's surrogate is usually the reverse (word indicators
# as features, score as target) -- confirm this orientation is intended.
coefs = []
for penalty in penalties:
    # Fit lasso under its own name so the VADER analyzer bound to `m` survives
    lasso = Lasso(alpha=penalty, fit_intercept=False)
    lasso.fit(X=t[['score1']], y=word_cols, sample_weight=t['weight'])

    # One coefficient per word for this penalty
    coefs.append(np.ravel(lasso.coef_).tolist())

# To df: one row per penalty, one column per word
s = pd.DataFrame(data=coefs, columns=t.columns[:n_words])

Plot the lasso coefficient paths: the estimate for each word as the penalty grows

In [None]:
fig, ax = plt.subplots()

# One coefficient path per word
for col in s.columns:
    ax.plot(penalties, s[col].to_numpy(), label=col)

# Chosen penalty, also used when the lasso is re-fitted further down
cutoff = 0.11
ax.axvline(x=cutoff, ls='--')

# Ticks and labels (the x-range is restricted to the small-penalty region)
plt.xticks(penalties, rotation=45)
ax.set_xlim(0.01, 0.15)
ax.set_xlabel('Penalty (lambda)')
ax.set_ylabel('Estimate')
ax.legend()

# Save and show
fig.savefig('../../fig/lasso.png', bbox_inches='tight', dpi=200)
plt.show()


LIME explanation at data point

In [None]:
# Re-fit lasso at the chosen penalty (`cutoff`). The word indicators are taken
# by position (first len(words) columns of `t`) rather than by the labels of
# one specific comment, and the model gets its own name so the VADER analyzer
# bound to `m` is not overwritten.
lasso = Lasso(alpha=cutoff, fit_intercept=False)
lasso.fit(X=t[['score1']], y=t.iloc[:, :len(words)], sample_weight=t['weight'])

# Get fitted parameters: one coefficient per word
res = pd.DataFrame({'Word':t.columns.values[:len(words)], 'Coef':lasso.coef_.flatten()})

# Visualize the words with a positive coefficient
res[res['Coef'] > 0].round(2)

# To latex
# print(res[res['Coef'] > 0].to_latex(index=False, float_format='%.2f'))

## General results

### TO-DO:
- Word cloud
- Generalize on popular comments