In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import json
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

# No Interaction

In [None]:
df_music = pd.read_csv("Results/music_variance.csv")
# plot the histrogram of the data
plt.hist(df_music['0'], bins=200)


# calculate the mean and standard deviation of the data
mean = np.mean(df_music['0'])
std = np.std(df_music['0'])
# plot the mean and standard deviation on the histogram
plt.axvline(mean, color='b', linestyle='dotted', linewidth=2, alpha=0.5)

!pip3 install scipy
# fit an exponential distribution to the data:
from scipy.stats import expon

# fit an exponential distribution to the data:
param = expon.fit(df_music['0'])

# now, param[0] and param[1] are the mean and 
# the standard deviation of the fitted distribution
x = np.linspace(0,1,100)
# fitted distribution
pdf_fitted = expon.pdf(x,loc=param[0],scale=param[1]) * 10
# original distribution
pdf = expon.pdf(x) 


plt.plot(x,pdf_fitted,'r-',x,pdf,'grey', alpha=0.5)
plt.title(f"Distribution of Genre (Music) Variance by attribute \n Mean: {mean:.2f} Std: {std:.2f}")
plt.xlabel('Variance')
plt.ylabel('Count')
plt.show()


In [None]:
def result_plot(ax, file):
    """
    Draw the distribution stored in ``Results/{file}.csv`` on ``ax``.

    The values are read from column '0'. The panel receives a 50-bin
    histogram, a dotted vertical line at the mean, the fitted exponential
    density (multiplied by 10) in red and the standard exponential density
    in grey. Mean and standard deviation are reported in the title.

    :param ax: axes-like object to draw on
    :param file: CSV file stem inside the ``Results`` directory
    :return: tuple ``(ax, df)`` with the same axes and the loaded frame
    """
    from scipy.stats import expon

    df = pd.read_csv(f"Results/{file}.csv")
    values = df['0']

    # summary statistics shown in the title
    mean = np.mean(values)
    std = np.std(values)

    ax.hist(values, bins=50)
    ax.axvline(mean, color='b', linestyle='dotted', linewidth=2, alpha=0.5)

    # expon.fit yields the loc (shift) and scale of the fitted distribution
    loc, scale = expon.fit(values)
    grid = np.linspace(0, np.max(values), 100)
    fitted_curve = expon.pdf(grid, loc=loc, scale=scale) * 10
    # standard exponential (default loc and scale) as a reference curve
    reference_curve = expon.pdf(grid)
    ax.plot(grid, fitted_curve, 'r-', grid, reference_curve, 'grey', alpha=0.5)

    label_font = {'fontsize': 8, 'fontweight': 'medium'}
    ax.set_title(f"Distribution of {file} bias Variance by attribute \n Mean: {mean:.2f} Std: {std:.2f}", fontdict=label_font)
    ax.set_xlabel('Variance', fontdict=label_font)
    ax.set_ylabel('Count', fontdict=label_font)
    return ax, df

In [None]:

# Draw the four variance distributions on a 2x2 grid of panels.
fig, ax = plt.subplots(2, 2, figsize=(10,10))
# file stem -> DataFrame returned by result_plot
dfs = {}
for i, file in enumerate(["country_variance", "religion_variance", "music_variance", "dish_variance"]):
    cell = divmod(i, 2)  # (row, column) of the panel used for this file
    ax[cell], df = result_plot(ax[cell], file)
    dfs[file] = df
plt.show()

In [None]:
# Give the two columns of every loaded frame descriptive names.
for name in ("country_variance", "religion_variance", "music_variance", "dish_variance"):
    dfs[name].columns = ["attribute", "variance"]

In [None]:
# Weigh the attributes by their sentiment score using NLTK's VADER analyzer.
import nltk
# Request the 'vader_lexicon' resource through NLTK's downloader before the
# analyzer is constructed.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Module-level analyzer instance; get_sentiment_score reads it as a global.
sid = SentimentIntensityAnalyzer()

# get the sentiment score of each attribute
def get_sentiment_score(attribute):
    """
    Score a piece of text with the module-level VADER analyzer ``sid``.

    :param attribute: the text handed to ``sid.polarity_scores``
    :return: the ``'compound'`` entry of the returned score dictionary
    """
    return sid.polarity_scores(attribute)['compound']

# add the sentiment score to the dataframe
def add_modified_score(df):
    """
    Attach sentiment-weighted scores to ``df``; the frame is modified in place.

    Two columns are written:
    ``sentiment_score`` -- absolute value of ``get_sentiment_score`` applied
    to each entry of ``attribute``;
    ``modified_score`` -- ``variance`` multiplied by ``sentiment_score``.

    :param df: frame providing the ``attribute`` and ``variance`` columns
    :return: the same frame object, now carrying both new columns
    """
    sentiment_magnitude = df['attribute'].apply(get_sentiment_score).abs()
    df['sentiment_score'] = sentiment_magnitude
    df['modified_score'] = df['variance'] * df['sentiment_score']
    return df

# Add the sentiment-weighted columns to every frame.
for key in dfs:
    df = dfs[key]
    dfs[key] = add_modified_score(df)

# Histogram of the modified score, one panel per file.
fig, ax = plt.subplots(2, 2, figsize=(20,15))
for i, file in enumerate(["country_variance", "religion_variance", "music_variance", "dish_variance"]):
    panel = ax[i//2, i%2]
    scores = dfs[file]['modified_score']
    panel.hist(scores, bins=50)
    panel.set_title(f"Distribution of {file} bias Variance by attribute \n Mean: {np.mean(scores):.2f} Std: {np.std(scores):.2f}")
    panel.set_xlabel('Variance')
    panel.set_ylabel('Count')

We can see there are a lot of zero values; these represent neutral adjectives and should be removed.

In [None]:
# Keep only the rows whose sentiment score is non-zero.
dfs_no_zero = {key: df[df['sentiment_score'] != 0] for key, df in dfs.items()}

# Histogram of the modified score after filtering, one panel per file.
fig, ax = plt.subplots(2, 2, figsize=(20,15))
for i, file in enumerate(["country_variance", "religion_variance", "music_variance", "dish_variance"]):
    panel = ax[i//2, i%2]
    scores = dfs_no_zero[file]['modified_score']
    panel.hist(scores, bins=50)
    panel.set_title(f"Distribution of {file} bias Variance by attribute \n Mean: {np.mean(scores):.2f} Std: {np.std(scores):.2f}")
    panel.set_xlabel('Variance')
    panel.set_ylabel('Count')
    

# 2 Interactions

In [None]:
# Plot 2-D heatmaps of the pairwise (2-way) interaction bias.

# Load the data; the context managers close each file once it has been parsed.
with open('Interaction_results/CG_score_2.json') as fh:
    cg_2 = json.load(fh)
with open('Interaction_results/modified_cg_score_2.json') as fh:
    mcg_2 = json.load(fh)

# Convert every comma-separated key into a tuple of category names.
cg_2 = {tuple(k.split(',')): v for k, v in cg_2.items()}
mcg_2 = {tuple(k.split(',')): v for k, v in mcg_2.items()}

# Collect the category names that occur in the keys. They are sorted so the
# axis order is identical on every run (iteration order of a set of strings
# can differ between interpreter sessions).
categories = set()
for k in cg_2.keys():
    categories.add(k[0])
    categories.add(k[1])
categories = sorted(categories)


fig, axs = plt.subplots(1, 2, figsize=(12,5))
# Grey background so that masked (empty) cells stand out.
for ax in axs:
    ax.patch.set_facecolor('lightgrey')

# One heatmap per score dictionary: unmodified first, modified second.
for i, cg in enumerate([cg_2, mcg_2]):
    ax = axs[i]
    # Float frame pre-filled with NaN; NaN marks "no score for this pair",
    # so no sentinel value can collide with a real score.
    df = pd.DataFrame(np.nan, columns=categories, index=categories, dtype=float)
    for k, v in cg.items():
        # Store each pair in one orientation only (row index > column index),
        # which fills a single triangle of the matrix.
        key = (k[0], k[1]) if categories.index(k[0]) > categories.index(k[1]) else (k[1], k[0])
        df.loc[key] = v
    sns.heatmap(df, annot=True, fmt='.2f', cmap='Blues', ax=ax, mask=df.isna())
    ax.invert_xaxis()
    ax.set_title(f'{"M" if i==1 else ""}CBS between 2 characteristics')


fig.suptitle('The modified and unmodified CBS between 2 characteristics', fontsize=16)


In [None]:
# Display the (tuple key, value) pairs of the modified 2-way scores.
mcg_2.items()

In [None]:
# Heatmap of the modified 2-way interaction scores.
dfc = pd.DataFrame(columns=categories, index=categories)

for pair, value in mcg_2.items():
    first, second = pair[0], pair[1]
    # Always store the pair with the later category (by position in
    # `categories`) as the row, so each pair lands in one fixed cell.
    if categories.index(first) > categories.index(second):
        dfc.loc[first, second] = value
    else:
        dfc.loc[second, first] = value
dfc = dfc.fillna(-1)

fig, ax = plt.subplots(figsize=(12, 10))

# Cells that received no score still hold the -1 filler; mask them out.
mask = dfc == -1
ax = sns.heatmap(dfc, annot=True, fmt='.2f', cmap='Blues', mask=mask)

# Reverse the x axis to make the matrix easier to read.
ax.invert_xaxis()
# Tilt the x tick labels so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_title('MCBS between 2 characteristics')


# 3 Interactions

In [None]:
# Load the 3-way scores; the context managers close each file once parsed.
with open('Interaction_results/CG_score_3.json') as fh:
    cg_3 = json.load(fh)
with open('Interaction_results/modified_cg_score_3.json') as fh:
    mcg_3 = json.load(fh)

# Convert every comma-separated key into a tuple of category names.
cg_3 = {tuple(k.split(',')): v for k, v in cg_3.items()}
mcg_3 = {tuple(k.split(',')): v for k, v in mcg_3.items()}

# The same `categories` list as before is reused.
# Three coordinates plus a score is 4-D data: the score is shown as colour.

# Representation one: a lattice of points in 3-D space.
# NOTE(review): Axes3D is not referenced below; presumably imported so that
# older matplotlib versions register the '3d' projection -- confirm.
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Each key becomes a row [index of k[0], index of k[1], index of k[2], score],
# where the indices are positions in `categories`.
cg3_points = np.array([
    [categories.index(k[0]), categories.index(k[1]), categories.index(k[2]), v]
    for k, v in cg_3.items()
])
mcg_3_points = np.array([
    [categories.index(k[0]), categories.index(k[1]), categories.index(k[2]), v]
    for k, v in mcg_3.items()
])

# Name the colormap explicitly: plt.viridis() returns None and changes the
# global default colormap as a side effect.
img = ax.scatter(cg3_points[:,0], cg3_points[:,1], cg3_points[:,2], c=cg3_points[:,3], cmap='viridis')
fig.colorbar(img)
plt.show()


In [None]:
# Plot heatmaps of the 3-way interaction bias, keeping one category fixed per panel.
# Only a bare figure is created: a plt.subplots(3, 2) grid would never be drawn
# on and, on matplotlib versions that do not auto-remove overlapping axes,
# would show up as empty frames behind the subplot2grid panels.
fig = plt.figure(figsize=(20,10))
# Three panels on the top row, two centred panels on the bottom row.
ax1 = plt.subplot2grid(shape=(2,6), loc=(0,0), colspan=2)
ax2 = plt.subplot2grid((2,6), (0,2), colspan=2)
ax3 = plt.subplot2grid((2,6), (0,4), colspan=2)
ax4 = plt.subplot2grid((2,6), (1,1), colspan=2)
ax5 = plt.subplot2grid((2,6), (1,3), colspan=2)
ax = [ax1, ax2, ax3, ax4, ax5]

# NOTE: one panel per category -- this layout provides five panels.
for i, fixed_category in enumerate(categories):
    cur_ax = ax[i]
    # Remaining categories, kept in the order of `categories` so the axis
    # order does not depend on set iteration order.
    fix_categories = [c for c in categories if c != fixed_category]
    # Float frame pre-filled with NaN; NaN marks "no score for this pair".
    df = pd.DataFrame(np.nan, columns=fix_categories, index=fix_categories, dtype=float)
    for k, v in cg_3.items():
        if fixed_category in k:
            # Drop the fixed category (first occurrence) from the key.
            other_categories = list(k)
            other_categories.remove(fixed_category)
            # Store each pair in one orientation only (row index > column index).
            key = (other_categories[0], other_categories[1]) if fix_categories.index(other_categories[0]) > fix_categories.index(other_categories[1]) else (other_categories[1], other_categories[0])
            df.loc[key] = v
    mask = df.isna()
    sns.heatmap(df, annot=True, fmt='.3f', cmap='Blues', ax=cur_ax, mask=mask)
    cur_ax.invert_xaxis()
    cur_ax.set_title('{} fixed'.format(fixed_category))

# figure title
fig.suptitle('CBS between 3 characteristics', fontsize=16)

In [None]:
# Plot heatmaps of the modified 3-way interaction bias, keeping one category fixed per panel.
# Only a bare figure is created: a plt.subplots(3, 2) grid would never be drawn
# on and, on matplotlib versions that do not auto-remove overlapping axes,
# would show up as empty frames behind the subplot2grid panels.
fig = plt.figure(figsize=(20,10))
# Three panels on the top row, two centred panels on the bottom row.
ax1 = plt.subplot2grid(shape=(2,6), loc=(0,0), colspan=2)
ax2 = plt.subplot2grid((2,6), (0,2), colspan=2)
ax3 = plt.subplot2grid((2,6), (0,4), colspan=2)
ax4 = plt.subplot2grid((2,6), (1,1), colspan=2)
ax5 = plt.subplot2grid((2,6), (1,3), colspan=2)
ax = [ax1, ax2, ax3, ax4, ax5]

# NOTE: one panel per category -- this layout provides five panels.
for i, fixed_category in enumerate(categories):
    cur_ax = ax[i]
    # Remaining categories, kept in the order of `categories` so the axis
    # order does not depend on set iteration order.
    fix_categories = [c for c in categories if c != fixed_category]
    # Float frame pre-filled with NaN; NaN marks "no score for this pair".
    df = pd.DataFrame(np.nan, columns=fix_categories, index=fix_categories, dtype=float)
    for k, v in mcg_3.items():
        if fixed_category in k:
            # Drop the fixed category (first occurrence) from the key.
            other_categories = list(k)
            other_categories.remove(fixed_category)
            # Store each pair in one orientation only (row index > column index).
            key = (other_categories[0], other_categories[1]) if fix_categories.index(other_categories[0]) > fix_categories.index(other_categories[1]) else (other_categories[1], other_categories[0])
            df.loc[key] = v
    mask = df.isna()
    sns.heatmap(df, annot=True, fmt='.3f', cmap='Blues', ax=cur_ax, mask=mask)
    cur_ax.invert_xaxis()
    cur_ax.set_title('{} fixed'.format(fixed_category))

fig.suptitle('MCBS between 3 characteristics', fontsize=16)


# 4 Interactions

In [None]:
# Load the 4-way scores; the context managers close each file once parsed.
with open('Interaction_results/CG_score_4.json') as fh:
    cg_4 = json.load(fh)
with open('Interaction_results/modified_cg_score_4.json') as fh:
    mcg_4 = json.load(fh)

# Convert every comma-separated key into a tuple of category names.
cg_4 = {tuple(k.split(',')): v for k, v in cg_4.items()}
mcg_4 = {tuple(k.split(',')): v for k, v in mcg_4.items()}

In [None]:
# Bar charts of the 4-way bias, labelled by the category that is left out.
fig, ax = plt.subplots(nrows=1,ncols=2,figsize=(12, 7))

for i in range(2):
    # left panel (i == 0): modified scores; right panel: unmodified scores
    prefix = "M" if i == 0 else ""
    x, y = [], []
    for missing_category in categories:
        score = mcg_4 if i == 0 else cg_4
        x.append(missing_category)
        # take the value of the first key that does not contain the category
        hits = [v for k, v in score.items() if missing_category not in k]
        y.extend(hits[:1])

    # use standard style for plt
    plt.style.use('default')

    panel = ax[i]
    panel.bar(x, y)
    panel.set_title(f'{prefix}CBS between 4 characteristics')
    panel.set_xlabel('Missing Category')
    panel.set_ylabel(f'{prefix}CBS')

fig.suptitle('CBS and MCBS between 4 characteristics', fontsize=16)

# 5 Interactions
There is a single number that represents this bias.


In [None]:
# Load the 5-way scores; the context managers close each file once parsed.
with open('Interaction_results/CG_score_5.json') as fh:
    cg_5 = json.load(fh)
with open('Interaction_results/modified_cg_score_5.json') as fh:
    mcg_5 = json.load(fh)

print('cg_5', cg_5)
print('mcg_5', mcg_5)

## Separating positive and negative attribute interaction scores

In [None]:
# NOTE(review): this repeats the VADER setup done in the earlier sentiment
# cell (same download, same module-level `sid`); kept so this section can be
# run on its own.
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Module-level analyzer instance read by get_sentiment_score.
sid = SentimentIntensityAnalyzer()

# get the sentiment score of each attribute
def get_sentiment_score(attribute):
    """
    Score a piece of text with the module-level VADER analyzer ``sid``.

    :param attribute: the text handed to ``sid.polarity_scores``
    :return: the ``'compound'`` entry of the returned score dictionary
    """
    return sid.polarity_scores(attribute)['compound']


In [None]:
def positive_bias(filename, score_fn=None):
    """
    Mean per-adjective variance of ``Probs - Probs_base`` for every
    characteristics value, using only adjectives that are not negative.

    The file ``Interaction_results/{filename}`` is read as CSV and must
    provide the columns Adjective, Characteristics, Probs and Probs_base.

    Steps:
      1. drop every row whose adjective has a sentiment score below 0
         (adjectives scoring exactly 0 are kept);
      2. for each (Adjective, Characteristics) pair, take the population
         variance (ddof=0, as ``np.var`` does) of ``Probs - Probs_base``;
      3. for each Characteristics value, average those variances over the
         adjectives.

    NOTE(review): an earlier comment described the quantity as
    log(probs) - log(probs_base), but no logarithm is taken here; presumably
    the stored columns are already log-probabilities -- confirm.

    :param filename: name of the CSV file inside ``Interaction_results``
    :param score_fn: optional callable mapping an adjective to its sentiment
        score; defaults to the module-level ``get_sentiment_score``
    :return: DataFrame with the columns ``Characteristics`` and ``Variance``,
        one row per characteristics value, sorted by that value
    """
    if score_fn is None:
        score_fn = get_sentiment_score

    df = pd.read_csv(f"Interaction_results/{filename}")

    # Build the filter in one pass instead of dropping rows from the frame
    # while iterating over it.
    is_negative = np.array([score_fn(adj) < 0 for adj in df['Adjective']], dtype=bool)
    df = df.loc[~is_negative]

    # Variance of the probability difference for every (adjective, characteristics) pair.
    diff = (df['Probs'] - df['Probs_base']).rename('Variance')
    pair_variance = diff.groupby([df['Adjective'], df['Characteristics']]).var(ddof=0)

    # Average over adjectives; grouping by level name keeps the group label a
    # plain value (not a 1-tuple) on every pandas version.
    return pair_variance.groupby(level='Characteristics').mean().reset_index()
        


# Per-characteristics mean variance for the 2-way interaction results,
# computed by positive_bias after dropping adjectives with a negative score.
positive = positive_bias('CG_score_2.csv')

In [None]:
# Turn each characteristics string into a list of names by dropping the first
# and last character and splitting on commas.
# NOTE(review): presumably the strings look like "(a, b)" or "('a', 'b')";
# surrounding spaces and quote characters are stripped from every part -- confirm
# against the CSV.
# The isinstance guard makes the cell safe to re-run: values that are already
# lists are left untouched instead of being sliced and split again.
positive['Characteristics'] = positive['Characteristics'].apply(
    lambda x: [part.strip(" '\"") for part in x[1:-1].split(',')] if isinstance(x, str) else x
)

In [None]:
# Plot a 2-D heatmap of the bias by pair of characteristics.
fig, ax = plt.subplots(figsize=(10, 10))
# Collect the individual characteristic names. Each entry of the column is a
# list (unhashable), so the names are flattened before being put in a set;
# sorting gives a stable axis order.
# NOTE: this rebinds the notebook-level `categories` used by earlier cells.
categories = sorted({c for pair in positive['Characteristics'] for c in pair})
# Float frame pre-filled with NaN; seaborn masks cells with missing values.
df = pd.DataFrame(np.nan, columns=categories, index=categories, dtype=float)
for i, row in positive.iterrows():
    first, second = row['Characteristics'][0], row['Characteristics'][1]
    df.loc[first, second] = row['Variance']

sns.heatmap(df, annot=True, fmt='.3f', cmap='Blues', ax=ax)