# Data-Driven Organisations 

## Imports

In [1]:
# import pandas module 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

## Prepare Data

In [2]:
# load the two Excel workbooks used by this notebook
df_header = pd.read_excel('data/question_headers.xlsx')  # question header file
df = pd.read_excel('data/output.xlsx')  # questionnaire data analysed in the cells below
# df.dropna()  # left disabled: missing values are dropped per analysis instead

In [3]:
# df_header.iloc[1].to_list()

In [4]:
df.describe()

Unnamed: 0.1,Unnamed: 0,resp_id,colle_id,email,first_name,last_name,custom_data,d_tools,d_capabilities,d_culture,...,sy_fairness_to_trust,sy_trust_to_adoption,sy_lack_trust_low_adoption,ai_transparency_to_trust,ai_accountability_to_trust,ai_fairness_to_trust,ai_trust_to_adoption,ai_lack_trust_low_adoption,ai_must_be_trans_expl,ai_fat_to_trust
count,138.0,138.0,138.0,0.0,0.0,0.0,0.0,138.0,138.0,138.0,...,136.0,136.0,136.0,135.0,134.0,135.0,134.0,135.0,135.0,135.0
mean,69.5,12889640000.0,404063954.0,,,,,4.231884,4.434783,4.362319,...,4.264706,4.551471,4.595588,4.081481,4.365672,4.185185,4.485075,4.562963,4.466667,4.562963
std,39.981246,4013827.0,0.0,,,,,0.85683,0.724509,0.782491,...,0.680254,0.58144,0.744149,0.970089,0.710195,0.848046,0.597426,0.59359,0.620544,0.580882
min,1.0,12886370000.0,404063954.0,,,,,1.0,1.0,1.0,...,2.0,2.0,1.0,1.0,2.0,1.0,3.0,2.0,2.0,2.0
25%,35.25,12886570000.0,404063954.0,,,,,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
50%,69.5,12889140000.0,404063954.0,,,,,4.0,5.0,4.0,...,4.0,5.0,5.0,4.0,4.0,4.0,5.0,5.0,5.0,5.0
75%,103.75,12889390000.0,404063954.0,,,,,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
max,138.0,12897840000.0,404063954.0,,,,,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


## AI and Trust H

<strong>Metric</strong><br>
The questionnaire measured how important respondents consider three relationships: FAT (fairness, accountability and transparency) and trust, trust and increased adoption, and FAT and adoption.
<br>

<img src='images/TOEFAT_H.jpg'>

<strong>Objective H4</strong><br>
Question: In the context of AI, and from a product owner's perspective, is trustworthiness an important contributor to the adoption of the system?
Check whether trust is an important contributor towards the adoption of AI.
<br>

ai_trust_to_adoption = Extremely important and Very important<br>
ai_trust_not_to_adoption = Somewhat important and Not so important and Not at all important

<strong>Hypothesis</strong><br>
H0: ai_trust_to_adoption - ai_trust_not_to_adoption = 0 <br>
H4: ai_trust_to_adoption - ai_trust_not_to_adoption > 0 <br>  

#### 3.3.0 Prepare data

In [None]:
df[['ai_fairness_to_trust']].hist(bins=20)

In [None]:
# recoding tables: scores 4-5 become 1 in 'isnb', scores 1-3 become 1 in 'nonb'
# (presumably 5 = "Extremely important" ... 1 = "Not at all important" - confirm coding)
di_isnb = {5: 1, 4: 1, 3: 0, 2: 0, 1: 0}
di_nonb = {5: 0, 4: 0, 3: 1, 2: 1, 1: 1}

# NOTE(review): the H4 text above talks about 'ai_trust_to_adoption', but this
# cell analyses 'ai_fairness_to_trust' - confirm which column is intended.
df_new = df[['ai_fairness_to_trust']].copy()

# duplicate the raw score twice so each copy can be recoded independently
df_new['isnb'] = df_new['ai_fairness_to_trust']
df_new['nonb'] = df_new['ai_fairness_to_trust']

# recode both indicator columns, then discard respondents without an answer
df_new = (
    df_new
    .replace({"isnb": di_isnb})
    .replace({"nonb": di_nonb})
    .dropna()
)

# show the first rows of the prepared data
df_new.head()

In [None]:
df_new[['isnb']].hist(bins=20)

#### 3.3.1 Compute the observed difference

In [None]:
# observed statistic: number of rows flagged in 'isnb' minus number flagged in 'nonb'
n_isnb = df_new['isnb'].sum()
n_nonb = df_new['nonb'].sum()
obs_diff = n_isnb - n_nonb
print(obs_diff)

#### 3.3.2 Simulate the sampling distribution.

In [None]:
# bootstrap the sampling distribution of the difference between the number of
# 'isnb' and 'nonb' rows: resample df_new with replacement 10000 times
n_rows = df_new.shape[0]
diffs = []
for _ in range(10000):
    df_sample = df_new.sample(n=n_rows, replace=True)
    diffs.append(df_sample['isnb'].sum() - df_sample['nonb'].sum())

In [None]:
# turn the list of bootstrap differences into an array and show its spread
diffs = np.asarray(diffs)
np.std(diffs)

In [None]:
# plot sampling distribution
plt.hist(diffs);
plt.axvline(x=obs_diff, color='red');

#### 3.3.3 Simulate the distribution under the null hypothesis

In [None]:
# simulate distribution under the null hypothesis
null_vals = np.random.normal(0, diffs.std(), diffs.size)

In [None]:
# plot null distribution
plt.hist(null_vals);

# plot line for observed statistic
plt.axvline(x=obs_diff, color='red');

#### 3.3.4 Compute the p-value

In [None]:
# one-sided p-value: fraction of simulated null draws larger than the observed difference
exceeds_observed = null_vals > obs_diff
p_value = np.mean(exceeds_observed)
print(p_value)

# extra print that tells an exact zero apart from a tiny positive p-value
if p_value > 0:
    print('greater than 0')

In [None]:
# decision at the 0.01 threshold
verdict = 'reject H0' if p_value < 0.01 else 'fail to reject H0'
print(verdict)

In [None]:
df_corr = df.drop(['resp_id'], axis=1)
df_corr.hist(figsize=(15, 40));

In [None]:
# correlate numeric columns only: df also holds free-text answers, and since
# pandas 2.0 DataFrame.corr() no longer drops non-numeric columns silently
df.select_dtypes(include='number').corr()

In [None]:
import seaborn as sns

In [None]:
# the seven AI trust / adoption items
ai_columns = [
    'ai_transparency_to_trust',
    'ai_accountability_to_trust',
    'ai_fairness_to_trust',
    'ai_trust_to_adoption',
    'ai_lack_trust_low_adoption',
    'ai_must_be_trans_expl',
    'ai_fat_to_trust',
]
df_ai = df[ai_columns]

# pairwise correlations drawn as a heatmap on a fixed -1..1 colour scale centred at 0
corr = df_ai.corr()
palette = sns.diverging_palette(20, 220, n=300)
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap=palette, square=True)

# slant the x tick labels so the long column names stay readable
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right');

In [None]:
df_ai.corr()

In [None]:
df

In [None]:
# function to create word cloud 
def create_wordcloud(df_cloud, i_column):
    """ 
    creates and displays a wordcloud based on a dataframe column as input. 

    Rows without a value in the column are ignored. The remaining values are
    joined into one text, the default and the custom stopwords are removed
    and a cloud of at most 30 words is drawn with matplotlib.

    Parameters: 
    df_cloud (data frame): dataframe containing data
    i_column (string): name of the column whose text is used for the cloud

    Returns: 
    None: the wordcloud is shown, nothing is returned

    """

    # keep only the answered rows; cast to str so that non-text cells
    # (e.g. an answer that was read as a number) do not break the join
    answers = df_cloud[i_column].dropna().astype(str)

    # put all answers into one text
    text = " ".join(answers)

    # default stopwords plus words that carry no meaning for this questionnaire
    stopwords = set(STOPWORDS)
    stopwords.update(["AI", "system", "organization", 'application', 'one',  "will", "need", 'without', 'something', 'make',
                       'based', 'stay', 'add', 'day', 'still'])

    # Create and generate a word cloud image:
    wordcloud = WordCloud(stopwords=stopwords, max_font_size=50, max_words=30, background_color="white").generate(text)

    # Display the generated image:
    plt.figure(figsize=(30, 20))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    
create_wordcloud(df, 'ai_ethical_considerations')    

In [None]:
create_wordcloud(df, 'm_how_to_improve')    

In [None]:
create_wordcloud(df, 'mo_others')   

In [None]:
create_wordcloud(df, 'ms_what_doing_to_improve')   

In [None]:
create_wordcloud(df, 'ms_what_can_improve')   

In [None]:
create_wordcloud(df, 'what_would_prevent_usage')   

In [None]:
# subset of df with the seven AI trust / adoption items
ai_columns = [
    'ai_transparency_to_trust',
    'ai_accountability_to_trust',
    'ai_fairness_to_trust',
    'ai_trust_to_adoption',
    'ai_lack_trust_low_adoption',
    'ai_must_be_trans_expl',
    'ai_fat_to_trust',
]
df_ai = df[ai_columns]




In [None]:
df_ai.hist(figsize=(15, 10));

In [None]:
# df.loc[:, ['Collector ID'] ]
df.iloc[0:1, 110:] == 'Open-Ended Response'

In [None]:
def plot_q_feedback(icolumn, df_main):
    """ 
    plot the data of a specific dataframe column as a bar chart

    counts the unique 'Respondent ID' values per answer in icolumn, orders the
    answers by the code looked up in codes_dict['Code'] and draws one bar per
    answer. The title is the index label of the df_columns row whose
    'Short Name' equals icolumn.

    Parameters: 
    icolumn (string): the column that needs to be plotted
    df_main (dataframe): the dataframe that is used

    Returns: 
    None: the bar chart is shown with matplotlib

    """
    # NOTE(review): codes_dict and df_columns are globals that are not defined
    # in the visible part of this notebook - confirm they exist before calling.

    # unique respondents per answer, as a one-column frame indexed by the answer
    answer_counts = df_main.groupby(icolumn)['Respondent ID'].nunique().to_frame()

    # copy the answer out of the index, attach its code and order the rows by it
    answer_counts['temp'] = answer_counts.index
    answer_counts['code'] = answer_counts['temp'].map(codes_dict['Code'])
    answer_counts = answer_counts.sort_values(by=['code'])

    # one bar per answer: labels from the second column, counts from the first
    fig = plt.figure(figsize=(10, 3))
    ax = fig.add_axes([0, 0, 1, 1])
    answer_labels = answer_counts.iloc[:, 1]
    respondent_counts = answer_counts.iloc[:, 0]
    ax.bar(answer_labels, respondent_counts)

    # title: index label of the first df_columns row matching this short name
    matching_labels = df_columns[df_columns['Short Name'] == icolumn].index.tolist()
    plt.title(matching_labels[0])
    plt.show()

In [None]:
df.columns[0]

In [None]:
# the loaded frame names this column 'custom_data' (see the describe() output),
# so the original export header 'Custom Data 1' would raise a KeyError here
df['custom_data'].unique()