In [1]:
# Packages need to import
import pandas as pd
import numpy as np
import random
import scipy.stats as stats
from sklearn.metrics.pairwise import cosine_similarity
from statsmodels.stats.weightstats import ttest_ind
import pingouin as pg
from sklearn import preprocessing

In [2]:
# Load data of clothing items ranking
df1 = pd.read_excel('with_prefer_a.xlsx') # including user preferences
df2 = pd.read_excel('without_prefer_a.xlsx') # without including user preferences

In [3]:
# The shopping intents that have to analyse
total_intents = ['Home', 'Business Meeting', 'Gym/Sport', 'Party', 'Outdoor', 'School/Office', 'Meetup with Friends & Family', 'Club or Bar']
total_intents1 = []
for intent in total_intents:
    total_intents1.append(intent.lower())
print(total_intents1)

['home', 'business meeting', 'gym/sport', 'party', 'outdoor', 'school/office', 'meetup with friends & family', 'club or bar']


In [4]:
users1 = df1.shape[0] # number of users along their preferences
users2 = df2.shape[0] # number of user without their preferences

In [5]:
# Function that converts string into list
#ab = '([1,2,3], [2,1,3])'
def convertToList(ab):
    ab = ab[2:-2]
    ab = ab.split('], [')
    a = ab[0]
    b = ab[1]
    c = []
    for each in a:
        if each != ',' and each != ' ':
            c.append(each)
    c= list(map(int, c))
    d = []
    for each in b:
        if each != ',' and each != ' ':
            d.append(each)
    d= list(map(int, d))
    return c, d
#convertToList(ab)

In [6]:
# Get euclidean distance for one intent,.....with the incorporation of shopping preferences
# gym/sport, outdoor, business meeting....#17 for party, #9 for school/office, #2,#9,#17 for home, #9,#67 meetup with friends & family, #17 for club or bar
def per_col_dist1(j):
    col_tau = []
    for i in range(0, users1):
        if (df1.loc[i, j] == 'a'):
            pass
        else:
            ab = df1.loc[i, j]
            a, b = convertToList(ab)
            tau, p_value = stats.weightedtau(np.array([a]), np.array([b]))
            col_tau.append(tau)
    return col_tau
#per_col_dist1('club or bar')

In [7]:
# It calculates distances for all shopping intents with the incorporation of shopping preferences
def all_cols_dist1(total_intents1):
    per_intent_avg_dist = []
    for i in total_intents1:
        x = per_col_dist1(i)
        per_intent_avg_dist.append([i,x])
    return per_intent_avg_dist
#all_cols_dist1(total_intents1)

In [8]:
# Generate a dataframe with all distances for all intents with the incorporation of shopping preferences
score_list1 = all_cols_dist1(total_intents1)
def make_scores_df1(score_list):
    listing = []
    list_header = []
    for each in score_list:
        intent_wise_df = pd.Series(each[1])
        listing.append(intent_wise_df)
        list_header.append(f"{each[0]} with")
    scores_df = pd.concat(listing, axis=1)
    scores_df.columns = list_header
    scores_df = scores_df.assign(id=np.arange(52))
    return scores_df
#make_scores_df1(score_list1)

In [9]:
# Get euclidean distance for one intent,.....without incorporating shopping preferences
# gym/sport, outdoor, business meeting....#17 for party, #9 for school/office, #2,#9,#17 for home, #9,#67 meetup with friends & family, #17 for club or bar
def per_col_dist2(j):
    col_tau = []
    for i in range(0, users2):
        if (df2.loc[i, j] == 'a'):
            pass
        else:
            ab = df2.loc[i, j]
            a, b = convertToList(ab)
            tau, p_value = stats.weightedtau(np.array([a]), np.array([b]))
            col_tau.append(tau)
    return col_tau
#per_col_dist2('club or bar')

In [10]:
# It calculates distances for all shopping intents without incorporating shopping preferences
def all_cols_dist2(total_intents1):
    per_intent_avg_dist = []
    for i in total_intents1:
        x = per_col_dist2(i)
        per_intent_avg_dist.append([i,x])
    return per_intent_avg_dist
#all_cols_dist2(total_intents1)

In [11]:
# Generate a dataframe with all distances for all intents without incorporating shopping preferences
score_list2 = all_cols_dist2(total_intents1)
def make_scores_df2(score_list):
    listing = []
    list_header = []
    for each in score_list:
        intent_wise_df = pd.Series(each[1])
        listing.append(intent_wise_df)
        list_header.append(f"{each[0]} without")
    scores_df = pd.concat(listing, axis=1)
    scores_df.columns = list_header
    scores_df = scores_df.assign(id=np.arange(56))
    return scores_df
#make_scores_df2(score_list2)


In [12]:
# Merge the with and without user preferences based outcomes
df11 = make_scores_df1(score_list1)
df12 = make_scores_df2(score_list2)
final_scores_df1 = df11.merge(df12, how='right', on='id')
final_scores_df1 = final_scores_df1.drop(['id'], axis=1)
final_scores_df1

Unnamed: 0,home with,business meeting with,gym/sport with,party with,outdoor with,school/office with,meetup with friends & family with,club or bar with,home without,business meeting without,gym/sport without,party without,outdoor without,school/office without,meetup with friends & family without,club or bar without
0,0.019048,0.19708,1.0,-0.427007,0.6,-0.173333,-0.262774,-0.363636,1.0,-0.363636,0.09854,1.0,1.0,0.656934,0.763265,1.0
1,1.0,1.0,-0.30292,0.578467,0.520049,0.437956,1.0,1.0,-0.293333,-0.733333,1.0,0.306667,0.533333,0.52,-0.666667,-0.30292
2,1.0,1.0,1.0,1.0,1.0,-0.363636,0.545455,1.0,0.0,0.533333,-0.44,-0.655109,-0.293333,-0.363636,1.0,-1.0
3,-0.430657,1.0,1.0,0.47619,-0.568707,1.0,1.0,1.0,0.813333,-0.293333,0.813333,0.52,0.828571,0.441606,-0.126531,-0.363636
4,-0.363636,0.452555,1.0,1.0,1.0,-0.363636,0.656934,0.489796,-0.44,0.813333,1.0,-0.495238,0.213504,1.0,-0.363636,0.790476
5,0.306667,0.452555,1.0,0.280272,0.181818,1.0,0.526938,1.0,1.0,-0.173333,-0.427007,0.213504,1.0,-0.209524,-0.733333,-0.066667
6,-0.363636,1.0,1.0,1.0,0.0,1.0,-0.066667,-0.333333,0.52,1.0,0.533333,1.0,-0.173333,0.441606,1.0,-0.45034
7,1.0,1.0,0.680657,-1.0,-0.007299,-0.162409,1.0,0.2,0.528879,1.0,-0.354015,0.0,0.52862,0.09854,0.565693,1.0
8,0.306667,1.0,-0.173333,1.0,1.0,0.371429,0.52,0.181818,0.114964,-0.146465,0.680657,-0.518367,-1.0,0.671533,1.0,1.0
9,-0.149635,0.306667,-0.066667,1.0,1.0,1.0,1.0,0.543796,0.689796,1.0,0.817518,-0.293333,0.284672,-0.844891,0.451701,0.733333


In [13]:
# Save the all calculated scores in a file 
final_scores_df1.to_csv('scores_wtau.csv')

In [14]:
# Calculate overall mean and standard devaition for with and without inclusion of user preferences
df11_overall = df11.iloc[:,0:8]
p_mean = np.nanmean(df11_overall) 
p_std = np.nanstd(df11_overall)
print(f" overall mean is {p_mean}")
print(f" overall standard deviation is {p_std}")
df12_overall = df12.iloc[:,0:8]
q_mean = np.nanmean(df12_overall) 
q_std = np.nanstd(df12_overall)
print(f" overall mean is {q_mean}")
print(f" overall standard deviation is {q_std}")

 overall mean is 0.5623621483809254
 overall standard deviation is 0.5355475258933694
 overall mean is 0.2127263232020593
 overall standard deviation is 0.5933960332391564


In [15]:
# Normalize data
x = final_scores_df1.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
final_scores_df = pd.DataFrame(x_scaled)
final_scores_df.columns= final_scores_df1.columns
print(final_scores_df)

    home with  business meeting with  gym/sport with  party with  \
0    0.434066               0.476190        1.000000    0.286496   
1    1.000000               1.000000        0.248316    0.789234   
2    1.000000               1.000000        1.000000    1.000000   
3    0.174621               1.000000        1.000000    0.738095   
4    0.213287               0.642857        1.000000    1.000000   
5    0.600000               0.642857        1.000000    0.640136   
6    0.213287               1.000000        1.000000    1.000000   
7    1.000000               1.000000        0.815764    0.000000   
8    0.600000               1.000000        0.323077    1.000000   
9    0.336749               0.547683        0.384615    1.000000   
10   0.756808               1.000000        1.000000    1.000000   
11   0.627316               0.110390        1.000000    1.000000   
12   1.000000               0.347619        1.000000    0.600000   
13   1.000000               1.000000        1.00

In [16]:
final_scores_df.to_csv('norm_data_wtau.csv')

In [17]:
# Calculate the average and standard deviation values for calculated rank measure
for each in final_scores_df1.columns:
    a = list(final_scores_df1[each].dropna())
    a = np.array(a)    
    print(f"length of {each} is {a.size}")     
    print(f"average of {each} is {a.mean()}")
    print(f"std of {each} is {a.std()}")
   

length of home with is 52
average of home with is 0.4699902041621635
std of home with is 0.566444382448706
length of business meeting with is 43
average of business meeting with is 0.6033305936398874
std of business meeting with is 0.4869408979084124
length of gym/sport with is 31
average of gym/sport with is 0.6105814428432074
std of gym/sport with is 0.5289045866610831
length of party with is 36
average of party with is 0.5188684723084331
std of party with is 0.639120005076046
length of outdoor with is 49
average of outdoor with is 0.5543436862144893
std of outdoor with is 0.465136249243258
length of school/office with is 26
average of school/office with is 0.6410866020104247
std of school/office with is 0.5307681917287927
length of meetup with friends & family with is 18
average of meetup with friends & family with is 0.6907230809466767
std of meetup with friends & family with is 0.4612219288928803
length of club or bar with is 33
average of club or bar with is 0.536551367445291
std

In [18]:
# Calculate an example t-test for shopping intent of "Business Meeting"
for each in final_scores_df1.columns:
    if each=='business meeting with':
        a = list(final_scores_df1[each].dropna())
    if each=='business meeting without':
        b = list(final_scores_df1[each].dropna())
result = pg.ttest(a,b,correction=True)
print(result)

               T        dof alternative     p-val         CI95%   cohen-d  \
T-test  2.476893  83.804076   two-sided  0.015263  [0.06, 0.51]  0.530257   

         BF10   power  
T-test  3.137  0.6861  


In [19]:
# It calculates the t-test results for all shopping intents
t_test_result = []
x = final_scores_df1.columns
i = 0; j = 8
for k in range(0, 8):
    a_size = final_scores_df1[x[i]].dropna().values.size
    b_size = final_scores_df1[x[j]].dropna().values.size
    if a_size <= b_size:
        a_sample = random.sample(final_scores_df1[x[i]].dropna().values.tolist(), a_size)
        b_sample = random.sample(final_scores_df1[x[j]].dropna().values.tolist(), a_size)
        a1_size = a_size
        b1_size = a_size
        result = pg.ttest(a_sample,b_sample,correction=True)
        t_test_result.append(a1_size)
        t_test_result.append(result)
    elif a_size > b_size:
        a_sample = random.sample(final_scores_df1[x[i]].dropna().values.tolist(), b_size)
        b_sample = random.sample(final_scores_df1[x[j]].dropna().values.tolist(), b_size)
        a1_size = b_size
        b1_size = b_size
        result = pg.ttest(a_sample,b_sample,correction=True)
        t_test_result.append(a1_size)
        t_test_result.append(result)
    i = i + 1
    j = j + 1
t_test_result


[52,
                T         dof alternative     p-val         CI95%   cohen-d  \
 T-test  1.714354  101.440576   two-sided  0.089517  [-0.03, 0.4]  0.336213   
 
          BF10     power  
 T-test  0.762  0.396884  ,
 43,
                T        dof alternative     p-val         CI95%   cohen-d  \
 T-test  2.329678  82.456274   two-sided  0.022269  [0.04, 0.49]  0.502431   
 
          BF10     power  
 T-test  2.332  0.634217  ,
 26,
                T        dof alternative     p-val         CI95%   cohen-d  \
 T-test  2.470045  48.654525   two-sided  0.017065  [0.07, 0.68]  0.685067   
 
          BF10     power  
 T-test  3.193  0.678115  ,
 36,
                T        dof alternative     p-val         CI95%   cohen-d  \
 T-test  2.877351  69.953691   two-sided  0.005313  [0.13, 0.73]  0.678198   
 
          BF10     power  
 T-test  7.619  0.809948  ,
 49,
                T        dof alternative     p-val         CI95%   cohen-d  \
 T-test  3.297008  89.843857   two-sided  0

In [20]:
# It makes a text file incorporating all results of t-test calculation
f = open("results_for_t_test_after_sampling_wtau.txt", "a")
x = final_scores_df.columns
i = 0; j = 8
for k in range(0, 8):
    a_size = final_scores_df[x[i]].dropna().values.size
    b_size = final_scores_df[x[j]].dropna().values.size
    if a_size <= b_size:
        a_sample = random.sample(final_scores_df[x[i]].dropna().values.tolist(), a_size)
        b_sample = random.sample(final_scores_df[x[j]].dropna().values.tolist(), a_size)
        a_mean = np.array(a_sample).mean()
        a_std = np.array(a_sample).std()
        b_mean = np.array(b_sample).mean()
        b_std = np.array(b_sample).std()
        a1_size = a_size
        b1_size = a_size
    elif a_size > b_size:
        a_sample = random.sample(final_scores_df[x[i]].dropna().values.tolist(), b_size)
        b_sample = random.sample(final_scores_df[x[j]].dropna().values.tolist(), b_size)
        a_mean = np.array(a_sample).mean()
        a_std = np.array(a_sample).std()
        b_mean = np.array(b_sample).mean()
        b_std = np.array(b_sample).std()
        a1_size = b_size
        b1_size = b_size
    f.write(f'{x[i]}\n {a1_size}, {a_mean}, {a_std}\n')
    f.write(f'{x[j]}\n {b1_size}, {b_mean}, {b_std}\n')
    i = i + 1
    j = j + 1
f.close()

In [21]:
# It prints file content containing t-test results
df = pd.read_csv('results_for_t_test_after_sampling_wtau.txt')
print(df.iloc[:,0])

 52                                   0.694225    0.326795
home without                          NaN              NaN
 52                                   0.629204    0.260872
business meeting with                 NaN              NaN
 43                                   0.741220    0.317671
business meeting without              NaN              NaN
 43                                   0.656698    0.283248
gym/sport with                        NaN              NaN
 26                                   0.752841    0.321726
gym/sport without                     NaN              NaN
 26                                   0.613496    0.290122
party with                            NaN              NaN
 36                                   0.759434    0.319560
party without                         NaN              NaN
 36                                   0.531481    0.317984
outdoor with                          NaN              NaN
 49                                   0.715909    0.2965