In [1]:
# Packages need to import
import pandas as pd
import numpy as np
import random
import scipy.stats as stats
from sklearn.metrics.pairwise import cosine_similarity
from statsmodels.stats.weightstats import ttest_ind
import pingouin as pg
from sklearn import preprocessing

In [2]:
# Load the clothing-item ranking sheets (two Excel files, read with pandas defaults).
# Later cells read one column per lowercase shopping intent from these frames; each cell
# there is either the literal 'a' (skipped) or a string that encodes two rankings.
df1 = pd.read_excel('with_prefer_a.xlsx') # including user preferences
df2 = pd.read_excel('without_prefer_a.xlsx') # without including user preferences

In [3]:
# Shopping intents to analyse; their lowercase forms are the column names used by later cells
total_intents = [
    'Home',
    'Business Meeting',
    'Gym/Sport',
    'Party',
    'Outdoor',
    'School/Office',
    'Meetup with Friends & Family',
    'Club or Bar',
]
total_intents1 = [name.lower() for name in total_intents]
print(total_intents1)

['home', 'business meeting', 'gym/sport', 'party', 'outdoor', 'school/office', 'meetup with friends & family', 'club or bar']


In [4]:
# Row counts of the two sheets; used as loop bounds when walking the rows by label
users1 = len(df1.index)  # number of users along their preferences
users2 = len(df2.index)  # number of user without their preferences

In [5]:
# Function that converts string into list
#ab = '([1,2,3], [2,1,3])'
def _parse_int_tokens(text):
    """Turn a comma-separated string such as '10, 2,3' into a list of ints."""
    # Spaces are dropped first (as the original parser ignored them); empty tokens,
    # e.g. from an empty list, are skipped.
    return [int(token) for token in text.replace(' ', '').split(',') if token]


def convertToList(ab):
    """Parse a string like '([1,2,3], [2,1,3])' into two lists of ints.

    The first two and last two characters (the '([' and '])' wrappers) are
    stripped and the remainder is split on '], ['. Only the first two lists
    found are used.

    Numbers are tokenised on commas, so multi-digit values such as 10 are
    kept whole instead of being broken into single digits.

    Parameters
    ----------
    ab : str
        Text holding two bracketed, comma-separated integer lists.

    Returns
    -------
    tuple of (list of int, list of int)
        The first and the second list, in that order.

    Raises
    ------
    IndexError
        If the '], [' separator is not present in ``ab``.
    ValueError
        If a token between commas is not an integer.
    """
    parts = ab[2:-2].split('], [')
    first_text = parts[0]
    second_text = parts[1]
    return _parse_int_tokens(first_text), _parse_int_tokens(second_text)
#convertToList(ab)

In [6]:
# Get the Spearman rank correlation for one intent,.....with the incorporation of shopping preferences
# gym/sport, outdoor, business meeting....#17 for party, #9 for school/office, #2,#9,#17 for home, #9,#67 meetup with friends & family, #17 for club or bar
def per_col_dist1(j):
    """Spearman rank correlation per row for column ``j`` of ``df1``.

    Rows whose cell equals the literal 'a' are skipped. Every other cell is
    parsed with ``convertToList`` into two rankings, which are compared with
    ``scipy.stats.spearmanr``; only the correlation coefficient is kept.

    Parameters
    ----------
    j : column label in ``df1``

    Returns
    -------
    list
        One coefficient per non-skipped row, in row order.
    """
    coefficients = []
    for row in range(users1):
        cell = df1.loc[row, j]
        if cell == 'a':
            continue
        first_ranking, second_ranking = convertToList(cell)
        rho, _ = stats.spearmanr(first_ranking, second_ranking)
        coefficients.append(rho)
    return coefficients
#per_col_dist1('club or bar')

In [7]:
# It calculates distances for all shopping intents with the incorporation of shopping preferences
def all_cols_dist1(total_intents1):
    """Run ``per_col_dist1`` for every intent.

    Parameters
    ----------
    total_intents1 : iterable of column labels

    Returns
    -------
    list
        ``[intent, coefficients]`` pairs (two-element lists), in input order.
    """
    return [[intent, per_col_dist1(intent)] for intent in total_intents1]
#all_cols_dist1(total_intents1)

In [8]:
# Generate a dataframe with all distances for all intents with the incorporation of shopping preferences
# List of [intent, per-row Spearman coefficients] pairs computed from df1 (with preferences)
score_list1 = all_cols_dist1(total_intents1)
def make_scores_df1(score_list):
    """Build a wide score table from ``[intent, scores]`` pairs.

    Each pair becomes one column named ``"<intent> with"``. Columns of
    unequal length are padded with NaN by ``pd.concat``. An ``id`` column
    numbering the rows from 0 is appended so the table can be merged with
    its "without" counterpart.

    The ``id`` range is derived from the table length instead of a fixed
    row count, so the function works for any number of rows.

    Parameters
    ----------
    score_list : list of [str, list of float]

    Returns
    -------
    pandas.DataFrame
        One column per intent plus a trailing ``id`` column.
    """
    columns = [pd.Series(entry[1]) for entry in score_list]
    headers = [f"{entry[0]} with" for entry in score_list]
    scores_df = pd.concat(columns, axis=1)
    scores_df.columns = headers
    return scores_df.assign(id=np.arange(len(scores_df)))
#make_scores_df1(score_list1)

In [9]:
# Get the Spearman rank correlation for one intent,.....without incorporating shopping preferences
# gym/sport, outdoor, business meeting....#17 for party, #9 for school/office, #2,#9,#17 for home, #9,#67 meetup with friends & family, #17 for club or bar
def per_col_dist2(j):
    """Spearman rank correlation per row for column ``j`` of ``df2``.

    Rows whose cell equals the literal 'a' are skipped. Every other cell is
    parsed with ``convertToList`` into two rankings, which are compared with
    ``scipy.stats.spearmanr``; only the correlation coefficient is kept.

    Parameters
    ----------
    j : column label in ``df2``

    Returns
    -------
    list
        One coefficient per non-skipped row, in row order.
    """
    coefficients = []
    for row in range(users2):
        cell = df2.loc[row, j]
        if cell == 'a':
            continue
        first_ranking, second_ranking = convertToList(cell)
        rho, _ = stats.spearmanr(first_ranking, second_ranking)
        coefficients.append(rho)
    return coefficients
#per_col_dist2('club or bar')

In [10]:
# It calculates distances for all shopping intents without incorporating shopping preferences
def all_cols_dist2(total_intents1):
    """Run ``per_col_dist2`` for every intent.

    Parameters
    ----------
    total_intents1 : iterable of column labels

    Returns
    -------
    list
        ``[intent, coefficients]`` pairs (two-element lists), in input order.
    """
    return [[intent, per_col_dist2(intent)] for intent in total_intents1]
#all_cols_dist2(total_intents1)

In [11]:
# Generate a dataframe with all distances for all intents without incorporating shopping preferences
# List of [intent, per-row Spearman coefficients] pairs computed from df2 (without preferences)
score_list2 = all_cols_dist2(total_intents1)
def make_scores_df2(score_list):
    """Build a wide score table from ``[intent, scores]`` pairs.

    Each pair becomes one column named ``"<intent> without"``. Columns of
    unequal length are padded with NaN by ``pd.concat``. An ``id`` column
    numbering the rows from 0 is appended so the table can be merged with
    its "with" counterpart.

    The ``id`` range is derived from the table length instead of a fixed
    row count, so the function works for any number of rows.

    Parameters
    ----------
    score_list : list of [str, list of float]

    Returns
    -------
    pandas.DataFrame
        One column per intent plus a trailing ``id`` column.
    """
    columns = [pd.Series(entry[1]) for entry in score_list]
    headers = [f"{entry[0]} without" for entry in score_list]
    scores_df = pd.concat(columns, axis=1)
    scores_df.columns = headers
    return scores_df.assign(id=np.arange(len(scores_df)))
#make_scores_df2(score_list2)


In [12]:
# Merge the with and without user preferences based outcomes
df11 = make_scores_df1(score_list1)
df12 = make_scores_df2(score_list2)
# Right join on the row id keeps every row of the "without" table; the helper
# id column is dropped again once the two tables are aligned.
final_scores_df1 = df11.merge(df12, how='right', on='id').drop(columns=['id'])
final_scores_df1

Unnamed: 0,home with,business meeting with,gym/sport with,party with,outdoor with,school/office with,meetup with friends & family with,club or bar with,home without,business meeting without,gym/sport without,party without,outdoor without,school/office without,meetup with friends & family without,club or bar without
0,-0.028571,0.6,1.0,-0.6,0.428571,0.2,-0.2,-0.5,1.0,-0.5,0.5,1.0,1.0,0.6,0.657143,1.0
1,1.0,1.0,-0.4,0.5,0.75,0.2,1.0,1.0,-0.4,-0.8,1.0,0.2,0.4,0.8,-0.8,-0.4
2,1.0,1.0,1.0,1.0,1.0,-0.5,0.5,1.0,0.0,0.4,-0.4,-0.7,-0.4,-0.5,1.0,-1.0
3,-0.5,1.0,1.0,0.485714,-0.6,1.0,1.0,1.0,0.8,-0.4,0.8,0.8,0.828571,0.6,-0.085714,-0.5
4,-0.5,0.7,1.0,1.0,1.0,-0.5,0.6,0.714286,-0.4,0.8,1.0,-0.714286,0.3,1.0,-0.5,0.885714
5,0.2,0.3,1.0,0.542857,0.5,1.0,0.833333,1.0,1.0,0.2,-0.6,0.3,1.0,-0.2,-0.8,-0.2
6,-0.5,1.0,1.0,1.0,0.0,1.0,-0.2,-0.6,0.8,1.0,0.4,1.0,0.2,0.6,1.0,-0.6
7,1.0,1.0,0.7,-1.0,-0.1,-0.3,1.0,0.4,0.416667,1.0,-0.6,0.0,0.857143,0.5,0.4,1.0
8,0.2,1.0,0.2,1.0,1.0,0.2,0.8,0.5,0.3,-0.285714,0.7,-0.657143,-1.0,0.9,1.0,1.0
9,-0.1,0.2,-0.2,1.0,1.0,1.0,1.0,0.8,0.542857,1.0,0.9,-0.4,0.1,-0.9,0.542857,0.8


In [13]:
# Save all calculated (un-normalised) scores of the merged table to a CSV file.
# to_csv is called with its defaults, so the row index is written as the first column.
final_scores_df1.to_csv('scores_spearman.csv')

In [14]:
# Calculate overall mean and standard deviation for with and without inclusion of user preferences
# First eight columns = the per-intent score columns (the trailing id column is excluded)
df11_overall = df11.iloc[:, :8]
df12_overall = df12.iloc[:, :8]

# NaN-aware statistics pooled over all cells of each table
p_mean, p_std = np.nanmean(df11_overall), np.nanstd(df11_overall)
q_mean, q_std = np.nanmean(df12_overall), np.nanstd(df12_overall)

for pooled_mean, pooled_std in ((p_mean, p_std), (q_mean, q_std)):
    print(f" overall mean is {pooled_mean}")
    print(f" overall standard deviation is {pooled_std}")

 overall mean is 0.5686177248677249
 overall standard deviation is 0.5611352914556083
 overall mean is 0.21190392833525054
 overall standard deviation is 0.6415099691262338


In [15]:
# Normalize data
# Min-max scale every column of the merged score table.
# `preprocessing` comes from the import cell at the top of the notebook, so it is
# not re-imported here.
x = final_scores_df1.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# Re-attach the original column labels while building the frame
final_scores_df = pd.DataFrame(x_scaled, columns=final_scores_df1.columns)
print(final_scores_df)

    home with  business meeting with  gym/sport with  party with  \
0    0.428571               0.733333        1.000000    0.200000   
1    1.000000               1.000000        0.222222    0.750000   
2    1.000000               1.000000        1.000000    1.000000   
3    0.166667               1.000000        1.000000    0.742857   
4    0.166667               0.800000        1.000000    1.000000   
5    0.555556               0.533333        1.000000    0.771429   
6    0.166667               1.000000        1.000000    1.000000   
7    1.000000               1.000000        0.833333    0.000000   
8    0.555556               1.000000        0.555556    1.000000   
9    0.388889               0.466667        0.333333    1.000000   
10   0.722222               1.000000        1.000000    1.000000   
11   0.777778               0.000000        1.000000    1.000000   
12   1.000000               0.333333        1.000000    0.700000   
13   1.000000               1.000000        1.00

In [16]:
# Save the normalised scores to a CSV file (to_csv defaults, so the row index is written too)
final_scores_df.to_csv('norm_data_spearman.csv')

In [17]:
# Calculate the average and standard deviation values for calculated rank measure
for column in final_scores_df1.columns:
    # Missing entries (NaN padding from the merge) are ignored
    values = np.asarray(final_scores_df1[column].dropna().tolist())
    print(f"length of {column} is {values.size}")
    print(f"average of {column} is {values.mean()}")
    print(f"std of {column} is {values.std()}")
   

length of home with is 52
average of home with is 0.4603021978021979
std of home with is 0.6131752892640786
length of business meeting with is 43
average of business meeting with is 0.6122369878183831
std of business meeting with is 0.49349899661734764
length of gym/sport with is 31
average of gym/sport with is 0.6230414746543779
std of gym/sport with is 0.5343337347423156
length of party with is 36
average of party with is 0.5157407407407407
std of party with is 0.67530851962627
length of outdoor with is 49
average of outdoor with is 0.5862973760932945
std of outdoor with is 0.45848662906913756
length of school/office with is 26
average of school/office with is 0.628021978021978
std of school/office with is 0.5618493902733734
length of meetup with friends & family with is 18
average of meetup with friends & family with is 0.7042328042328043
std of meetup with friends & family with is 0.4913945501350888
length of club or bar with is 33
average of club or bar with is 0.541991341991342
s

In [18]:
# Calculate an example t-test for shopping intent of "Business Meeting"
# Look the two columns up directly: a missing column now raises KeyError instead of
# silently reusing whatever `a` / `b` were left over from an earlier cell.
# Each sample is the non-missing values of its column, capped at the first 51.
a = list(final_scores_df1['business meeting with'].dropna())[0:51]
b = list(final_scores_df1['business meeting without'].dropna())[0:51]
result = pg.ttest(a,b,correction=True)
print(result)

              T        dof alternative     p-val         CI95%   cohen-d  \
T-test  2.42464  81.845997   two-sided  0.017526  [0.05, 0.53]  0.518609   

         BF10     power  
T-test  2.816  0.666813  


In [19]:
# It calculates the t-test results for all shopping intents
t_test_result = []
x = final_scores_df1.columns
# The first half of the columns are the "... with" scores and the second half the
# matching "... without" scores, in the same intent order.
n_intents = len(x) // 2
# Local, seeded generator: the sub-sampling is reproducible across runs and the
# global `random` state is left untouched.
rng = random.Random(42)
for k in range(n_intents):
    a_values = final_scores_df1[x[k]].dropna().values.tolist()
    b_values = final_scores_df1[x[k + n_intents]].dropna().values.tolist()
    # Both groups are cut down to the size of the smaller one
    n = min(len(a_values), len(b_values))
    a_sample = rng.sample(a_values, n)
    b_sample = rng.sample(b_values, n)
    result = pg.ttest(a_sample, b_sample, correction=True)
    # Output layout: sample size followed by the test result, once per intent
    t_test_result.append(n)
    t_test_result.append(result)
t_test_result

[52,
                T         dof alternative     p-val         CI95%   cohen-d  \
 T-test  2.069607  101.724312   two-sided  0.041023  [0.01, 0.48]  0.405883   
 
          BF10    power  
 T-test  1.372  0.53594  ,
 43,
                T       dof alternative     p-val         CI95%  cohen-d  \
 T-test  2.303893  80.04355   two-sided  0.023822  [0.04, 0.52]  0.49687   
 
          BF10    power  
 T-test  2.217  0.62459  ,
 26,
                T        dof alternative     p-val         CI95%   cohen-d  \
 T-test  2.323585  49.091238   two-sided  0.024332  [0.05, 0.72]  0.644446   
 
          BF10     power  
 T-test  2.432  0.625081  ,
 36,
                T        dof alternative     p-val         CI95%   cohen-d  \
 T-test  2.816522  69.999053   two-sided  0.006303  [0.13, 0.78]  0.663861   
 
          BF10     power  
 T-test  6.629  0.793243  ,
 49,
                T        dof alternative     p-val        CI95%   cohen-d  \
 T-test  2.854325  87.627493   two-sided  0.005382  

In [22]:
# Write, for every shopping intent, the size, mean and standard deviation of the
# equal-sized samples drawn from the normalised "with" and "without" scores to a text file
x = final_scores_df.columns
# The first half of the columns are the "... with" scores and the second half the
# matching "... without" scores, in the same intent order.
n_intents = len(x) // 2
# Local, seeded generator: the sub-sampling is reproducible across runs and the
# global `random` state is left untouched.
rng = random.Random(42)
# NOTE(review): mode "a" appends, so re-running this cell adds another copy of the
# results to the file - confirm this is intended, otherwise switch to "w".
# The context manager closes the file even if a step inside the loop raises.
with open("results_for_t_test_after_sampling_spearman.txt", "a") as f:
    for k in range(n_intents):
        i = k
        j = k + n_intents
        a_values = final_scores_df[x[i]].dropna().values.tolist()
        b_values = final_scores_df[x[j]].dropna().values.tolist()
        # Both groups are cut down to the size of the smaller one
        n = min(len(a_values), len(b_values))
        a_sample = np.array(rng.sample(a_values, n))
        b_sample = np.array(rng.sample(b_values, n))
        a_mean = a_sample.mean()
        a_std = a_sample.std()
        b_mean = b_sample.mean()
        b_std = b_sample.std()

        f.write(f'{x[i]}\n {n}, {a_mean}, {a_std}\n')
        f.write(f'{x[j]}\n {n}, {b_mean}, {b_std}\n')

In [23]:
# Read the results text file back with read_csv and print its first column.
# NOTE(review): the file holds the sample sizes, means and standard deviations written
# by the cell above, not t-test statistics; read_csv's default header handling also
# takes the file's first line as the header row - confirm this is what is wanted.
df = pd.read_csv('results_for_t_test_after_sampling_spearman.txt')
print(df.iloc[:,0])

 52                                   0.700168    0.340653
home without                          NaN              NaN
 52                                   0.627015    0.280055
business meeting with                 NaN              NaN
 43                                   0.741491    0.328999
business meeting without              NaN              NaN
 43                                   0.660795    0.311793
gym/sport with                        NaN              NaN
 26                                   0.816545    0.268891
gym/sport without                     NaN              NaN
 26                                   0.622802    0.315003
party with                            NaN              NaN
 36                                   0.757870    0.337654
party without                         NaN              NaN
 36                                   0.537500    0.322834
outdoor with                          NaN              NaN
 49                                   0.741436    0.2865