In [9]:
import pandas as pd

#import candy data
df = pd.read_csv("candyhierarchy2017UTF8.csv")

In [11]:
#Only keep the candy columns
df2 = df.iloc[:,6:109]
df2.head()

#remove the Q6 from the column names
df2.columns = [name.replace('Q6 | ', '') for name in df2.columns.values.tolist()]

#count the occurances of "Despair", "Joy" and "Meh" in each column
ratings = df2.apply(pd.Series.value_counts)

In [12]:
#calculate joy - despair for each column
l = [ratings[col][1]-ratings[col][0] for col in ratings]
s = pd.Series(l, index=ratings.columns, name="SCORE")
ratings = ratings.drop(['JOY','DESPAIR','MEH'])
ratings = ratings.append(s)

In [13]:
#transpose the matrix so the candies are the rows
ratings=ratings.T

#sort the column from most joyful to least
#show the top 5 candies
ratings = ratings.sort_values(by=['SCORE'], ascending=[False])
ratings.head()

Unnamed: 0,SCORE
Any full-sized candy bar,1542
ReeseÕs Peanut Butter Cups,1403
Kit Kat,1395
"Cash, or other forms of legal tender",1374
Twix,1342


In [2]:
#For each survey participant, count the number of candies that bring despair and subtract this from the number of candies
#that bring joy.  Call this the "love of candy".  Group participants into male and female.

from collections import Counter

def get_counts(gender):
    return [Counter(df.iloc[i,6:109]) if df.iloc[i,2]==gender else -1 for i in range(len(df))]

def calculate_love_of_candy(counts):
    summary = []
    for i in range(len(counts)):
        if counts[i]!=-1:
            summary.append(counts[i]['JOY']-counts[i]['DESPAIR'])
    return summary
        
male_love_of_candy = calculate_love_of_candy(get_counts("Male"))
female_love_of_candy = calculate_love_of_candy(get_counts("Female"))

In [3]:
#Calculate the mean love of candy for males and females, as well as the standard deviation in love of candy

def mean(summary):
    return sum(summary)/len(summary)

def stdev(summary):
    m = mean(summary)
    variance = sum([(x_i - m)**2 for x_i in summary]) / (len(summary)-1)
    return variance**0.5

print("male love of candy mean: " + str(mean(male_love_of_candy)))
print("female love of candy mean: " + str(mean(female_love_of_candy)))
print("male love of candy standard deviation: " + str(stdev(male_love_of_candy)))
print("female love of candy standard deviation: " + str(stdev(female_love_of_candy)))

male love of candy mean: 2.052488070892979
female love of candy mean: 3.0762812872467222
male love of candy standard deviation: 19.561268327144244
female love of candy standard deviation: 18.742098350873498


In [5]:
#Run a t test to see if there is any significant difference in the love_of_candy between males and females

t_stat = (mean(female_love_of_candy) - mean(male_love_of_candy)) / (stdev(male_love_of_candy)**2/len(male_love_of_candy) + stdev(female_love_of_candy)**2/len(female_love_of_candy))**0.5
d_of_f = len(male_love_of_candy)+len(female_love_of_candy)-2

print("t statistic: " + str(t_stat))
print("degress of freedom: " + str(d_of_f))
print("p value: 0.214363")
print("The results show no significant difference in the love of candy between males and females")

t statistic: 1.241982531670616
degress of freedom: 2304
p value: 0.214363
The results show no significant difference in the love of candy between males and females
