Two Observable Trends in the data

1) One observable trend from this data is that  majority of men are video game players, almost 85%, but women actually pay more on average per game. Additionally, non-closed genders pay even more than male or female players on average.

2) The most profitable items don't always equal the most popular items. While half of them overlap, there are more profitable games that are not the most popular. It would be interesting to look into why those games aren't more popular and how to change it to appeal more to audiences.

In [514]:
#import pandas
import pandas as pd

In [515]:
#load file "purchase_data_copy.csv"
load_file = "purchase_data_copy.csv"

#read csv file into panda data frame
purchase_data_df = pd.read_csv(load_file)
purchase_data_df.head()

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44


In [516]:
#breakdown of data with duplicates
purchase_data_df.describe()

Unnamed: 0,Purchase ID,Age,Item ID,Price
count,780.0,780.0,780.0,780.0
mean,389.5,22.714103,91.755128,3.050987
std,225.310896,6.659444,52.697702,1.169549
min,0.0,7.0,0.0,1.0
25%,194.75,20.0,47.75,1.98
50%,389.5,22.0,92.0,3.15
75%,584.25,25.0,138.0,4.08
max,779.0,45.0,183.0,4.99


In [520]:
#Find total number of players without duplicates
drop_dup = purchase_data_df["SN"].drop_duplicates()
drop_dup

0            Lisim78
1        Lisovynya38
2         Ithergue48
3      Chamassasya86
4          Iskosia90
           ...      
773           Hala31
774       Jiskjask80
775       Aethedru70
777       Yathecal72
778          Sisur91
Name: SN, Length: 576, dtype: object

In [521]:
#create summary of total number of players
total_player_data = {
    "Total Players": [576]
}
info_summary = pd.DataFrame(total_player_data, columns=["Total Players"])
info_summary

Unnamed: 0,Total Players
0,576


In [522]:
#unique number of items
unique = purchase_data_df["Item Name"].unique()
unique



array(['Extraction, Quickblade Of Trembling Hands', 'Frenzied Scimitar',
       'Final Critic', 'Blindscythe', 'Fury', 'Dreamkiss',
       'Interrogator, Blood Blade of the Queen', 'Abyssal Shard',
       'Souleater', 'Ghastly Adamantite Protector',
       'Singed Onyx Warscythe', 'Renewed Skeletal Katana',
       "Bloodlord's Fetish", 'Bone Crushing Silver Skewer',
       'Deadline, Voice Of Subtlety', 'Second Chance', 'Devine',
       'Nirvana', 'Blazefury, Protector of Delusions',
       'Despair, Favor of Due Diligence',
       'Sun Strike, Jaws of Twisted Visions', 'Warped Fetish',
       'Severance', 'Persuasion',
       'Oathbreaker, Last Hope of the Breaking Storm', 'Demise',
       'Blood-Forged Skeletal Spine',
       'Stormbringer, Dark Blade of Ending Misery',
       'Shadow Strike, Glory of Ending Hope', 'Striker',
       'Wolf, Promise of the Moonwalker', "Faith's Scimitar",
       'Bonecarvin Battle Axe', 'Azurewrath', 'Vengeance Cleaver',
       'Haunted Bronzed Bludgeo

In [523]:
#find unique items, used duplicate to ensure number is accurate
duplicate = purchase_data_df["Item Name"].drop_duplicates()
duplicate

0      Extraction, Quickblade Of Trembling Hands
1                              Frenzied Scimitar
2                                   Final Critic
3                                    Blindscythe
4                                           Fury
                         ...                    
664                  Alpha, Reach of Ending Hope
673                                        Alpha
700                                     Betrayer
717     Winterthorn, Defender of Shifting Worlds
727                           Gladiator's Glaive
Name: Item Name, Length: 179, dtype: object

In [524]:
#avg price for purchasing total
average = purchase_data_df["Price"].mean()
average

3.050987179487176

In [525]:
#number of purchase for purchasing total
number_of = purchase_data_df["Purchase ID"].value_counts()
print(purchase_data_df["Purchase ID"].count())

780


In [526]:
#total revenue for purchasing total
total = purchase_data_df["Price"].sum()
total

2379.77

In [527]:
#summary of purchasing total
summary_data = {
    "Total Players": [576],
    "Number Of Unique Items": [179],
    "Average Price": [3.05],
    "Number Of Purchase": [780],
    "Total Revenue": [2379.77]
}
info_summary = pd.DataFrame(summary_data, columns=["Total Players", "Number Of Unique Items", "Average Price", "Number Of Purchase", "Total Revenue"])
info_summary["Average Price"] = info_summary["Average Price"].map("${:,.2f}".format)
info_summary["Total Revenue"] = info_summary["Total Revenue"].map("${:,.2f}".format)
info_summary


Unnamed: 0,Total Players,Number Of Unique Items,Average Price,Number Of Purchase,Total Revenue
0,576,179,$3.05,780,"$2,379.77"


In [528]:
#gender demographics breakdown of total count
unique_players = purchase_data_df.loc[:,["Gender", "SN"]].drop_duplicates()
gender_df = unique_players["Gender"].value_counts()
gender_df

Male                     484
Female                    81
Other / Non-Disclosed     11
Name: Gender, dtype: int64

In [529]:
#find percentages of total players by gender
unique_players = purchase_data_df.loc[:,["Gender", "SN"]].drop_duplicates()
gender_df = unique_players["Gender"].value_counts()
gender_df

percent = (gender_df / 576) 
percent

Male                     0.840278
Female                   0.140625
Other / Non-Disclosed    0.019097
Name: Gender, dtype: float64

In [530]:
#Create gender summary table with total count and percentage for gender

gender_breakdown_df = pd.DataFrame({"Total Count": gender_df, "Percentage": percent})
gender_breakdown_df["Percentage"] = gender_breakdown_df["Percentage"].map("{:,.2%}".format)
gender_breakdown_df


Unnamed: 0,Total Count,Percentage
Male,484,84.03%
Female,81,14.06%
Other / Non-Disclosed,11,1.91%


In [531]:
#purchasing analysis by gender breakdown gender by loc and getting rid of duplicates
analysis_gender = purchase_data_df.loc[:,["Gender", "Purchase ID"]].drop_duplicates()
analysis_df = analysis_gender["Gender"].value_counts()
analysis_df

Male                     652
Female                   113
Other / Non-Disclosed     15
Name: Gender, dtype: int64

In [532]:
#average purchase price per gender

analysis_gender = purchase_data_df.loc[:,["Gender", "Price"]]
analysis_gender

men = analysis_gender.loc[analysis_gender["Gender"] == "Male", :]
print(men)

average_men = men["Price"].mean()
average_men 

female = analysis_gender.loc[analysis_gender["Gender"] == "Female", :]
print(female)

average_female = female["Price"].mean()
average_female 

other = analysis_gender.loc[analysis_gender["Gender"] == "Other / Non-Disclosed", :]
print(other)

average_other = other["Price"].mean()
average_other

    Gender  Price
0     Male   3.53
1     Male   1.56
2     Male   4.88
3     Male   3.27
4     Male   1.44
..     ...    ...
774   Male   4.19
776   Male   1.63
777   Male   3.46
778   Male   4.19
779   Male   4.60

[652 rows x 2 columns]
     Gender  Price
15   Female   2.89
18   Female   4.90
38   Female   4.18
41   Female   1.33
55   Female   3.79
..      ...    ...
731  Female   1.02
740  Female   3.92
754  Female   4.05
767  Female   4.88
775  Female   3.54

[113 rows x 2 columns]
                    Gender  Price
9    Other / Non-Disclosed   3.58
22   Other / Non-Disclosed   3.81
82   Other / Non-Disclosed   4.40
111  Other / Non-Disclosed   4.75
228  Other / Non-Disclosed   3.39
237  Other / Non-Disclosed   3.55
242  Other / Non-Disclosed   3.94
291  Other / Non-Disclosed   3.45
350  Other / Non-Disclosed   2.22
401  Other / Non-Disclosed   1.33
484  Other / Non-Disclosed   3.94
549  Other / Non-Disclosed   3.10
629  Other / Non-Disclosed   2.18
637  Other / Non-Disclosed   3.4

3.3460000000000005

In [533]:
#total purchase value by gender
total_gender = purchase_data_df.loc[:,["Gender", "Price"]]
analysis_gender

men_sum = total_gender.loc[analysis_gender["Gender"] == "Male", :]
print(men_sum)

sum_men = men["Price"].sum()
sum_men

female_sum = total_gender.loc[analysis_gender["Gender"] == "Female", :]
print(female_sum)

sum_female = female["Price"].sum()
sum_female

other_sum = total_gender.loc[analysis_gender["Gender"] == "Other / Non-Disclosed", :]
print(female_sum)

sum_other = other["Price"].sum()
sum_other


    Gender  Price
0     Male   3.53
1     Male   1.56
2     Male   4.88
3     Male   3.27
4     Male   1.44
..     ...    ...
774   Male   4.19
776   Male   1.63
777   Male   3.46
778   Male   4.19
779   Male   4.60

[652 rows x 2 columns]
     Gender  Price
15   Female   2.89
18   Female   4.90
38   Female   4.18
41   Female   1.33
55   Female   3.79
..      ...    ...
731  Female   1.02
740  Female   3.92
754  Female   4.05
767  Female   4.88
775  Female   3.54

[113 rows x 2 columns]
     Gender  Price
15   Female   2.89
18   Female   4.90
38   Female   4.18
41   Female   1.33
55   Female   3.79
..      ...    ...
731  Female   1.02
740  Female   3.92
754  Female   4.05
767  Female   4.88
775  Female   3.54

[113 rows x 2 columns]


50.19

In [209]:
#average total purchase per person



In [412]:
#gender summary table with purchase counts, avg purchase price, total purchase value, and avg total purchase per person
gender_summary_data = {
    "Purchase Count": [113, 652, 15],
    "Average Purchase Price": [3.20, 3.02, 3.35],
    "Total Purchase Value": [361.94, 1967.64, 50.19],
    "Avg Total Purchase Per Person": [100, 100, 100]
}
gender_breakdown = pd.DataFrame(gender_summary_data, columns=["Purchase Count", "Average Purchase Price", "Total Purchase Value", "Avg Total Purchase Per Person"])
gender_breakdown

# gender_summary_data = pd.DataFrame({"Purchase Count": analysis_df, "Average Purchase Price": men, female, other "Total Purchase Value":sum_men, female_sum, sum_other})
# #gender_breakdown_df["Percentage"] = gender_breakdown_df["Percentage"].map("{:,.2%}".format)
# gender_summary_data

Unnamed: 0,Purchase Count,Average Purchase Price,Total Purchase Value,Avg Total Purchase Per Person
0,113,3.2,361.94,100
1,652,3.02,1967.64,100
2,15,3.35,50.19,100


In [562]:
#establish bins for ages
bins = [0, 10, 15, 20, 25, 30, 35, 40, 45]

group_labels = ["<10", "10-14", "15-19", "20-24", "25-29", "30-34", 
                "35-39", "40+"]

In [563]:
#pd.cut for bins
pd.cut(purchase_data_df["Age"], bins, labels=group_labels).head()

0    15-19
1    35-39
2    20-24
3    20-24
4    20-24
Name: Age, dtype: category
Categories (8, object): [<10 < 10-14 < 15-19 < 20-24 < 25-29 < 30-34 < 35-39 < 40+]

In [564]:
#group age for bins
age_group = purchase_data_df.groupby("Age")

In [565]:
#total count broken down by age
total_count_age = age_group["SN"].nunique()
percent_age = (total_count_age / 780)
percent_age

Age
7     0.008974
8     0.007692
9     0.005128
10    0.008974
11    0.007692
12    0.005128
13    0.003846
14    0.002564
15    0.033333
16    0.030769
17    0.024359
18    0.026923
19    0.021795
20    0.088462
21    0.055128
22    0.062821
23    0.062821
24    0.061538
25    0.055128
26    0.014103
27    0.011538
28    0.005128
29    0.012821
30    0.032051
31    0.006410
32    0.007692
33    0.011538
34    0.008974
35    0.012821
36    0.006410
37    0.006410
38    0.006410
39    0.007692
40    0.006410
41    0.002564
42    0.001282
43    0.001282
44    0.002564
45    0.001282
Name: SN, dtype: float64

In [566]:
#summary table total count and percentage of players broken down by age
age_summary_table = pd.DataFrame({"Total Count": total_count_age, "Percentage": percent_age})
age_summary_table["Percentage"] = age_summary_table["Percentage"].map("{:,.2%}".format)
age_summary_table

Unnamed: 0_level_0,Total Count,Percentage
Age,Unnamed: 1_level_1,Unnamed: 2_level_1
7,7,0.90%
8,6,0.77%
9,4,0.51%
10,7,0.90%
11,6,0.77%
12,4,0.51%
13,3,0.38%
14,2,0.26%
15,26,3.33%
16,24,3.08%


In [567]:
#purchase analaysis purchase count data
age_average = purchase_data_df.groupby("Age")
age_group[["Purchase ID"]].count()

Unnamed: 0_level_0,Purchase ID
Age,Unnamed: 1_level_1
7,9
8,8
9,6
10,9
11,7
12,6
13,4
14,2
15,35
16,30


In [568]:
#purchase analaysis average purchase price data
total_purchase_value = purchase_data_df.groupby("Age")
age_group[["Price"]].mean()

Unnamed: 0_level_0,Price
Age,Unnamed: 1_level_1
7,3.654444
8,3.24625
9,3.045
10,3.536667
11,2.684286
12,2.633333
13,2.3625
14,3.455
15,3.018571
16,3.018667


In [569]:
#purchase analysis total purchase value
total_purchase_value = purchase_data_df.groupby("Age")
age_group[["Price"]].sum()

Unnamed: 0_level_0,Price
Age,Unnamed: 1_level_1
7,32.89
8,25.97
9,18.27
10,31.83
11,18.79
12,15.8
13,9.45
14,6.91
15,105.65
16,90.56


In [None]:
#average total purchase per person
avg_total_purchase = 

In [570]:
#age analysis table
age_data_table = pd.DataFrame({"Purchase Cost": age_group})
age_data_table

ValueError: If using all scalar values, you must pass an index

In [579]:
#check original data set
purchase_data_df.head()

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44


In [580]:
#focus on SN, Price, Purchase ID
top_spender_eliminate = purchase_data_df[["SN", "Price", "Purchase ID"]]
top_spender_eliminate

Unnamed: 0,SN,Price,Purchase ID
0,Lisim78,3.53,0
1,Lisovynya38,1.56,1
2,Ithergue48,4.88,2
3,Chamassasya86,3.27,3
4,Iskosia90,1.44,4
...,...,...,...
775,Aethedru70,3.54,775
776,Iral74,1.63,776
777,Yathecal72,3.46,777
778,Sisur91,4.19,778


In [581]:
#group by SN for top spenders
spender_groups = purchase_data_df.groupby(["SN"])
spender_groups


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x133a386d0>

In [582]:
#group top spenders by total purchase count
spender_comparison = spender_groups["Purchase ID"].count()
spender_comparison

SN
Adairialis76     1
Adastirin33      1
Aeda94           1
Aela59           1
Aelaria33        1
                ..
Yathecal82       3
Yathedeu43       2
Yoishirrala98    1
Zhisrisu83       2
Zontibe81        3
Name: Purchase ID, Length: 576, dtype: int64

In [583]:
#group top spenders by average purchase price
spender_comparison_avg = spender_groups["Price"].mean()
spender_comparison_avg

SN
Adairialis76     2.280000
Adastirin33      4.480000
Aeda94           4.910000
Aela59           4.320000
Aelaria33        1.790000
                   ...   
Yathecal82       2.073333
Yathedeu43       3.010000
Yoishirrala98    4.580000
Zhisrisu83       3.945000
Zontibe81        2.676667
Name: Price, Length: 576, dtype: float64

In [584]:
#group top spenders by total purchase value
spender_comparison_total = spender_groups["Price"].sum()
spender_comparison_total

SN
Adairialis76     2.28
Adastirin33      4.48
Aeda94           4.91
Aela59           4.32
Aelaria33        1.79
                 ... 
Yathecal82       6.22
Yathedeu43       6.02
Yoishirrala98    4.58
Zhisrisu83       7.89
Zontibe81        8.03
Name: Price, Length: 576, dtype: float64

In [585]:
#create top spenders summary table and sorting by top spenders


top_spender_summary = pd.DataFrame({"Purchase Count": spender_comparison, "Average Purchase Price": spender_comparison_avg, "Total Purchase Value": spender_comparison_total})
top_spender_summary["Average Purchase Price"] = top_spender_summary["Average Purchase Price"].map("${:,.2f}".format)
top_spender_summary["Total Purchase Value"] = top_spender_summary["Total Purchase Value"].map("${:,.2f}".format)
top_spender_summary

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adairialis76,1,$2.28,$2.28
Adastirin33,1,$4.48,$4.48
Aeda94,1,$4.91,$4.91
Aela59,1,$4.32,$4.32
Aelaria33,1,$1.79,$1.79
...,...,...,...
Yathecal82,3,$2.07,$6.22
Yathedeu43,2,$3.01,$6.02
Yoishirrala98,1,$4.58,$4.58
Zhisrisu83,2,$3.94,$7.89


In [586]:
#sorted top spender summary
organized_spenders = top_spender_summary.sort_values(["Total Purchase Value"], ascending=False).head()
organized_spenders

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Haillyrgue51,3,$3.17,$9.50
Phistym51,2,$4.75,$9.50
Lamil79,2,$4.64,$9.29
Aina42,3,$3.07,$9.22
Saesrideu94,2,$4.59,$9.18


In [603]:
#most popular items, eliminate columns for pop items, break down by item ID, item name, purchase count, item price, total purchase value
pop_items = purchase_data_df[["Item ID", "Item Name", "Price"]]
pop_items

Unnamed: 0,Item ID,Item Name,Price
0,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,143,Frenzied Scimitar,1.56
2,92,Final Critic,4.88
3,100,Blindscythe,3.27
4,131,Fury,1.44
...,...,...,...
775,60,Wolf,3.54
776,164,Exiled Doomblade,1.63
777,67,"Celeste, Incarnation of the Corrupted",3.46
778,92,Final Critic,4.19


In [604]:
#retrieve item ID, Item Name, Item price and group them together for calculations
pop_item_groups = pop_items.groupby(["Item ID", "Item Name"])
pop_item_groups.head()

Unnamed: 0,Item ID,Item Name,Price
0,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,143,Frenzied Scimitar,1.56
2,92,Final Critic,4.88
3,100,Blindscythe,3.27
4,131,Fury,1.44
...,...,...,...
764,113,Solitude's Reaver,4.07
765,130,Alpha,2.07
766,58,"Freak's Bite, Favor of Holy Might",4.14
777,67,"Celeste, Incarnation of the Corrupted",3.46


In [605]:
#most popular items for purchase count
pop_item_total = pop_item_groups["Price"].count()
pop_item_total

Item ID  Item Name                                   
0        Splinter                                         4
1        Crucifer                                         4
2        Verdict                                          6
3        Phantomlight                                     6
4        Bloodlord's Fetish                               5
                                                         ..
178      Oathbreaker, Last Hope of the Breaking Storm    12
179      Wolf, Promise of the Moonwalker                  6
181      Reaper's Toll                                    5
182      Toothpick                                        3
183      Dragon's Greatsword                              3
Name: Price, Length: 179, dtype: int64

In [606]:
#most popular items total purchase value
sum_pop_item = pop_item_groups["Price"].sum()
sum_pop_item


Item ID  Item Name                                   
0        Splinter                                         5.12
1        Crucifer                                        11.77
2        Verdict                                         14.88
3        Phantomlight                                    14.94
4        Bloodlord's Fetish                               8.50
                                                         ...  
178      Oathbreaker, Last Hope of the Breaking Storm    50.76
179      Wolf, Promise of the Moonwalker                 26.88
181      Reaper's Toll                                    8.30
182      Toothpick                                       12.09
183      Dragon's Greatsword                              3.27
Name: Price, Length: 179, dtype: float64

In [607]:
#most popular items item price
pop_item_total_purchase = sum_pop_item / pop_item_total
pop_item_total_purchase.head()

Item ID  Item Name         
0        Splinter              1.2800
1        Crucifer              2.9425
2        Verdict               2.4800
3        Phantomlight          2.4900
4        Bloodlord's Fetish    1.7000
Name: Price, dtype: float64

In [608]:
#pop item summary table

pop_item_summary = pd.DataFrame({"Purchase Count": pop_item_total, "Item Price": pop_item_total_purchase, "Total Purchase Value": sum_pop_item})
pop_item_summary["Item Price"] = pop_item_summary["Item Price"].map("${:,.2f}".format)
pop_item_summary["Total Purchase Value"] = pop_item_summary["Total Purchase Value"].map("${:,.2f}".format)
pop_item_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,Purchase Count,Item Price,Total Purchase Value
Item ID,Item Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Splinter,4,$1.28,$5.12
1,Crucifer,4,$2.94,$11.77
2,Verdict,6,$2.48,$14.88
3,Phantomlight,6,$2.49,$14.94
4,Bloodlord's Fetish,5,$1.70,$8.50
...,...,...,...,...
178,"Oathbreaker, Last Hope of the Breaking Storm",12,$4.23,$50.76
179,"Wolf, Promise of the Moonwalker",6,$4.48,$26.88
181,Reaper's Toll,5,$1.66,$8.30
182,Toothpick,3,$4.03,$12.09


In [609]:
#sorted table for pop item by purchase count descending order

organized_items = pop_item_summary.sort_values(["Purchase Count"], ascending=False).head()
organized_items


Unnamed: 0_level_0,Unnamed: 1_level_0,Purchase Count,Item Price,Total Purchase Value
Item ID,Item Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
92,Final Critic,13,$4.61,$59.99
178,"Oathbreaker, Last Hope of the Breaking Storm",12,$4.23,$50.76
145,Fiery Glass Crusader,9,$4.58,$41.22
132,Persuasion,9,$3.22,$28.99
108,"Extraction, Quickblade Of Trembling Hands",9,$3.53,$31.77


In [610]:
#sort by most profitable items the above table, sort by total purchase value in descending order

profitable_items = pop_item_summary.sort_values(["Total Purchase Value"], ascending=False).head()
profitable_items

Unnamed: 0_level_0,Unnamed: 1_level_0,Purchase Count,Item Price,Total Purchase Value
Item ID,Item Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
63,Stormfury Mace,2,$4.99,$9.98
29,"Chaos, Ender of the End",5,$1.98,$9.90
173,Stormfury Longsword,2,$4.93,$9.86
38,"The Void, Vengeance of Dark Magic",4,$2.37,$9.48
143,Frenzied Scimitar,6,$1.56,$9.36
