# Data Analysis

This project analyzes App Store and Google Play data to help developers understand which types of apps are likely to attract more users.

In [1]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of a data set, optionally followed by its dimensions.

    Args:
        dataset: A list of rows, where each row is itself a list.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows
            and the number of columns in the first row.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # Extra blank lines keep consecutive rows visually apart.

    if rows_and_columns:
        n_rows = len(dataset)
        print('Number of rows:', n_rows)
        # An empty data set has no first row to measure; report 0 columns
        # instead of raising IndexError.
        print('Number of columns:', len(dataset[0]) if n_rows else 0)

In [2]:
from csv import reader
# Read both CSV exports fully into memory as lists of rows (header row first).
# The files are opened in `with` blocks so the handles are closed once the rows
# have been read, and with newline='' as the csv module documentation requires.
with open('AppleStore.csv', encoding='utf8', newline='') as ios_file:
    ios_read = reader(ios_file)
    ios_data = list(ios_read)
with open('googleplaystore.csv', encoding='utf8', newline='') as android_file:
    android_read = reader(android_file)
    android_data = list(android_read)
# Preview the Google Play data: the header plus the first four apps, then the header alone.
explore_data(android_data, 0, 5, True)
explore_data(android_data, 0, 1, True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 10842
Number of columns: 13
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type'

In [3]:
# Preview the App Store data: first the header row on its own, then the
# first four apps. The dimensions are printed after each slice.
for first_row, stop_row in ((0, 1), (1, 5)):
    explore_data(ios_data, first_row, stop_row, rows_and_columns=True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


Number of rows: 7198
Number of columns: 16
['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


Number of rows: 7198
Number of columns: 16


Row 10473 of the Google Play data set had errors and missing data, so it is deleted with `del android_data[10473]` as part of the data cleaning process. The cell below then prints the row that now occupies that index.

In [10]:
# Drop the malformed Google Play row at index 10473.
# The deletion is guarded so that re-running this cell does not remove a
# second, valid row: the row is deleted only while its length differs from
# the header's.
# NOTE(review): assumes the malformed row has a different number of columns
# than the header row -- confirm against the raw CSV.
if len(android_data[10473]) != len(android_data[0]):
    del android_data[10473]
print(android_data[10473])

['osmino Wi-Fi: free WiFi', 'TOOLS', '4.2', '134203', '4.1M', '10,000,000+', 'Free', '0', 'Everyone', 'Tools', 'August 7, 2018', '6.06.14', '4.4 and up']


The Google Play data has duplicate entries. For example, Instagram appears four times:

In [5]:
# Print every row whose app name (column 0) is exactly 'Instagram'.
for row in android_data:
    if row[0] != 'Instagram':
        continue
    print(row)

['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']


In [6]:
# Split app names into first occurrences (unique_apps) and repeats
# (duplicate_apps), preserving the order in which they appear in the data.
duplicate_apps = []
unique_apps = []
# Set mirror of unique_apps: membership checks are O(1) instead of scanning
# the whole list for every row.
seen_names = set()

for app in android_data[1:]:  # [1:] skips the header row
    name = app[0]
    if name in seen_names:
        duplicate_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)
print('Number of duplicate apps:', len(duplicate_apps))
print('\n')
print('Examples of duplicate apps:', duplicate_apps[:15])

Number of duplicate apps: 1181


Examples of duplicate apps: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack', 'FreshBooks Classic', 'Insightly CRM', 'QuickBooks Accounting: Invoicing & Expenses', 'HipChat - Chat Built for Teams', 'Xero Accounting Software']


The duplicates won't be removed at random. The duplicate entries differ in their number of reviews, so for each app we keep the entry with the most reviews, since it should be the most recent one. To do this, we first build a dictionary that maps each app name to its highest review count.

In [11]:
# Map each app name to the largest review count seen for it.
reviews_max = {}
for row in android_data[1:]:  # [1:] skips the header row
    title = row[0]
    review_count = float(row[3])
    # Record the count for a new name, or when it beats the stored maximum.
    if title not in reviews_max or reviews_max[title] < review_count:
        reviews_max[title] = review_count
print(len(reviews_max))

9659


Now that we have a dictionary with one key per app and the highest number of reviews for that app as the value, we can store the matching rows in a new data set called android_clean. Its length should equal the length of the dictionary.

In [12]:
# Keep exactly one row per app: the first row whose review count equals the
# maximum recorded for that app in reviews_max.
android_clean = []
already_added = []
# Set mirror of already_added: membership checks are O(1) instead of scanning
# the whole list for every row.
added_names = set()
for app in android_data[1:]:  # [1:] skips the header row
    name = app[0]
    n_reviews = float(app[3])
    # The name check matters when several rows tie on the maximum count.
    if n_reviews == reviews_max[name] and name not in added_names:
        android_clean.append(app)
        already_added.append(name)
        added_names.add(name)
print(len(android_clean))

9659


In [13]:
def english_check(string_check, max_non_ascii=3):
    """Return whether a string looks like English text.

    A character counts as non-ASCII when its code point is above 127. The
    string passes as long as it holds at most ``max_non_ascii`` such
    characters, so names with a few symbols or emoji are still accepted.

    Args:
        string_check: The text to inspect.
        max_non_ascii: Largest number of non-ASCII characters tolerated.
            Defaults to 3.

    Returns:
        False if the string has more than ``max_non_ascii`` non-ASCII
        characters, True otherwise.
    """
    non_ascii_count = 0
    for character in string_check:
        if ord(character) > 127:
            non_ascii_count += 1
            # Stop scanning as soon as the limit is exceeded.
            if non_ascii_count > max_non_ascii:
                return False
    return True

# Try english_check on a plain ASCII name, a mostly non-ASCII name, and two
# names that contain a single symbol or emoji.
for sample_name in (
    'Instagram',
    '爱奇艺PPS -《欢乐颂2》电视剧热播',
    'Docs To Go™ Free Office Suite',
    'Instachat 😜',
):
    print(english_check(sample_name))

True
False
True
True


In [36]:
# Keep only the apps whose name passes english_check. The name is column 0
# in the Google Play rows and column 1 (track_name) in the App Store rows.
android_language_check = [row for row in android_clean if english_check(row[0])]
ios_language_check = [row for row in ios_data[1:] if english_check(row[1])]

# Show the first remaining row and the dimensions of each filtered data set.
for filtered_rows in (android_language_check, ios_language_check):
    explore_data(filtered_rows, 0, 1, True)
    

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


Number of rows: 9614
Number of columns: 13
['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


Number of rows: 6183
Number of columns: 16


In [41]:
# Keep only the free apps. The price is stored as text: column 7 ('Price')
# is '0' for free Google Play apps and column 4 ('price') is '0.0' for free
# App Store apps.
android_clean_data = [row for row in android_language_check if row[7] == '0']
ios_clean_data = [row for row in ios_language_check if row[4] == '0.0']

for free_apps in (android_clean_data, ios_clean_data):
    print(len(free_apps))

8864
3222


As we mentioned in the introduction, our aim is to determine the kinds of apps that are likely to attract more users because our revenue is highly influenced by the number of people using our apps.

To minimize risks and overhead, our validation strategy for an app idea is comprised of three steps:

1. Build a minimal Android version of the app, and add it to Google Play.

2. If the app has a good response from users, we then develop it further.

3. If the app is profitable after six months, we also build an iOS version of the app and add it to the App Store.

The end goal is to add the apps on both Google Play and the App Store, so it's necessary to find app profiles that are successful on both markets. An app that works well in both markets is a productive app.


In [56]:
def freq_table(dataset, index):
    """Build a percentage frequency table for one column of a data set.

    Args:
        dataset: An iterable of rows, each indexable by ``index``.
        index: Position of the column whose values are tallied.

    Returns:
        A dict mapping each distinct value of the column to the percentage
        of rows that hold it, in order of first appearance. An empty data
        set yields an empty dict.
    """
    tallies = {}
    row_total = 0
    for row in dataset:
        row_total += 1
        value = row[index]
        tallies[value] = tallies.get(value, 0) + 1
    # Turn the raw counts into percentages of all rows.
    return {
        value: (count / row_total) * 100
        for value, count in tallies.items()
    }

def display_table(dataset, index):
    """Print the frequency table of one column, largest percentage first.

    Args:
        dataset: An iterable of rows, passed through to ``freq_table``.
        index: Position of the column to summarise.
    """
    percentages = freq_table(dataset, index)
    # Sort (percentage, value) pairs in descending order; equal percentages
    # are ordered by value, also descending.
    ranked = sorted(
        ((share, value) for value, share in percentages.items()),
        reverse=True,
    )
    for share, value in ranked:
        print(value, ':', share)

In [58]:
# Percentage of apps per prime_genre (column -5 of the App Store rows).
display_table(ios_clean_data, -5)

Games : 58.16263190564867
Entertainment : 7.883302296710118
Photo & Video : 4.9658597144630665
Education : 3.662321539416512
Social Networking : 3.2898820608317814
Shopping : 2.60707635009311
Utilities : 2.5139664804469275
Sports : 2.1415270018621975
Music : 2.0484171322160147
Health & Fitness : 2.0173805090006205
Productivity : 1.7380509000620732
Lifestyle : 1.5828677839851024
News : 1.3345747982619491
Travel : 1.2414649286157666
Finance : 1.1173184357541899
Weather : 0.8690254500310366
Food & Drink : 0.8069522036002483
Reference : 0.5586592178770949
Business : 0.5276225946617008
Book : 0.4345127250155183
Navigation : 0.186219739292365
Medical : 0.186219739292365
Catalogs : 0.12414649286157665


We can see that the most common genre is 'Games', with 58.16% of the apps, followed by 'Entertainment' with 7.88%. Most of the apps are designed for entertainment purposes, while apps for practical purposes are rare. This doesn't imply that fun apps also have the largest number of users: the number of apps on offer in a genre is not necessarily the same as the demand for it.

In [60]:
display_table(android_clean_data, 1) # percentage of apps per Category (column 1)

FAMILY : 18.907942238267147
GAME : 9.724729241877256
TOOLS : 8.461191335740072
BUSINESS : 4.591606498194946
LIFESTYLE : 3.9034296028880866
PRODUCTIVITY : 3.892148014440433
FINANCE : 3.7003610108303246
MEDICAL : 3.531137184115524
SPORTS : 3.395758122743682
PERSONALIZATION : 3.3167870036101084
COMMUNICATION : 3.2378158844765346
HEALTH_AND_FITNESS : 3.0798736462093865
PHOTOGRAPHY : 2.944494584837545
NEWS_AND_MAGAZINES : 2.7978339350180503
SOCIAL : 2.6624548736462095
TRAVEL_AND_LOCAL : 2.33528880866426
SHOPPING : 2.2450361010830324
BOOKS_AND_REFERENCE : 2.1435018050541514
DATING : 1.861462093862816
VIDEO_PLAYERS : 1.7937725631768955
MAPS_AND_NAVIGATION : 1.3989169675090252
FOOD_AND_DRINK : 1.2409747292418771
EDUCATION : 1.1620036101083033
ENTERTAINMENT : 0.9589350180505415
LIBRARIES_AND_DEMO : 0.9363718411552346
AUTO_AND_VEHICLES : 0.9250902527075812
HOUSE_AND_HOME : 0.8235559566787004
WEATHER : 0.8009927797833934
EVENTS : 0.7107400722021661
PARENTING : 0.6543321299638989
ART_AND_DESIGN : 

In [61]:
display_table(android_clean_data, 9) # percentage of apps per Genres value (column 9)

Tools : 8.449909747292418
Entertainment : 6.069494584837545
Education : 5.347472924187725
Business : 4.591606498194946
Productivity : 3.892148014440433
Lifestyle : 3.892148014440433
Finance : 3.7003610108303246
Medical : 3.531137184115524
Sports : 3.463447653429603
Personalization : 3.3167870036101084
Communication : 3.2378158844765346
Action : 3.1024368231046933
Health & Fitness : 3.0798736462093865
Photography : 2.944494584837545
News & Magazines : 2.7978339350180503
Social : 2.6624548736462095
Travel & Local : 2.3240072202166067
Shopping : 2.2450361010830324
Books & Reference : 2.1435018050541514
Simulation : 2.0419675090252705
Dating : 1.861462093862816
Arcade : 1.8501805054151623
Video Players & Editors : 1.7712093862815883
Casual : 1.7599277978339352
Maps & Navigation : 1.3989169675090252
Food & Drink : 1.2409747292418771
Puzzle : 1.128158844765343
Racing : 0.9927797833935018
Role Playing : 0.9363718411552346
Libraries & Demo : 0.9363718411552346
Auto & Vehicles : 0.9250902527075

The most common category in Google Play is FAMILY, with 18.91% of the apps. Compared with the App Store, Google Play has more practical apps and more variety within the store.

There is not a big difference between genres and categories; the main difference is that the genres table is more granular. We are looking for the big picture at the moment, so it's better to focus on categories only.

In [84]:
# For every prime_genre, print the mean of rating_count_tot (column 5)
# over the apps of that genre.
prime_genre_freq = freq_table(ios_clean_data, -5)

for genre in prime_genre_freq:
    genre_rows = [row for row in ios_clean_data if row[-5] == genre]
    total = 0
    for row in genre_rows:
        total += float(row[5])
    avg_user_ratings = total / len(genre_rows)
    print(genre, ':', str(avg_user_ratings))


Travel : 28243.8
Utilities : 18684.456790123455
Food & Drink : 33333.92307692308
Shopping : 26919.690476190477
Reference : 74942.11111111111
Games : 22788.6696905016
Health & Fitness : 23298.015384615384
Book : 39758.5
Medical : 612.0
Catalogs : 4004.0
Photo & Video : 28441.54375
Sports : 23008.898550724636
Business : 7491.117647058823
Productivity : 21028.410714285714
Navigation : 86090.33333333333
News : 21248.023255813954
Finance : 31467.944444444445
Weather : 52279.892857142855
Lifestyle : 16485.764705882353
Entertainment : 14029.830708661417
Music : 57326.530303030304
Social Networking : 71548.34905660378
Education : 7003.983050847458


On average, navigation apps have the highest number of user ratings (rating_count_tot) in the App Store.

In [126]:
# For every Category, print the mean number of installs over its apps.
android_category = freq_table(android_clean_data, 1)
for category in android_category:
    category_rows = [row for row in android_clean_data if row[1] == category]
    total = 0
    for row in category_rows:
        # Installs are text such as '10,000+': drop the thousands separators
        # and the trailing plus sign before converting to a number.
        installs_text = row[5].replace(',', '').replace('+', '')
        total += float(installs_text)
    avg_num_installs = total / len(category_rows)
    print(category, '', avg_num_installs)

MEDICAL  120550.61980830671
WEATHER  5074486.197183099
HEALTH_AND_FITNESS  4188821.9853479853
FOOD_AND_DRINK  1924897.7363636363
VIDEO_PLAYERS  24727872.452830188
PARENTING  542603.6206896552
LIBRARIES_AND_DEMO  638503.734939759
ART_AND_DESIGN  1986335.0877192982
MAPS_AND_NAVIGATION  4056941.7741935486
GAME  15588015.603248259
NEWS_AND_MAGAZINES  9549178.467741935
COMICS  817657.2727272727
BUSINESS  1712290.1474201474
COMMUNICATION  38456119.167247385
PHOTOGRAPHY  17840110.40229885
TRAVEL_AND_LOCAL  13984077.710144928
BEAUTY  513151.88679245283
SOCIAL  23253652.127118643
HOUSE_AND_HOME  1331540.5616438356
EDUCATION  1833495.145631068
FINANCE  1387692.475609756
ENTERTAINMENT  11640705.88235294
FAMILY  3695641.8198090694
SPORTS  3638640.1428571427
SHOPPING  7036877.311557789
DATING  854028.8303030303
AUTO_AND_VEHICLES  647317.8170731707
PERSONALIZATION  5201482.6122448975
PRODUCTIVITY  16787331.344927534
BOOKS_AND_REFERENCE  8767811.894736841
TOOLS  10801391.298666667
LIFESTYLE  1437816.

On average, communication apps have the most installations in Google Play (about 38.5 million per app), followed by video player apps (about 24.7 million) and social apps (about 23.3 million).