# Profitable App Profiles

This project will help our developers understand what types of apps are likely to attract more users on Google Play and the App Store. We build Android and iOS mobile apps that are free to download and install, and our main source of revenue is in-app purchases. The analysis is written in Python.

In [29]:
from csv import reader

# Load both store exports into lists of rows.
# `with` closes each file as soon as its rows have been read. `newline=''` is
# what the csv module asks for so quoted fields with embedded newlines parse
# correctly. The explicit encoding avoids depending on the platform default
# (assumes both CSV exports are UTF-8 encoded -- confirm against the files).
with open('AppleStore.csv', encoding='utf8', newline='') as app:
    apple_read = reader(app)
    apple = list(apple_read)
apple_head = apple[0]   # header row
apple_data = apple[1:]  # every row after the header

with open('googleplaystore.csv', encoding='utf8', newline='') as goo:
    goo_read = reader(goo)
    google = list(goo_read)
google_head = google[0]   # header row
google_data = google[1:]  # every row after the header

In [30]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of ``dataset`` and, optionally, its dimensions.

    Args:
        dataset: A sequence of rows (each row itself a sequence).
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows
            and the number of columns, measured on the first row.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # separates consecutive rows with blank lines

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure; report 0 columns
        # rather than raising IndexError.
        print('Number of columns:', len(dataset[0]) if len(dataset) else 0)

In [31]:
# Preview the first three rows of each store's data together with its size.
explore_data(apple_data, 0, 3, rows_and_columns=True)
explore_data(google_data, 0, 3, rows_and_columns=True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 7197
Number of columns: 16
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2

In [32]:
# Show the column names of both datasets, one header per line.
print(apple_head, google_head, sep='\n')

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


In [33]:
# Report the row count before and after removing the entry at index 10472.
# NOTE(review): this is positional, so re-running the cell removes another row.
print(len(google_data))
google_data.pop(10472)
print(len(google_data))

10841
10840


### This dataset contains duplicate entries because it is updated periodically. We can tell by checking the ratings total: it changes incrementally over time as more people rate the app after installing it. We can keep the most up-to-date entry for each app by choosing the one with the highest number of ratings.

In [34]:
# Split the app names into first occurrences and repeats.
duplicate_apps = []
unique_apps = []
# Mirrors unique_apps. A set gives constant-time membership tests, whereas
# `name in unique_apps` rescans the list for every row.
seen_names = set()

for app in google_data:
    name = app[0]  # column 0 is the app name
    if name in seen_names:
        duplicate_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)

print('Number of duplicate apps:', len(duplicate_apps))
print('\n')
print('Examples of duplicate apps:', duplicate_apps[:15])

Number of duplicate apps: 1181


Examples of duplicate apps: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack', 'FreshBooks Classic', 'Insightly CRM', 'QuickBooks Accounting: Invoicing & Expenses', 'HipChat - Chat Built for Teams', 'Xero Accounting Software']


In [35]:
# Print every row recorded for one of the duplicated apps.
for app in google_data:
    if app[0] != 'Insightly CRM':
        continue
    print(app)

['Insightly CRM', 'BUSINESS', '3.8', '1383', '51M', '100,000+', 'Free', '0', 'Everyone', 'Business', 'July 12, 2018', '3.24.1', '5.0 and up']
['Insightly CRM', 'BUSINESS', '3.8', '1383', '51M', '100,000+', 'Free', '0', 'Everyone', 'Business', 'July 12, 2018', '3.24.1', '5.0 and up']


In [36]:
# Map each app name to the largest review count seen across its rows.
reviews_max = {}

for app in google_data:
    name, n_reviews = app[0], float(app[3])
    best_so_far = reviews_max.get(name)
    # Record the first sighting, or replace a strictly smaller count.
    if best_so_far is None or best_so_far < n_reviews:
        reviews_max[name] = n_reviews

print(len(reviews_max))

9659


In [37]:
# Keep exactly one row per app: the first row whose review count equals the
# maximum recorded for that app in reviews_max.
google_clean = []
# A set (instead of a list) keeps the "already kept?" check constant-time.
# The check is still required because duplicated rows can share the same
# maximum review count.
already_added = set()

for app in google_data:
    name = app[0]
    n_reviews = float(app[3])
    if n_reviews == reviews_max[name] and name not in already_added:
        google_clean.append(app)
        already_added.add(name)

explore_data(google_clean, 0, 3, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9659
Number of columns: 13


## Check function for English characters

In [38]:
def english_word(name):
    """Return True only if every character of ``name`` has a code point <= 127."""
    return all(ord(char) <= 127 for char in name)
            
# Try the check on one plain name and three names with non-ASCII characters.
for sample in (
    'Instagram',
    '爱奇艺PPS -《欢乐颂2》电视剧热播',
    'Docs To Go™ Free Office Suite',
    'Instachat 😜',
):
    print(english_word(sample))

True
False
False
False


In [39]:
def english_word(name, max_non_ascii=3):
    """Return whether ``name`` looks English, tolerating a few odd characters.

    A character counts as non-ASCII when its code point is above 127. Names
    with a trademark sign or an emoji are therefore still accepted.

    Args:
        name: The string to inspect.
        max_non_ascii: Largest number of non-ASCII characters still accepted.
            Defaults to 3.

    Returns:
        False once more than ``max_non_ascii`` characters are non-ASCII,
        True otherwise.
    """
    non_ascii = 0
    for char in name:
        if ord(char) > 127:
            non_ascii += 1
            # The answer cannot change after the limit is exceeded.
            if non_ascii > max_non_ascii:
                return False
    return True
            
# Re-run the same four sample names through the redefined check.
sample_names = [
    'Instagram',
    '爱奇艺PPS -《欢乐颂2》电视剧热播',
    'Docs To Go™ Free Office Suite',
    'Instachat 😜',
]
for sample in sample_names:
    print(english_word(sample))

True
False
True
True


In [40]:
# Keep only the apps whose name passes english_word().
# The name is column 0 in the Google data and column 1 in the Apple data.
google_english = [entry for entry in google_clean if english_word(entry[0])]
apple_english = [entry for entry in apple_data if english_word(entry[1])]

explore_data(apple_english, 0, 3, True)
explore_data(google_english, 0, 3, True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 6183
Number of columns: 16
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies

In [47]:
# Keep only the free apps. The price is a string in both datasets:
# column 4 ('price') in the Apple data and column 7 ('Price') in the Google data.
free_apple = [row for row in apple_english if row[4] == '0.0']
free_google = [row for row in google_english if row[7] == '0']

print(len(free_apple))
print(len(free_google))

3222
8864


### We want to build a Google Play app first to make sure it is successful on one store. If it is, we can expand production and build an iOS version, which in theory doubles our revenue by taking advantage of both stores.

In [53]:
# for x in free_google:
#     print(x[9])

In [84]:
def freq_table(dataset, index):
    """Return the percentage of rows that hold each value in column ``index``.

    Args:
        dataset: An iterable of rows.
        index: Position of the column to tabulate.

    Returns:
        A dict mapping each distinct value to its share of the rows, as a
        percentage. Keys appear in order of first occurrence.
    """
    counts = {}
    for record in dataset:
        key = record[index]
        counts[key] = counts.get(key, 0) + 1

    # Every row contributed exactly one count, so the counts add up to the
    # number of rows.
    n_rows = sum(counts.values())
    return {key: (count / n_rows) * 100 for key, count in counts.items()}


def display_table(dataset, index):
    """Print the frequency table of column ``index``, largest share first.

    Each line is printed as ``value : percentage``. Entries with equal
    percentages are ordered by value, descending.
    """
    percentages = freq_table(dataset, index)
    # Sorting (percentage, value) tuples orders by percentage first.
    ranked = sorted(
        ((share, label) for label, share in percentages.items()),
        reverse=True,
    )
    for share, label in ranked:
        print(label, ':', share)
        
# category = display_table(free_google, 1)
# genres = display_table(free_google, 9)
# prime_genre = display_table(free_apple, 11)

In [101]:
# Average number of user ratings per genre for the free App Store apps.
# Column 11 is 'prime_genre' and column 5 is 'rating_count_tot', so the value
# printed is an average rating *count*, not an average star rating.
apple_table = freq_table(free_apple, 11)

# Accumulate the totals in one pass instead of rescanning the whole dataset
# once per genre.
rating_totals = {}
genre_counts = {}
for row in free_apple:
    genre = row[11]
    rating_totals[genre] = rating_totals.get(genre, 0) + float(row[5])
    genre_counts[genre] = genre_counts.get(genre, 0) + 1

# Iterate over apple_table to keep the genres in frequency-table order.
for genre in apple_table:
    avg_n_ratings = rating_totals[genre] / genre_counts[genre]
    print(genre, ':', avg_n_ratings)

Social Networking : 71548.34905660378
Photo & Video : 28441.54375
Games : 22788.6696905016
Music : 57326.530303030304
Reference : 74942.11111111111
Health & Fitness : 23298.015384615384
Weather : 52279.892857142855
Utilities : 18684.456790123455
Travel : 28243.8
Shopping : 26919.690476190477
News : 21248.023255813954
Navigation : 86090.33333333333
Lifestyle : 16485.764705882353
Entertainment : 14029.830708661417
Food & Drink : 33333.92307692308
Sports : 23008.898550724636
Book : 39758.5
Finance : 31467.944444444445
Education : 7003.983050847458
Productivity : 21028.410714285714
Business : 7491.117647058823
Catalogs : 4004.0
Medical : 612.0


In [107]:
# category = display_table(free_google, 1)
# Average number of installs per category for the free Google Play apps.
# Column 1 is 'Category' and column 5 is 'Installs'.
google_table = freq_table(free_google, 1)

# Accumulate the totals in one pass instead of rescanning the whole dataset
# once per category.
install_totals = {}
category_counts = {}
for row in free_google:
    category = row[1]
    # Installs are strings such as '10,000+'; strip the '+' and the commas
    # before converting.
    installs = float(row[5].replace('+', '').replace(',', ''))
    install_totals[category] = install_totals.get(category, 0) + installs
    category_counts[category] = category_counts.get(category, 0) + 1

# Iterate over google_table to keep the categories in frequency-table order.
for category in google_table:
    avg_install = install_totals[category] / category_counts[category]
    print(category, ':', avg_install)

ART_AND_DESIGN : 1986335.0877192982
AUTO_AND_VEHICLES : 647317.8170731707
BEAUTY : 513151.88679245283
BOOKS_AND_REFERENCE : 8767811.894736841
BUSINESS : 1712290.1474201474
COMICS : 817657.2727272727
COMMUNICATION : 38456119.167247385
DATING : 854028.8303030303
EDUCATION : 1833495.145631068
ENTERTAINMENT : 11640705.88235294
EVENTS : 253542.22222222222
FINANCE : 1387692.475609756
FOOD_AND_DRINK : 1924897.7363636363
HEALTH_AND_FITNESS : 4188821.9853479853
HOUSE_AND_HOME : 1331540.5616438356
LIBRARIES_AND_DEMO : 638503.734939759
LIFESTYLE : 1437816.2687861272
GAME : 15588015.603248259
FAMILY : 3695641.8198090694
MEDICAL : 120550.61980830671
SOCIAL : 23253652.127118643
SHOPPING : 7036877.311557789
PHOTOGRAPHY : 17840110.40229885
SPORTS : 3638640.1428571427
TRAVEL_AND_LOCAL : 13984077.710144928
TOOLS : 10801391.298666667
PERSONALIZATION : 5201482.6122448975
PRODUCTIVITY : 16787331.344927534
PARENTING : 542603.6206896552
WEATHER : 5074486.197183099
VIDEO_PLAYERS : 24727872.452830188
NEWS_AND_

## After analyzing the data, we recommend creating either a social media app or a gaming app, because those are the most popular app types and are therefore more likely to be used if the app is successful.