# Profitable App Profiles across App Store and Google Play Markets
#### The goal of the project is to gain insights on exiting app products and understand what type of apps are likely to attract more users.

#### Apple Store data:
Source of data: 

Data documentation:

In [1]:
# open apple store file
open_file = open("AppleStore.csv", encoding = 'UTF-8')
from csv import reader
read_file = reader(open_file)
apple_app = list(read_file)


In [2]:
#open Google Play file 
open_file = open("googleplaystore.csv", encoding = 'UTF-8')
from csv import reader
read_file = reader(open_file)
google_app = list(read_file)


In [3]:
def explore_data(dataset, start, end, rows_and_columns=True):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [4]:
explore_data(apple_app, 0, 3, rows_and_columns=True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 7198
Number of columns: 16


In [47]:
explore_data(google_app, 0, 3, rows_and_columns=True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13


In [6]:
# row 10473 (including header) has a missing category value, therefore we delete the row 
# del google_app[10473]
print(google_app[10472:10474])

[['Xposed Wi-Fi-Pwd', 'PERSONALIZATION', '3.5', '1042', '404k', '100,000+', 'Free', '0', 'Everyone', 'Personalization', 'August 5, 2014', '3.0.0', '4.0.3 and up'], ['osmino Wi-Fi: free WiFi', 'TOOLS', '4.2', '134203', '4.1M', '10,000,000+', 'Free', '0', 'Everyone', 'Tools', 'August 7, 2018', '6.06.14', '4.4 and up']]


In [7]:
# create a function to detect if there is any duplicate entry
def duplicate_detect (data_set, index):
    non_duplicate_data = []
    duplicate_data = []
    for row in data_set:
        app_name = row[index]
        if app_name not in non_duplicate_data:
            non_duplicate_data.append(app_name)
        else:
            duplicate_data.append(app_name)      
    return duplicate_data

In [8]:
# duplication in google_app
duplicate_data = duplicate_detect(google_app, 0)

In [9]:
print("The number of duplicate entry is ", len(duplicate_data))

The number of duplicate entry is  1181


In [10]:
print('Example of cuplicate data: ', duplicate_data[:6])

Example of cuplicate data:  ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box']


In [11]:
def duplicate_verification(data_set, name):
    result = []
    for row in google_app:
        app_name = row[0] 
        if app_name == name:
            result.append(row)
    return result

In [12]:
# We plan to remove duplicate entries by using number of reviews as criterion -- 
# We found that some of the duplicate entries have different value in the forth column, number of review. 
# Therefore, we will keep the entry for the same app with the most reviews and remove the older ones. 
# Below is a demonstration: 
print(duplicate_verification(google_app, 'Instagram'))

[['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device'], ['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device'], ['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device'], ['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']]


In [13]:
# create a function to collect the index of duplicate enties
def duplicate_remove (data_set, index1, index2):
    row_to_keep = {}
    row_to_remove = []
    for row in data_set:
        app_name = row[index1]
        review_count = int(row[index2])
        row_number = data_set.index(row)
        # print('review count ', review_count)
        if app_name not in row_to_keep:
            row_to_keep[app_name] = [review_count, row_number]
            # print('row_to_keep is ', row_to_keep)
        else:
            if review_count > row_to_keep[app_name][0]:
                # print('current review count is ', row_to_keep[app_name][0])
                row_to_remove.append(data_set[row_to_keep[app_name][1]])
                row_to_keep[app_name] = [review_count, row_number]
                # print('row to remove update', row_to_remove)
            else:
                row_to_remove.append(row)
                # print('row to remove ', row_to_remove)
    return row_to_keep

In [14]:
# test the duplicate remove function with a smaller data set 
data_set = [['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device'], ['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device'], ['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device'], ['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']]

In [15]:
# test the duplicate remove function with a smaller data set 
duplicate_remove(data_set, 0, 3)

{'Instagram': [66577446, 1]}

In [16]:
google_to_keep = duplicate_remove(google_app[1:], 0, 3)

In [17]:
len(google_to_keep)

9659

In [18]:
def deduplicate (data_set, row_to_keep):
    data_clean = []
    entered = set()
    for row in data_set:
        app_name = row[0]
        review_count = int(row[3])
        #print(row_to_keep[app_name][0])
        if app_name not in entered and review_count == int(row_to_keep[app_name][0]):
            data_clean.append(row)
            entered.add(app_name)
    return data_clean
            

In [19]:
google_clean = deduplicate(google_app[1:], google_to_keep)

In [20]:
len(google_clean)

9659

In [21]:
# check if there's any duplicate left 
duplicate_data_1 = duplicate_detect(google_clean, 0)
print(len(duplicate_data_1))

0


In [22]:
def check_char(data_set, index):
    data_clean_foreign = set()
    for row in data_set:
        char_count = 0
        for char in row[index]:
                if ord(char) > 127:
                    char_count += 1
        if char_count > 3:
            data_clean_foreign.add(row[0])
                    
    return data_clean_foreign
                

In [23]:
google_en_set = check_char(google_clean, 0)

In [24]:
apple_en_set = check_char(apple_app, 1)

In [25]:
def delete_foreign_char(data_set, index, foreign_set):
    data_clean = []
    for row in data_set:
        if row[0] not in foreign_set:
            data_clean.append(row)
    return data_clean

In [26]:
google_clean_en = delete_foreign_char(google_clean, 0, google_en_set)
len(google_clean_en)

9614

In [27]:
apple_clean_en = delete_foreign_char(apple_app, 0, apple_en_set)
len(apple_clean_en)

6184

In [28]:
# seperate free apps 
def free_app(data_sets, index, free):
    data_free = []
    data_not_free = []
    for row in data_sets:
        price = row[index]
        if price == free:
            data_free.append(row)
        else:
            data_not_free.append(row)
    return data_free
            

In [29]:
google_clean_en_free = free_app(google_clean_en, 7, '0')
len(google_clean_en_free)

8864

In [30]:
apple_clean_en_free = free_app(apple_clean_en, 4, '0.0')
len(apple_clean_en_free)

3222

#### Google Play App Profile

In [121]:
# Create a function to profile genres. 

def genre_profile(data_sets, index1, index2, index3, index4):
    genre_has = set()
    genre_count = {}
    genre_avg_review = {}
    genre_avg_rating = {}
    genre_avg_other = {}
    for row in data_sets:
        genre = row[index1]
        review_count = int(row[index2])
        if row[index3] == 'NaN':
            rating = 0
        elif row[index3] != 'NaN':
            rating = float(row[index3])
        other = row[index4].replace('+', '')
        other = other.replace(',', '')
        other = float(other)
        if genre not in genre_has:
            genre_has.add(genre)
            genre_count[genre] = 1
            genre_avg_review[genre] = review_count
            genre_avg_rating[genre] = rating 
            genre_avg_other[genre] = other
        else:
            genre_count[genre] += 1
            genre_avg_review[genre] += review_count
            genre_avg_rating[genre] += rating
            genre_avg_other[genre] += other
            
    for genre in genre_avg_review:
        genre_avg_review[genre] /= genre_count[genre]
        
    for genre in genre_avg_rating:
        genre_avg_rating[genre] /= genre_count[genre]
    
    for genre in genre_avg_other:
        genre_avg_other[genre] /= genre_count[genre]
            
    return genre_count, genre_avg_review, genre_avg_rating, genre_avg_other

In [130]:
google_genre_profile = genre_profile(google_clean_en_free, 1, 3, 2, 5)

In [99]:
def sort_helper (dictionary):
    display_table = []
    for key in dictionary:
        tuple_item = (round(dictionary[key],1), key)
        display_table.append(tuple_item)
    sorted_table = sorted(display_table, reverse = True)
#     for entry in sorted_table:
#         print(entry[1], ':', entry[0])
    return sorted_table

In [100]:
google_genre_count_sorted = sort_helper (google_genre_profile[0])
print(google_genre_count_sorted[:10])

[(1676, 'FAMILY'), (862, 'GAME'), (750, 'TOOLS'), (407, 'BUSINESS'), (346, 'LIFESTYLE'), (345, 'PRODUCTIVITY'), (328, 'FINANCE'), (313, 'MEDICAL'), (301, 'SPORTS'), (294, 'PERSONALIZATION')]


In [101]:
google_genre_review_sorted = sort_helper (google_genre_profile[1])
print(google_genre_review_sorted[:10])

[(995608.5, 'COMMUNICATION'), (965831.0, 'SOCIAL'), (683523.8, 'GAME'), (425350.1, 'VIDEO_PLAYERS'), (404081.4, 'PHOTOGRAPHY'), (305732.9, 'TOOLS'), (301752.2, 'ENTERTAINMENT'), (223887.3, 'SHOPPING'), (181122.3, 'PERSONALIZATION'), (171250.8, 'WEATHER')]


In [102]:
google_genre_rating_sorted = sort_helper (google_genre_profile[2])
print(google_genre_rating_sorted[:10])

[(4.3, 'EDUCATION'), (4.2, 'ART_AND_DESIGN'), (4.1, 'ENTERTAINMENT'), (4.0, 'PHOTOGRAPHY'), (4.0, 'GAME'), (4.0, 'COMICS'), (3.9, 'WEATHER'), (3.8, 'SHOPPING'), (3.7, 'VIDEO_PLAYERS'), (3.7, 'FAMILY')]


In [131]:
google_genre_install_sorted = sort_helper (google_genre_profile[3])
print(google_genre_install_sorted[:10])

[(38456119.2, 'COMMUNICATION'), (24727872.5, 'VIDEO_PLAYERS'), (23253652.1, 'SOCIAL'), (17840110.4, 'PHOTOGRAPHY'), (16787331.3, 'PRODUCTIVITY'), (15588015.6, 'GAME'), (13984077.7, 'TRAVEL_AND_LOCAL'), (11640705.9, 'ENTERTAINMENT'), (10801391.3, 'TOOLS'), (9549178.5, 'NEWS_AND_MAGAZINES')]


#### App Store App Profile 

In [122]:
apple_genre_profile = genre_profile(apple_clean_en_free, 11, 5, 7, 10)

In [123]:
apple_genre_count_sorted = sort_helper (apple_genre_profile[0])
print(apple_genre_count_sorted[:10])

[(1874, 'Games'), (254, 'Entertainment'), (160, 'Photo & Video'), (118, 'Education'), (106, 'Social Networking'), (84, 'Shopping'), (81, 'Utilities'), (69, 'Sports'), (66, 'Music'), (65, 'Health & Fitness')]


In [124]:
apple_genre_review_sorted = sort_helper (apple_genre_profile[1])
print(apple_genre_review_sorted[:10])

[(86090.3, 'Navigation'), (74942.1, 'Reference'), (71548.3, 'Social Networking'), (57326.5, 'Music'), (52279.9, 'Weather'), (39758.5, 'Book'), (33333.9, 'Food & Drink'), (31467.9, 'Finance'), (28441.5, 'Photo & Video'), (28243.8, 'Travel')]


In [125]:
apple_genre_rating_sorted = sort_helper (apple_genre_profile[2])
print(apple_genre_rating_sorted[:10])

[(4.1, 'Catalogs'), (4.0, 'Shopping'), (4.0, 'Productivity'), (4.0, 'Games'), (4.0, 'Business'), (3.9, 'Photo & Video'), (3.9, 'Music'), (3.8, 'Navigation'), (3.8, 'Health & Fitness'), (3.7, 'Reference')]


In [128]:
apple_genre_other_sorted = sort_helper (apple_genre_profile[3])
print(apple_genre_other_sorted[:10])

[(13.2, 'Medical'), (11.4, 'News'), (10.0, 'Social Networking'), (8.1, 'Entertainment'), (8.0, 'Lifestyle'), (7.9, 'Music'), (7.7, 'Reference'), (7.7, 'Book'), (7.3, 'Sports'), (7.2, 'Catalogs')]


In [118]:
test = apple_app[2][10].replace('+', '')
test = test.replace(',', '')
test = float(test)
type(test)
test

12.0