# Profitable App Profiles for the App Store and Google Play Markets!

The goal of this project is to analyze data to help our developers understand what types of apps are likely to attract more users.

Our company only builds apps that are free to download and install, and that are directed toward an English-speaking audience.

In [4]:
from csv import reader

### The Google Play data set ###
# Open the file in a `with` block so the handle is closed as soon as the rows
# have been read. newline='' is what the csv module asks for on its input file.
# NOTE(review): encoding='utf8' assumes the CSV is UTF-8 encoded - confirm.
with open('googleplaystore.csv', encoding='utf8', newline='') as opened_file:
    read_file = reader(opened_file)  # csv reader that yields one list of strings per row
    android = list(read_file)  # the whole file as a list of rows
android_header = android[0]  # first row: the column names
android = android[1:]  # every remaining row: one app per row

### The App Store data set ###
# Same procedure for the App Store file (same UTF-8 assumption as above).
with open('AppleStore.csv', encoding='utf8', newline='') as opened_file:
    read_file = reader(opened_file)  # csv reader that yields one list of strings per row
    ios = list(read_file)  # the whole file as a list of rows
ios_header = ios[0]  # first row: the column names
ios = ios[1:]  # every remaining row: one app per row

In [5]:
#Prints a slice of a data set so individual apps can be inspected
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows dataset[start:end], each followed by blank lines.

    When rows_and_columns is true, also print the number of rows in the
    whole dataset and the number of columns in its first row.
    """
    for record in dataset[start:end]:
        # one newline ends the row, the other two leave the gap after it
        print(record, end='\n\n\n')

    if not rows_and_columns:
        return

    print('Number of rows:', len(dataset))
    print('Number of columns:', len(dataset[0]))

In [6]:
#Show the column names (header row) of each app store's data set
#The extra newlines in `end` reproduce the blank lines between the two headers
print("Android: ", android_header, end='\n\n\n')
print("Ios: ", ios_header)

Android:  ['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


Ios:  ['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


# Cleaning the data

Removing duplicate apps, keeping those with the most reviews

First Android:

In [7]:
#Maps each app name to the 'Reviews' value (column 3) of its most-reviewed entry.
#The raw string is stored, not the parsed number, so it can still be matched
#against a row's review field with ==.
android_reviews_max = {}


def _android_review_count(raw):
    """Return a 'Reviews' field as a number so counts compare numerically.

    The fields are strings, and comparing strings is lexicographic
    ('9' > '10000'). A field that is not a number yields -1.0, so any
    parsable count wins over it and no row makes the loop crash.
    """
    try:
        return float(raw)
    except ValueError:
        return -1.0


for app in android:
    name = app[0]
    n_reviews = app[3]

    #First time this name is seen: record its review count
    if name not in android_reviews_max:
        android_reviews_max[name] = n_reviews

    #Duplicate entry with more reviews than the recorded one: keep the higher count
    elif _android_review_count(android_reviews_max[name]) < _android_review_count(n_reviews):
        android_reviews_max[name] = n_reviews

In [8]:
android_clean = []
#Names already copied into android_clean. A set makes the membership test O(1);
#scanning a list for every row made this loop quadratic.
android_already_added = set()

for app in android:
    name = app[0]
    n_reviews = app[3]

    #Skip duplicates of an app that was already kept (covers entries that tie on review count)
    if name in android_already_added:
        continue

    #Keep only the entry whose review count is the highest recorded for this name
    if android_reviews_max[name] == n_reviews:
        android_clean.append(app)
        android_already_added.add(name)

Now iOS:

In [9]:
#Two-step de-duplication of the App Store data: first record the highest review
#count seen for each app, then keep exactly one entry per app that has that count.
#Apps are keyed by 'id' (column 0) and reviews are read from 'rating_count_tot'
#(column 5). Column 3 is 'currency', so it cannot be used as a review count.
def _ios_review_count(app):
    """Return the app's 'rating_count_tot' field as a number.

    Parsing the field makes the comparison numeric instead of lexicographic.
    A field that is not a number yields -1.0 so the loops never crash on it.
    """
    try:
        return float(app[5])
    except ValueError:
        return -1.0


ios_reviews_max = {}  #maps each app id to its highest review count (as a number)

for app in ios:
    app_id = app[0]
    n_reviews = _ios_review_count(app)

    #Record the count for a new id, or replace a lower recorded count
    if app_id not in ios_reviews_max or ios_reviews_max[app_id] < n_reviews:
        ios_reviews_max[app_id] = n_reviews

ios_clean = []
ios_already_added = set()  #ids already copied into ios_clean

for app in ios:
    app_id = app[0]

    #Skip duplicates of an app that was already kept (covers ties on review count)
    if app_id in ios_already_added:
        continue

    #Keep only the entry whose review count is the highest recorded for this id
    if ios_reviews_max[app_id] == _ios_review_count(app):
        ios_clean.append(app)
        ios_already_added.add(app_id)

Removing non-English apps

In [11]:
def is_english(string):
    """Return True when `string` has at most three non-ASCII characters.

    A character counts as non-ASCII when its code point is above 127.
    """
    #tally the characters that fall outside the ASCII range
    non_ascii_total = sum(1 for symbol in string if ord(symbol) > 127)

    #up to three such characters are tolerated; a fourth marks the text as non-English
    return non_ascii_total <= 3

In [12]:
#Keep only the apps whose name passes is_english (three or fewer non-ASCII characters).
#The Android app name is column 0 ('App').
android_english = [app for app in android_clean if is_english(app[0])]

#Filter the de-duplicated ios_clean list. Iterating the raw ios list here would
#discard the duplicate removal done earlier. The iOS app name is column 1 ('track_name').
ios_english = [app for app in ios_clean if is_english(app[1])]

Isolate the free apps

In [13]:
#Android: the price is at index 7 of each row, and free apps carry the string '0'
android_final = [entry for entry in android_english if entry[7] == '0']

#iOS: the price is at index 4 of each row, and free apps carry the string '0.0'
ios_final = [entry for entry in ios_english if entry[4] == '0.0']

# Building Frequency Tables

Frequency Table Functions

In [55]:
#freq_table builds a percentage frequency table for one column of a data set
def freq_table(dataset, index):
    """Return how often each value of column `index` occurs, as percentages.

    Parameters:
        dataset: iterable of rows (each row indexable by `index`).
        index: position of the column to summarise.

    Returns:
        A list of (value, percentage) tuples sorted from the most to the
        least frequent value. Values with equal percentages keep the order
        in which they first appear. An empty dataset yields an empty list.
    """
    counts = {}
    total = 0

    #count the occurrences of every distinct value in the chosen column
    for row in dataset:
        total += 1
        value = row[index]
        counts[value] = counts.get(value, 0) + 1

    #convert the raw counts to percentages of the total number of rows
    table_percentages = {}
    for key in counts:
        table_percentages[key] = (counts[key] / total) * 100

    #sort once, after all percentages are known. Sorting inside the loop repeated
    #the work for every key and left the result undefined for an empty dataset.
    return sorted(table_percentages.items(), key=lambda x: x[1], reverse=True)

#display_table prints the frequency table that freq_table builds
def display_table(dataset, index):
    """Print one 'value : percentage' line per entry of freq_table(dataset, index)."""
    #each entry is a (value, percentage) pair, already in freq_table's order
    for label, share in freq_table(dataset, index):
        print(label, ':', share)

The headers, just for reference

In [13]:
#Column names of both data sets; the extra newlines in `end` leave blank lines between them
print("Android: ", android_header, end='\n\n\n')
print("Ios: ", ios_header)

Android:  ['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


Ios:  ['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


Here's an example of freq_table printing the categories in high-to-low order of how frequent each unique category is in the dataset:

In [15]:
#index 8 is the 'Content Rating' column of the Android header
print(freq_table(android_final, 8))

[('Everyone', 81.42631460167004), ('Teen', 11.058451816745656), ('Mature 17+', 4.1864139020537126), ('Everyone 10+', 3.2723990069961637), ('Adults only 18+', 0.033852403520649964), ('Unrated', 0.022568269013766643)]


Here's an example of display_table printing each category and its frequency, as a percentage of the dataset:

In [27]:
#index 8 is the 'Content Rating' column of the Android header
display_table(android_final, 8)

Everyone : 81.42631460167004
Teen : 11.058451816745656
Mature 17+ : 4.1864139020537126
Everyone 10+ : 3.2723990069961637
Adults only 18+ : 0.033852403520649964
Unrated : 0.022568269013766643


Using the display_table function, you can find the most frequent values for each category of the apps in the given app store.

Using this data, say your target audience is teenagers and you want to know which apps have the highest ratings.

You can find out which Teen apps have the highest ratings using a for loop:

In [54]:
teen_ratings = {}
#loops through each app in android_final
for app in android_final:
    rating = app[2]
    #keep the app only if its content rating is 'Teen', its rating is not 'NaN',
    #and the rating is above 4.5; the rating is stored as its original string
    if app[8] == 'Teen' and rating != 'NaN' and float(rating) > 4.5:
        teen_ratings[app[0]] = rating

#sort the apps from highest to lowest rating and print each one.
#The key converts the rating to a float so the order is numeric; sorting the raw
#strings would order them character by character.
ratings = sorted(teen_ratings.items(), key=lambda item: float(item[1]), reverse=True)
for app in ratings:
    print(app)

('Spine- The dating app', '5.0')
('Eternal Light AG', '5.0')
('Jobs in Canada - Emplois au Canada', '5.0')
('A-Y Collection', '5.0')
('Railroad Radio Vancouver BC', '5.0')
('Movement BE', '5.0')
('CL Notifier', '5.0')
('Foothills CP', '5.0')
('DN Blog', '5.0')
('chat dz', '5.0')
('i am EB', '5.0')
('UP EB Bill Payment & Details', '5.0')
('EC Calgary', '5.0')
('EF Events', '5.0')
('EJ messenger', '5.0')
('Hum Ek Hain 2.02', '5.0')
('Jigsaw Volvo FH 16 Trucks', '5.0')
('Wallpapers FN SCAR H', '5.0')
('Fr. Daoud Lamei', '5.0')
('Mummatikabalkuragi', '4.9')
('Down Dog: Great Yoga Anywhere', '4.9')
('EXO-L Amino for EXO Fans', '4.9')
('BT Church', '4.9')
('DC Comics Amino', '4.9')
('Eddsworld Amino', '4.9')
('Canvas FL', '4.9')
('Free Books - Spirit Fanfiction and Stories', '4.8')
('SoloLearn: Learn to Code for Free', '4.8')
('Eternium', '4.8')
('KPOP Amino for K-Pop Entertainment', '4.8')
("PewDiePie's Tuber Simulator", '4.8')
('Undertale AU Amino', '4.8')
('Catalyst AZ', '4.8')
('WebComic

# Conclusion

Using for loops similar to the one above, you can explore the data of individual apps and find information pertaining to your company's interests.