# Data Analysis Project for App Store
- The goal of this project is to analyze data that will help the hypothetical developers understand what type of apps are likely to attract more users

In [70]:
from csv import reader
def open_file(file, list_name):
    with open(file, 'r', encoding="utf8") as opened_file:
        read_file = reader(opened_file)
        for line in read_file:
            list_name.append(line)

In [71]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [72]:
apple = []
google = []
open_file('AppleStore.csv', apple)
open_file('googleplaystore.csv', google)

In [73]:
explore_data(apple, 1, 3, True)


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 7198
Number of columns: 16


In [74]:
explore_data(google, 1, 3, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


Number of rows: 10842
Number of columns: 13


Below we will createa function to remove inaccurate data (rows with missing data)

In [75]:
def delete_error_row():
    for row in google[1:]:
        header = google[0]
        if len(row) != len(header):
            del google[google.index(row)]
        
delete_error_row()

def check_for_faulty_row():
    for row in google[1:]:
        header = google[0]
        if len(row) != len(header):
            print('There are still faulty rows')
    print('There are no faulty rows')

check_for_faulty_row()

There are no faulty rows


We will now check if the Google Play dataset has multiple entries of the same data and collect those duplicates if so

In [76]:
duplicate = []
unique = []

for row in google:
    if row in unique:
        duplicate.append(row)
    else:
        unique.append(row)
        
print(f'Number of duplicate apps: {len(duplicate)}')
print('Some duplicate apps: ', duplicate[1:10])

Number of duplicate apps: 483
Some duplicate apps:  [['Box', 'BUSINESS', '4.2', '159872', 'Varies with device', '10,000,000+', 'Free', '0', 'Everyone', 'Business', 'July 31, 2018', 'Varies with device', 'Varies with device'], ['Google My Business', 'BUSINESS', '4.4', '70991', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'July 24, 2018', '2.19.0.204537701', '4.4 and up'], ['ZOOM Cloud Meetings', 'BUSINESS', '4.4', '31614', '37M', '10,000,000+', 'Free', '0', 'Everyone', 'Business', 'July 20, 2018', '4.1.28165.0716', '4.0 and up'], ['join.me - Simple Meetings', 'BUSINESS', '4.0', '6989', 'Varies with device', '1,000,000+', 'Free', '0', 'Everyone', 'Business', 'July 16, 2018', '4.3.0.508', '4.4 and up'], ['Box', 'BUSINESS', '4.2', '159872', 'Varies with device', '10,000,000+', 'Free', '0', 'Everyone', 'Business', 'July 31, 2018', 'Varies with device', 'Varies with device'], ['Zenefits', 'BUSINESS', '4.2', '296', '14M', '50,000+', 'Free', '0', 'Everyone', 'Busine

Next, we will remove the duplicate apps from the dataset but instead of removing random duplicates, we will keep the one with the highest rating and delete the rest

In [77]:
reviews_max =  {}
for row in google[1:]:
    name = row[0]
    n_reviews = float(row[3])
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
    elif name not in reviews_max:
        reviews_max[name] = n_reviews

print('Expect dictionary length is 9,659')
print(f'Dictionary length is {len(reviews_max)}')
        

Expect dictionary length is 9,659
Dictionary length is 9659


Below, we will use two lists and a loop to separate unique apps from there duplicates. 

In [78]:
android_clean = []
already_added = []
for row in google[1:]:
    name = row[0]
    n_reviews = float(row[3])
    if (n_reviews == reviews_max[name]) and (name not in already_added):
        android_clean.append(row)
        already_added.append(name)

print('Expected length for android_clean dataset: 9659')
print(f'android_clean has {len(android_clean)} rows')

Expected length for android_clean dataset: 9659
android_clean has 9659 rows


We are only analysing apps that are designed for English speaking audiences and thus, we will remove the apps that aren't designed for English speakers.

First we will create a function to check if most characters in a piece of text are common English characters. For now the limit of non English characters is 3

In [79]:
def common_english_char(char):
    if ord(char) <=127:
        return True
    else:
        return False

def english_or_not(text):
    limit = 3
    non_english_count = 0
    for char in text:
        if common_english_char(char) == False:
            non_english_count += 1
    if non_english_count > limit:
        return False
    else: 
        return True

# Testing that the functions work as intended
print(english_or_not('Instagram'))
print(english_or_not('Áà±Â•áËâ∫PPS -„ÄäÊ¨¢‰πêÈ¢Ç2„ÄãÁîµËßÜÂâßÁÉ≠Êí≠'))
print(english_or_not('Docs To Go‚Ñ¢ Free Office Suite'))
print(english_or_not('Instachat üòú'))


True
False
True
True


We will now use the function to filter out non-English apps from both datasets by creating new lists that contain only the English apps

In [80]:
english_only_apple = []
english_only_google = []
print(f'Apple dataset rows before non-english filter: {len(apple)}')
print(f'Google Play data set before non-english filter: {len(android_clean)}')
print('\n')

for row in apple:
    name = row[0]
    if english_or_not(name):
        english_only_apple.append(row)

for row in android_clean:
    name = row[0]
    if english_or_not(name):
        english_only_google.append(row)
print('Apple')
explore_data(english_only_apple,1,3, True)
print('\n')
print('Google')
explore_data(english_only_google, 1,3, True)

Apple dataset rows before non-english filter: 7198
Google Play data set before non-english filter: 9659


Apple
['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 7198
Number of columns: 16


Google
['U Launcher Lite ‚Äì FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9614
Number of columns: 13


Because we only build apps that are free to download, we will isolate our free apps from the dataset

In [85]:
free_apple_apps = []
free_google_apps = []

def free_apps(list_to_read_from, list_to_append_to, price_index):
    for row in list_to_read_from[1:]:
        free_or_not = row[price_index]
        if free_or_not == '0' or free_or_not == '0.0':
            list_to_append_to.append(row)
            
            
free_apps(english_only_apple, free_apple_apps, 4)
free_apps(english_only_google, free_google_apps, 7)

print(f'Free Apple apps dataset rows: {len(free_apple_apps)}')
print(f'Free Google Play apps dataset rows: {len(free_google_apps)}')

Free Apple apps dataset rows: 4056
Free Google Play apps dataset rows: 8863


Below, we are going to create a frequency table to get a sense of the most common app genres on both the Apple Store and the Google Play store as the end goal is to add a new app on both. 

The frequency table should help us understand which apps are succesful on both markets

The frequency for each genre will be displayed as a percentage of all the genres in the table

In [86]:
# print(apple[1][11])
import pprint
def freq_table(dataset, index):
        table = {}
        total_number_of_apps = 0
        for row in dataset[1:]:
            total_number_of_apps += 1
            if row[index] in table:
                table[row[index]] += 1
            else:
                table[row[index]] = 1
                
        table_percentages = {}
        
        for key in table:
            value = (table[key]/total_number_of_apps)* 100
            table_percentages[key] = value
            
        return table_percentages
print('Apple - Prime Genre Column')
print('\n')
pprint.pprint(freq_table(free_apple_apps, 11))
print('\n')
print('Google - Genres Column')
print('\n')
pprint.pprint(freq_table(free_google_apps, 9))
        

Apple - Prime Genre Column


{'Book': 1.627620221948212,
 'Business': 0.4932182490752158,
 'Catalogs': 0.22194821208384713,
 'Education': 3.255240443896424,
 'Entertainment': 8.236744759556105,
 'Finance': 2.0715166461159065,
 'Food & Drink': 1.060419235511714,
 'Games': 55.659679408138096,
 'Health & Fitness': 1.8742293464858202,
 'Lifestyle': 2.318125770653514,
 'Medical': 0.19728729963008632,
 'Music': 1.6522811344019728,
 'Navigation': 0.4932182490752158,
 'News': 1.4303329223181258,
 'Photo & Video': 4.1183723797780525,
 'Productivity': 1.528976572133169,
 'Reference': 0.4932182490752158,
 'Shopping': 2.9839704069050557,
 'Social Networking': 3.501849568434032,
 'Sports': 1.9482120838471024,
 'Travel': 1.381011097410604,
 'Utilities': 2.688039457459926,
 'Weather': 0.7644882860665845}


Google - Genres Column


{'Action': 3.1031369893929135,
 'Action;Action & Adventure': 0.1015572105619499,
 'Adventure': 0.6770480704129994,
 'Adventure;Action & Adventure': 0.033852403520649964,
 '

In [87]:
# function copied from instructions as instructed

def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)
        
    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])
        
# already done above in my own way but just following instructions here
print('Apple apps frequency in percentages')
print('\n')
display_table(free_apple_apps, 11)
print('\n')
print('Google Play apps frequency in percentages')
print('\n')
display_table(free_google_apps, 9)

Apple apps frequency in percentages


Games : 55.659679408138096
Entertainment : 8.236744759556105
Photo & Video : 4.1183723797780525
Social Networking : 3.501849568434032
Education : 3.255240443896424
Shopping : 2.9839704069050557
Utilities : 2.688039457459926
Lifestyle : 2.318125770653514
Finance : 2.0715166461159065
Sports : 1.9482120838471024
Health & Fitness : 1.8742293464858202
Music : 1.6522811344019728
Book : 1.627620221948212
Productivity : 1.528976572133169
News : 1.4303329223181258
Travel : 1.381011097410604
Food & Drink : 1.060419235511714
Weather : 0.7644882860665845
Reference : 0.4932182490752158
Navigation : 0.4932182490752158
Business : 0.4932182490752158
Catalogs : 0.22194821208384713
Medical : 0.19728729963008632


Google Play apps frequency in percentages


Tools : 8.451816745655607
Entertainment : 6.070864364703228
Education : 5.348679756262695
Business : 4.5926427443015125
Productivity : 3.8930264048747465
Lifestyle : 3.8930264048747465
Finance : 3.7011961182577298

Though we can now see what type of apps are the most common on both online stores, what might be more useful is knowing which genres have the most users or get the most downloads rather than which genres of apps occurs most frequently on the app stores


In [93]:
the_apple_apps = freq_table(free_apple_apps, 11)
user_ratings = {}
for genre in the_apple_apps:
    total = 0
    len_genre = 0
    for row in free_apple_apps:
        genre_app = row[11]
        if genre_app == genre:
            num_user_ratings = float(row[5])
            total += num_user_ratings
            len_genre += 1
    avg_num_ratings = total/len_genre
    user_ratings[genre] = avg_num_ratings
    print(f'{genre}: {avg_num_ratings}')

Photo & Video: 27249.892215568863
Games: 18924.68896765618
Music: 56482.02985074627
Social Networking: 53078.195804195806
Reference: 67447.9
Health & Fitness: 19952.315789473683
Weather: 47220.93548387097
Utilities: 14010.100917431193
Travel: 20216.01785714286
Shopping: 18746.677685950413
News: 15892.724137931034
Navigation: 25972.05
Lifestyle: 8978.308510638299
Entertainment: 10822.961077844311
Food & Drink: 20179.093023255813
Sports: 20128.974683544304
Book: 8498.333333333334
Finance: 13522.261904761905
Education: 6266.333333333333
Productivity: 19053.887096774193
Business: 6367.8
Catalogs: 1779.5555555555557
Medical: 459.75


In [110]:
highest_avg_num_ratings = max(user_ratings.values())
most_popular_app_name = ''

for k,v in user_ratings.items():
    if v == highest_avg_num_ratings:
        most_popular_app_name = k
        
print(most_popular_app_name, ':', highest_avg_num_ratings)


Reference : 67447.9


From the above it looks like Reference apps are the most popular type of apps and most downloaded type of app