### Analyzing free store apps
This project is intended to analyze store apps data 

In [1]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [2]:
a_file = open('AppleStore.csv')
g_file = open('googleplaystore.csv')
from csv import reader
a_list = list(reader(a_file))
g_list = list(reader(g_file))

In [3]:
explore_data(a_list, 0, 2, True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


Number of rows: 7198
Number of columns: 16


In [4]:
explore_data(g_list, 0, 2, True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


Number of rows: 10842
Number of columns: 13


In [5]:
print(g_list[10473])

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


In [6]:
del g_list[10473]

Let's inspect the data to see if there are duplicate entries

In [7]:
duplicate_apps = []
unique_apps = []
for app in g_list[1:]:
    name = app[0]
    if name in unique_apps:
        duplicate_apps.append(name)
    else:
        unique_apps.append(name)
print(len(duplicate_apps))
print(len(unique_apps))

1181
9659


Storing the app names with their highest number of reviews to indicate the latest updated data

In [8]:
reviews_max = {}
for app in g_list[1:]:
    name = app[0]
    n_reviews = float(app[3])
    if (name in reviews_max and reviews_max[name] < n_reviews) or name not in reviews_max:
        reviews_max[name] = n_reviews
print(len(reviews_max))

9659


Removing duplicates and keeping only the latest record of every app

In [9]:
android_clean = []
already_added = []
for app in g_list[1:]:
    name = app[0]
    n_reviews = float(app[3])
    if (reviews_max[name] == n_reviews and name not in already_added):
        android_clean.append(app)
        already_added.append(name)
print(len(android_clean))

9659


In [10]:
def is_english(word):
    non_english = 0
    for c in word:
        order = ord(c)
        if  order > 127:
            non_english += 1
            if non_english > 3:
                return False
    return True
print(is_english('Instachat 😜'))
            

True


In [11]:
english_only = []
for app in android_clean:
    if is_english(app[0]):
        english_only.append(app)

In [12]:
free = []
for app in english_only:
    if app[6] == 'Free':
        free.append(app)
print(len(free))

8863


Let's study the app genres to determine a profitable app profile

In [13]:
print(a_list[0].index('prime_genre'))

11


In [14]:
print(g_list[0].index('Genres'))

9


In [15]:
def freq_table(dataset, index):
    freq = {}
    for row in dataset:
        value = row[index]
        if value in freq:
            freq[value]+=1
        else:
            freq[value] = 1
    return freq

In [16]:
def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])

In [18]:
display_table(english_only, 9)

Tools : 827
Entertainment : 557
Education : 503
Business : 419
Medical : 395
Personalization : 375
Productivity : 373
Lifestyle : 363
Finance : 345
Sports : 331
Communication : 314
Action : 299
Health & Fitness : 288
Photography : 280
News & Magazines : 250
Social : 239
Travel & Local : 218
Books & Reference : 218
Shopping : 201
Simulation : 190
Arcade : 184
Dating : 170
Casual : 165
Video Players & Editors : 161
Maps & Navigation : 129
Puzzle : 119
Food & Drink : 112
Role Playing : 104
Strategy : 94
Racing : 91
Libraries & Demo : 84
Auto & Vehicles : 84
Weather : 79
House & Home : 73
Adventure : 72
Events : 64
Art & Design : 56
Comics : 54
Beauty : 53
Card : 47
Parenting : 46
Board : 42
Casino : 39
Educational;Education : 38
Trivia : 37
Educational : 37
Education;Education : 35
Casual;Pretend Play : 25
Word : 23
Music : 19
Puzzle;Brain Games : 17
Education;Pretend Play : 17
Racing;Action & Adventure : 16
Entertainment;Music & Video : 15
Board;Brain Games : 14
Arcade;Action & Adventure

In [21]:
a_list[0].index('rating_count_tot')

5

In [24]:
table = freq_table(a_list, 11)
print(table)

{'Book': 112, 'Lifestyle': 144, 'Photo & Video': 349, 'Sports': 114, 'Social Networking': 167, 'Catalogs': 10, 'Travel': 81, 'Finance': 104, 'Music': 138, 'Games': 3862, 'News': 75, 'Business': 57, 'Productivity': 178, 'Medical': 23, 'Food & Drink': 63, 'Utilities': 248, 'Weather': 72, 'Reference': 64, 'prime_genre': 1, 'Health & Fitness': 180, 'Education': 453, 'Navigation': 46, 'Shopping': 122, 'Entertainment': 535}


In [25]:
unique_genres = []
for entry in table:
    if entry not in unique_genres:
        unique_genres.append(entry)
print(unique_genres)

['Book', 'Lifestyle', 'Photo & Video', 'Sports', 'Social Networking', 'Catalogs', 'Travel', 'Finance', 'Music', 'Games', 'News', 'Business', 'Productivity', 'Medical', 'Food & Drink', 'Utilities', 'Weather', 'Reference', 'prime_genre', 'Health & Fitness', 'Education', 'Navigation', 'Shopping', 'Entertainment']


In [None]:
for 