# Andriod and IOS App Project
Our Goal for this project is to analyze data to help our developers understand what type of apps are likely to attract more users. To do this we will collect and analyze data about mobile apps available on Google play and the app store

In [1]:
apple_opened_file = open('AppleStore.csv')
from csv import reader
apple_file = reader(apple_opened_file)
apple_data = list(apple_file)
apple_header = apple_data[0]
apple_data = apple_data[1:]

In [2]:
google_opened_file = open('googleplaystore.csv')
from csv import reader
google_file = reader(google_opened_file)
google_data = list(google_file)
google_header = google_data[0]
google_data = google_data[1:]

In [3]:
#dataset is expected to be a list of lists
#start and end are expected to be ints and represent the starting and 
#ending indices of the sliced data
#rows_and_columns is expected to be a boolean and has false as the
#defualt argument 
def explore_data(dataset, start, end, rows_and_columns = False):
    dataset_slice = dataset[start:end] #slices the data 
    for row in dataset_slice: #loops through the slice and prints each iteration
        print(row)
        print('\n') #adds a new (empty) line fter each row
    
    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [4]:
print(apple_header)
print('\n')
explore_data(apple_data, 0, 3, True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 7197
Number of columns: 16


In [5]:
print(google_header)
print('\n')
explore_data(google_data, 0, 3, True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13


In [6]:
print(google_data[10472])

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


In [7]:
del google_data[10472]

In the following code I am going to show how the google data set has duplicates and will be eventually cleaning the data to remove any unneccesary data

In [8]:
duplicate_google_apps = []
unique_google_apps = []

for app in google_data:
    name = app[0]
    if name in unique_google_apps:
        duplicate_google_apps.append(name)
    else:
        unique_google_apps.append(name)
print('Number of duplicate apps:', len(duplicate_google_apps))
print('Number of Unique apps:', len(unique_google_apps))
        

Number of duplicate apps: 1181
Number of Unique apps: 9659



We don't want to count certain apps more than once when we analyze data, so we need to remove the duplicate entries and keep only one entry per app. One thing we could do is remove the duplicate rows randomly, but we could probably find a better way.

If you examine the rows we printed two cells above for the Instagram app, the main difference happens on the fourth position of each row, which corresponds to the number of reviews. The different numbers show that the data was collected at different times. We can use this to build a criterion for keeping rows. We won't remove rows randomly, but rather we'll keep the rows that have the highest number of reviews because the higher the number of reviews, the more reliable the ratings.

# Part 2 of data cleaning 
we will be creating a dictionary where each key is a unique app and only save the app with the highest review count

In [9]:
reviews_max = {}
for app in google_data:
    name = app[0]
    n_reviews = float(app[3])
    if name in reviews_max and (reviews_max[name] < n_reviews):
        reviews_max[name] = n_reviews
    elif name not in reviews_max:
        reviews_max[name] = n_reviews
        

In [10]:
print('Actual Results:', len(reviews_max))
print('Expected Results:', len(google_data) - 1181)

Actual Results: 9659
Expected Results: 9659


We will now be removing duplicate rows of our data set
Below are the steps to create 2 different types of data
1) create an empty dictionary (use [])
2) loop through your data set
3) set your ints

In [11]:
android_clean = []
already_added = []

for app in google_data:
    name  = app[0]
    n_reviews = float(app[3])
    
    if (n_reviews == reviews_max[name]) and (name not in already_added):
        android_clean.append(app)
        already_added.append(name)

Here we are exploring the clean data

In [12]:
explore_data(android_clean, 0, 3, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9659
Number of columns: 13


# Removing Non-English Apps:
we will build a function that removes any none english apps

In [13]:
def is_english(string):
    for character in string:
        #ord is a built in function that interpretes chars as #'s
        if ord(character) > 127: 
           return False 
    return True

Practice using the code:

In [14]:
Insta = is_english('Instagram')
print(Insta)

True


This code is the same as the one above but we added another conditional statement: 

In [15]:
def is_english(string):
    non_ascii = 0
    for character in string:
        #ord is a built in function that interpretes chars as #'s
        if ord(character) > 127: 
            non_ascii += 1
    if non_ascii > 3:
        return False
    else:
        return True 

This is a practice 

In [16]:
print(is_english('Docs To Go™ Free Office Suite'))

True


This is creating a function that filters out non-english apps from both data sets

In [17]:
android_english = []
apple_english = []

for app in android_clean:
    name = app[0]
    if is_english(name):
        android_english.append(app)

for app in apple_data:
    name = app[0]
    if is_english(name):
        apple_english.append(app)
explore_data(apple_english,0,3,True)
explore_data(android_english,0,3,True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 7197
Number of columns: 16
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies

This code is isolating the free apps in seperate lists

In [18]:
apple_final = []
android_final = []

for app in android_english:
    price = app[7]
    if price == '0':
        android_final.append(app)

for app in apple_english:
    price = app[4]
    if price == '0.0':
        apple_final.append(app)
print(len(apple_final))
print(len(android_final))

4056
8864


# this code is used to make freuqency tables

In [19]:

def freq_table(dataset, index):
    table = {}
    total = 0
    
    for row in dataset:
        total += 1
        value = row[index]
        if value in table:
            table[value] += 1
        else:
            table[value] = 1
    
    table_percentages = {}
    for key in table:
        percentage = (table[key] / total) * 100
        table_percentages[key] = percentage 
    
    return table_percentages


def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)
        
    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])

In [20]:
display_table(apple_final, -5)

Games : 55.64595660749507
Entertainment : 8.234714003944774
Photo & Video : 4.117357001972387
Social Networking : 3.5256410256410255
Education : 3.2544378698224854
Shopping : 2.983234714003945
Utilities : 2.687376725838264
Lifestyle : 2.3175542406311638
Finance : 2.0710059171597637
Sports : 1.947731755424063
Health & Fitness : 1.8737672583826428
Music : 1.6518737672583828
Book : 1.6272189349112427
Productivity : 1.5285996055226825
News : 1.4299802761341223
Travel : 1.3806706114398422
Food & Drink : 1.0601577909270217
Weather : 0.7642998027613412
Reference : 0.4930966469428008
Navigation : 0.4930966469428008
Business : 0.4930966469428008
Catalogs : 0.22189349112426035
Medical : 0.19723865877712032


In [21]:
display_table(android_final, -4)

Tools : 8.449909747292418
Entertainment : 6.069494584837545
Education : 5.347472924187725
Business : 4.591606498194946
Productivity : 3.892148014440433
Lifestyle : 3.892148014440433
Finance : 3.7003610108303246
Medical : 3.531137184115524
Sports : 3.463447653429603
Personalization : 3.3167870036101084
Communication : 3.2378158844765346
Action : 3.1024368231046933
Health & Fitness : 3.0798736462093865
Photography : 2.944494584837545
News & Magazines : 2.7978339350180503
Social : 2.6624548736462095
Travel & Local : 2.3240072202166067
Shopping : 2.2450361010830324
Books & Reference : 2.1435018050541514
Simulation : 2.0419675090252705
Dating : 1.861462093862816
Arcade : 1.8501805054151623
Video Players & Editors : 1.7712093862815883
Casual : 1.7599277978339352
Maps & Navigation : 1.3989169675090252
Food & Drink : 1.2409747292418771
Puzzle : 1.128158844765343
Racing : 0.9927797833935018
Role Playing : 0.9363718411552346
Libraries & Demo : 0.9363718411552346
Auto & Vehicles : 0.9250902527075

# Need to relearn this area 

In [22]:
genres_ios = freq_table(apple_final, -5)

for genre in genres_ios:
    total = 0
    len_genre = 0
    for app in apple_final:
        genre_app = app[-5]
        if genre_app == genre:            
            n_ratings = float(app[5])
            total += n_ratings
            len_genre += 1
    avg_n_ratings = total / len_genre
    print(genre, ':', avg_n_ratings)



Health & Fitness : 19952.315789473683
Navigation : 25972.05
Business : 6367.8
Entertainment : 10822.961077844311
Music : 56482.02985074627
Sports : 20128.974683544304
Productivity : 19053.887096774193
Book : 8498.333333333334
Reference : 67447.9
Utilities : 14010.100917431193
Photo & Video : 27249.892215568863
Travel : 20216.01785714286
Social Networking : 53078.195804195806
Education : 6266.333333333333
Lifestyle : 8978.308510638299
Finance : 13522.261904761905
Food & Drink : 20179.093023255813
Shopping : 18746.677685950413
Catalogs : 1779.5555555555557
Weather : 47220.93548387097
Games : 18924.68896765618
News : 15892.724137931034
Medical : 459.75


In [24]:
for app in apple_final:
    if app[-5] == 'Navigation':
        print(app[1], ':', app[5]) # print name and number of ratings

Waze - GPS Navigation, Maps & Real-time Traffic : 345046
Google Maps - Navigation & Transit : 154911
Geocaching® : 12811
CoPilot GPS – Car Navigation & Offline Maps : 3582
高德地图（精准专业的手机地图） : 1040
百度地图-智能的手机导航，公交地铁出行必备 : 1014
百度地图HD : 771
ImmobilienScout24: Real Estate Search in Germany : 187
ナビタイムの乗り換え案内 - 遅延情報やバス時刻表を案内するアプリ : 48
高德地图HD : 26
Railway Route Search : 5
NAVIRO(ナビロー) - カーナビ/バイクナビ/徒歩ナビが使える高性能ナビアプリ : 0
ホラースポット-ghost spot-意味が分かると怖いマップ : 0
MapFan(マップファン) – 渋滞情報/オービス/オフライン対応の本格カーナビ : 0
JR東日本アプリ : 0
えほう - 最強の恵方コンパス : 0
バーチャル恵方巻【節分・恵方コンパス・方位】 : 0
恵方コンパス. : 0
ナビタイム ドライブサポーター - NAVITIMEのカーナビアプリ : 0
自転車ナビ by NAVITIME(ナビタイム) - 自転車のナビができるアプリ : 0


In [25]:
categories_android = freq_table(android_final, 1)

for category in categories_android:
    total = 0
    len_category = 0
    for app in android_final:
        category_app = app[1]
        if category_app == category:            
            n_installs = app[5]
            n_installs = n_installs.replace(',', '')
            n_installs = n_installs.replace('+', '')
            total += float(n_installs)
            len_category += 1
    avg_n_installs = total / len_category
    print(category, ':', avg_n_installs)

EDUCATION : 1833495.145631068
BEAUTY : 513151.88679245283
WEATHER : 5074486.197183099
FAMILY : 3695641.8198090694
ENTERTAINMENT : 11640705.88235294
HOUSE_AND_HOME : 1331540.5616438356
HEALTH_AND_FITNESS : 4188821.9853479853
PHOTOGRAPHY : 17840110.40229885
SHOPPING : 7036877.311557789
COMMUNICATION : 38456119.167247385
PERSONALIZATION : 5201482.6122448975
LIFESTYLE : 1437816.2687861272
SOCIAL : 23253652.127118643
ART_AND_DESIGN : 1986335.0877192982
FOOD_AND_DRINK : 1924897.7363636363
PARENTING : 542603.6206896552
MEDICAL : 120550.61980830671
TOOLS : 10801391.298666667
LIBRARIES_AND_DEMO : 638503.734939759
FINANCE : 1387692.475609756
BUSINESS : 1712290.1474201474
SPORTS : 3638640.1428571427
AUTO_AND_VEHICLES : 647317.8170731707
COMICS : 817657.2727272727
BOOKS_AND_REFERENCE : 8767811.894736841
VIDEO_PLAYERS : 24727872.452830188
NEWS_AND_MAGAZINES : 9549178.467741935
MAPS_AND_NAVIGATION : 4056941.7741935486
PRODUCTIVITY : 16787331.344927534
TRAVEL_AND_LOCAL : 13984077.710144928
GAME : 155

In [26]:
for app in android_final:
    if app[1] == 'COMMUNICATION' and (app[5] == '1,000,000,000+'
                                      or app[5] == '500,000,000+'
                                      or app[5] == '100,000,000+'):
        print(app[0], ':', app[5])

WhatsApp Messenger : 1,000,000,000+
imo beta free calls and text : 100,000,000+
Android Messages : 100,000,000+
Google Duo - High Quality Video Calls : 500,000,000+
Messenger – Text and Video Chat for Free : 1,000,000,000+
imo free video calls and chat : 500,000,000+
Skype - free IM & video calls : 1,000,000,000+
Who : 100,000,000+
GO SMS Pro - Messenger, Free Themes, Emoji : 100,000,000+
LINE: Free Calls & Messages : 500,000,000+
Google Chrome: Fast & Secure : 1,000,000,000+
Firefox Browser fast & private : 100,000,000+
UC Browser - Fast Download Private & Secure : 500,000,000+
Gmail : 1,000,000,000+
Hangouts : 1,000,000,000+
Messenger Lite: Free Calls & Messages : 100,000,000+
Kik : 100,000,000+
KakaoTalk: Free Calls & Text : 100,000,000+
Opera Mini - fast web browser : 100,000,000+
Opera Browser: Fast and Secure : 100,000,000+
Telegram : 100,000,000+
Truecaller: Caller ID, SMS spam blocking & Dialer : 100,000,000+
UC Browser Mini -Tiny Fast Private & Secure : 100,000,000+
Viber Mess