# Data Profile for Android and iOS mobile apps
This project builds profiles of profitable mobile apps for the App Store and Google Play markets, analyzing the store data to understand what kinds of apps are likely to attract more users.

### Load the Android and iOS store datasets

In [1]:
#function for opening and exploring dataset

def explore_data(dataset, start, end, rows_and_column=False):
    """Print the rows dataset[start:end], optionally followed by the table size.

    Args:
        dataset: list of rows; row 0 is treated as the header when counting.
        start, end: slice bounds of the rows to print.
        rows_and_column: when true, also print the number of rows (header
            excluded) and the number of columns (length of the header row).
    """
    for record in dataset[start:end]:
        print(record)
        print('\n') # blank lines so consecutive rows are easy to tell apart
    if not rows_and_column:
        return
    print('Number of rows:', len(dataset[1:]))
    print('Number of columns:', len(dataset[0]))

In [2]:
#open, read and store data
from csv import reader

# Read each CSV inside a `with` block so the file is closed even if parsing
# raises. The encoding is set explicitly because app names contain non-ASCII
# characters and the platform default encoding is not UTF-8 everywhere;
# newline='' is what the csv module asks for so that newlines embedded in
# quoted fields are handled correctly.
with open('AppleStore.csv', encoding='utf8', newline='') as opened_file1: #iOs data
    dataset_ios = list(reader(opened_file1)) # row 0 is the header

with open('googleplaystore.csv', encoding='utf8', newline='') as opened_file2: #Android data
    dataset_android = list(reader(opened_file2)) # row 0 is the header

In [3]:
# Show the header row and the first app of each store, followed by the
# number of rows and columns. iOS is printed first, then Android.
for store_rows in (dataset_ios, dataset_android):
    explore_data(store_rows, 0, 2, rows_and_column=True)


['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


Number of rows: 7197
Number of columns: 16
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13


### Data Cleaning
#### PART 1: Deleting Wrong data
The maximum rating is 5.0, so we need to exclude any row that contains an out-of-range rating.

In [4]:
# iOS: columns 7 and 8 hold 'user_rating' and 'user_rating_ver'
count_wrong_ios = 0
for row in dataset_ios[1:]: # the slice is a copy, so removing from the original is safe
    overall_rating, version_rating = float(row[7]), float(row[8])
    if overall_rating > 5 or version_rating > 5: # 5 is the top of the rating scale
        dataset_ios.remove(row)
        count_wrong_ios += 1

# Android: column 2 holds 'Rating'
count_wrong_android = 0
for row in dataset_android[1:]:
    if float(row[2]) > 5:
        dataset_android.remove(row)
        count_wrong_android += 1

print('Number of wrong data of iOs: ',count_wrong_ios)
print('Number of wrong data of Android: ',count_wrong_android)

Number of wrong data of iOs:  0
Number of wrong data of Android:  1


#### PART 2: Deleting duplicate data

In [5]:
#Android has duplicated app so decide to store only maximum review

#function for updating review point
def reviews_max(reviews_app, dataset, index_app, index_review):
    """Record, per app key, the largest review count found in the dataset.

    Args:
        reviews_app: dict updated in place; maps app key -> highest count seen.
        dataset: list of rows; row 0 (the header) is skipped.
        index_app: column holding the app key.
        index_review: column holding the review count (parsed with int()).
    """
    for row in dataset[1:]:
        name = row[index_app]
        count = int(row[index_review])
        # first sighting of the app, or a new record for a known app
        if name not in reviews_app or count > reviews_app[name]:
            reviews_app[name] = count

#function for removing duplicated app having review point which is less than maximum
def unique_app(reviews_app, dataset, index_app, index_review, duplicate_app, unique_app):
    """Remove duplicate app rows from the dataset in place, keeping one max-review row.

    Args:
        reviews_app: dict mapping app key -> maximum review count.
        dataset: list of rows, modified in place; row 0 (the header) is skipped.
        index_app: column holding the app key.
        index_review: column holding the review count (parsed with int()).
        duplicate_app: dict updated in place; app key -> number of rows removed.
        unique_app: list updated in place; app keys whose max-review row was kept.

    Rows whose key is missing from reviews_app are left untouched.
    """
    for row in dataset[1:]: # iterate over a copy because rows are removed below
        name = row[index_app]
        count = int(row[index_review])
        if name not in reviews_app:
            continue
        best = reviews_app[name]
        if count > best:
            continue
        if count == best and name not in unique_app:
            # first row carrying the maximum review count: this one is kept
            unique_app.append(name)
            continue
        # either fewer reviews than the maximum, or a repeat of the maximum
        duplicate_app[name] = duplicate_app.get(name, 0) + 1
        dataset.remove(row) # list.remove drops the first row equal to this one

#iOs
# An iOS app is identified by its 'id' (column 0), not by its name, so the same
# key column has to be used both to build the max-review table and to look rows
# up in it. Looking rows up by track_name (column 1) in an id-keyed table never
# matches, which silently turns the duplicate check into a no-op.
reviews_ios = {}
reviews_max(reviews_ios, dataset_ios, 0, 5)
unique_ios = []
duplicate_ios = {}
unique_app(reviews_ios, dataset_ios, 0, 5, duplicate_ios, unique_ios)
count_duplicate_ios = sum(duplicate_ios.values()) # total number of removed rows
print('Total duplicate app of iOs: ',count_duplicate_ios)

#Android
# Android apps are keyed by name (column 0); column 3 holds the review count.
reviews_android = {}
reviews_max(reviews_android, dataset_android, 0, 3)
unique_android = []
duplicate_android = {}
unique_app(reviews_android, dataset_android, 0, 3, duplicate_android, unique_android)
count_duplicate_android = sum(duplicate_android.values()) # total number of removed rows
print('Total duplicate app of Android: ',count_duplicate_android)

print('Updated the number of iOs: ',len(dataset_ios[1:]) )
print('Updated the number of Android: ',len(dataset_android[1:]) )

Total duplicate app of iOs:  0
Total duplicate app of Android:  1181
Updated the number of iOs:  7197
Updated the number of Android:  9659


#### PART 3: Keeping only free English apps
We only focus on apps that are free to download and install and that target an English-speaking audience.
So we will need to do the following:
- remove non-English apps
- remove apps that aren't free

##### Remove non-English apps
An app is treated as non-English when its name has more than 3 non-ASCII characters (characters whose code point is above 127).

note:
<br>**ord(char) function**
: returns an integer representing the Unicode code point of the character, as detailed below
- 0-31 = control code
- 32-47 = punctuation and symbol
- 48-57 = number
- 58-64 = punctuation and symbol
- 65-90 = alphabet uppercase
- 91-96 = punctuation and symbol
- 97-122 = alphabet lowercase
- 123-126 = punctuation and symbol

In [6]:
#function for checking if the app is for english-user or not
def is_english(string):
    """Return True when the text has at most 3 non-ASCII characters.

    A character counts as non-ASCII when its code point is above 127. Allowing
    up to three of them keeps names that merely contain a symbol or an emoji.
    """
    non_ascii_total = sum(1 for symbol in string if ord(symbol) > 127)
    return non_ascii_total <= 3

In [7]:
# iOS: the app name is in column 1
non_eng_ios = []
for row in dataset_ios[1:]: # the slice is a copy, so removing from the original is safe
    if is_english(row[1]):
        continue
    non_eng_ios.append(row[1]) # remember the name of every dropped app
    dataset_ios.remove(row)
print('Total non-English app of iOs: ',len(non_eng_ios))

# Android: the app name is in column 0
non_eng_android = []
for row in dataset_android[1:]:
    if is_english(row[0]):
        continue
    non_eng_android.append(row[0])
    dataset_android.remove(row)
print('Total non-English app of Android: ',len(non_eng_android))

print('Updated the number of iOs: ',len(dataset_ios[1:]) )
print('Updated the number of Android: ',len(dataset_android[1:]) )

Total non-English app of iOs:  1014
Total non-English app of Android:  45
Updated the number of iOs:  6183
Updated the number of Android:  9614


#### Remove apps that aren't free

In [8]:
# iOS: name in column 1, numeric price in column 4
non_free_ios = []
for row in dataset_ios[1:]: # the slice is a copy, so removing from the original is safe
    if float(row[4]) == 0.0: # free app, keep it
        continue
    non_free_ios.append(row[1])
    dataset_ios.remove(row)
print('Total non-free app of iOs: ',len(non_free_ios))

# Android: name in column 0, price string in column 7.
# The 'Price' column is tested rather than the 'Type' column (column 6):
# per the original author's note, one app has a price of 0 while its type
# is not 'Free'.
non_free_android = []
for row in dataset_android[1:]:
    if row[7] == '0': # free app, keep it
        continue
    non_free_android.append(row[0])
    dataset_android.remove(row)
print('Total non-free app of Android: ',len(non_free_android))

print('Updated the number of iOs: ',len(dataset_ios[1:]) )
print('Updated the number of Android: ',len(dataset_android[1:]) )

Total non-free app of iOs:  2961
Total non-free app of Android:  750
Updated the number of iOs:  3222
Updated the number of Android:  8864


## Explore the app data

### 1. The three most common app genres


note:
<br>**sorted() function**
sorted(iterable, key=key, reverse=reverse)
- iterable	: Required. The sequence to sort: list, dictionary, tuple, etc.
- key	: Optional. A function that is called on each element to produce the value used for ordering. Default is None
<br>ex. key = lambda x: x[1] sorts tuples by the element at index 1; if no key is given, the tuples themselves are compared, starting with the element at index 0
- reverse	: Optional. A Boolean. False will sort ascending, True will sort descending. Default is False

In [9]:
#function for finding percentage of each genre
def freq_table(dataset, index):
    """Return the percentage share of each distinct value found in one column.

    Args:
        dataset: list of rows; row 0 (the header) is skipped.
        index: column whose values are tallied.

    Returns:
        dict mapping each value to its share of the data rows, in percent,
        in order of first appearance.
    """
    rows = dataset[1:]
    counts = {} # value -> number of rows carrying it
    for record in rows:
        label = record[index]
        counts[label] = counts.get(label, 0) + 1
    total = len(rows)
    return {label: (count / total) * 100 for label, count in counts.items()}

#function for sorting data
def display_table(dataset, index):
    """Print the three most frequent values of a column with their percentage share.

    Args:
        dataset: list of rows; row 0 is the header.
        index: column passed on to freq_table.
    """
    shares = freq_table(dataset, index)
    # (value, percentage) pairs, largest share first
    ranked = sorted(shares.items(), key=lambda pair: pair[1], reverse=True)
    for label, share in ranked[:3]:
        print(label, ' : ', share)

In [10]:
# iOS: column -5 of the 16-column App Store rows is 'prime_genre'
print("The most three app genre in iOs")
display_table(dataset_ios,-5)

The most three app genre in iOs
Games  :  58.16263190564867
Entertainment  :  7.883302296710118
Photo & Video  :  4.9658597144630665


In [11]:
# Android: column -4 of the 13-column Google Play rows is 'Genres'
print("The most three app genre in Android")
display_table(dataset_android,-4)

The most three app genre in Android
Tools  :  8.449909747292418
Entertainment  :  6.069494584837545
Education  :  5.347472924187725


### 2. Top three most popular app genres

In [12]:
#function for calculating average of ratings of each genre
def popular_table(dataset, ios):
    """Return the average popularity figure per genre.

    Despite the "rating" wording used around this notebook, the value that is
    averaged comes from column 5: for iOS rows that column is read as a plain
    number ('rating_count_tot' in the header printed earlier), for Android
    rows it is the 'Installs' label (e.g. '10,000+') with ',' and '+' removed.

    Args:
        dataset: list of rows; row 0 (the header) is skipped.
        ios: True for App Store rows (genre in column -5), False for
            Google Play rows (genre in column -4).

    Returns:
        dict mapping genre -> mean of the column-5 value over that genre's
        rows, in order of first appearance. Empty when there are no data rows.
    """
    index_genre = -5 if ios else -4
    index_count = 5
    stats = {} # genre -> [number of apps, summed popularity figure]
    for data in dataset[1:]:
        genre = data[index_genre]
        raw = data[index_count]
        if not ios:
            # Android install counts look like '1,000,000+'
            raw = raw.replace(',', '').replace('+', '')
        value = float(raw)
        if genre in stats:
            stats[genre][0] += 1
            stats[genre][1] += value
        else:
            stats[genre] = [1, value]
    # one direct division per genre instead of scanning every genre pair
    return {genre: total / count for genre, (count, total) in stats.items()}

#function for sorting data
def popular_table_sort(dataset,ios):
    """Print the three genres with the highest average from popular_table.

    Args:
        dataset: list of rows; row 0 is the header.
        ios: forwarded to popular_table to select the iOS or Android layout.
    """
    averages = popular_table(dataset, ios)
    # (genre, average) pairs, highest average first
    ranked = sorted(averages.items(), key=lambda pair: pair[1], reverse=True)
    for genre, average in ranked[:3]:
        print(genre, ' : ', average)

In [13]:
# iOS: genres ranked by the average of column 5 ('rating_count_tot'),
# i.e. the average number of ratings per app in the genre
print('the top three popular app genre in iOs:')
popular_table_sort(dataset_ios,ios=True)

the top three popular app genre in iOs:
Navigation  :  86090.33333333333
Reference  :  74942.11111111111
Social Networking  :  71548.34905660378


In [14]:
#function for showing top three app in each genre
def each_app_rank(dataset,string,ios):
    """Print the three apps of one genre with the highest column-5 value.

    Args:
        dataset: list of rows; row 0 (the header) is skipped.
        string: genre to filter on (exact match).
        ios: True for App Store rows (name in column 1, genre in column -5),
            False for Google Play rows (name in column 0, genre in column -4,
            and ',' / '+' stripped from column 5 before conversion).
    """
    # column positions: (app name, genre, popularity figure)
    name_col, genre_col, count_col = (1, -5, 5) if ios else (0, -4, 5)
    matches = []
    for row in dataset[1:]:
        raw = row[count_col]
        if not ios:
            raw = raw.replace(',', '').replace('+', '')
        score = float(raw) # every row is converted, even those of other genres
        if row[genre_col] == string:
            matches.append((row[name_col], score))
    matches.sort(key=lambda pair: pair[1], reverse=True)
    for name, score in matches[:3]:
        print(name, ' : ', score)


In [15]:
# Drill into the iOS 'Navigation' genre: its three apps with the highest rating counts
print('the top three popular app of "Navigation" in iOs:')
each_app_rank(dataset_ios,'Navigation',ios=True)

the top three popular app of "Navigation" in iOs:
Waze - GPS Navigation, Maps & Real-time Traffic  :  345046.0
Google Maps - Navigation & Transit  :  154911.0
Geocaching®  :  12811.0


In [16]:
# Drill into the iOS 'Reference' genre: its three apps with the highest rating counts
print('the top three popular app of "Reference" in iOs:')
each_app_rank(dataset_ios,'Reference',ios=True)

the top three popular app of "Reference" in iOs:
Bible  :  985920.0
Dictionary.com Dictionary & Thesaurus  :  200047.0
Dictionary.com Dictionary & Thesaurus for iPad  :  54175.0


In [17]:
# Drill into the iOS 'Social Networking' genre: its three apps with the highest rating counts
print('the top three popular app of "Social Networking" in iOs:')
each_app_rank(dataset_ios,'Social Networking',ios=True)

the top three popular app of "Social Networking" in iOs:
Facebook  :  2974676.0
Pinterest  :  1061624.0
Skype for iPhone  :  373519.0


In [18]:
# Android: genres ranked by the average of column 5 ('Installs'),
# parsed after stripping ',' and '+' from labels such as '10,000+'
print('the top three popular app genre in Android:')
popular_table_sort(dataset_android,ios=False)

the top three popular app genre in Android:
Communication  :  38456119.167247385
Adventure;Action & Adventure  :  35333333.333333336
Video Players & Editors  :  24947335.796178345


In [19]:
# List the 'Communication' apps in the '1,000,000,000+' install bucket.
# 'Installs' (column 5) is a text label, so the equality test selects exactly
# that bucket. The heading states the same bucket the filter uses; the earlier
# wording ("sold out more than 1 million") did not match it.
print('the popular app of "Communication" in Android with 1,000,000,000+ installs :')
for app in dataset_android[1:]:
    if app[-4] == 'Communication' and (app[5] == '1,000,000,000+'):
        print(app[0], ':', app[5])

the popular app of "Communication" in Android which is sold out more than 1 million :
WhatsApp Messenger : 1,000,000,000+
Messenger – Text and Video Chat for Free : 1,000,000,000+
Google Chrome: Fast & Secure : 1,000,000,000+
Gmail : 1,000,000,000+
Hangouts : 1,000,000,000+
Skype - free IM & video calls : 1,000,000,000+


In [20]:
# List the 'Adventure;Action & Adventure' apps in the '100,000,000+' install
# bucket. 'Installs' (column 5) is a text label, so the equality test selects
# exactly that bucket. The heading states the same bucket the filter uses; the
# earlier wording ("more than 1 hundred thousands") did not match it.
print('the popular app of "Action & Adventure" in Android with 100,000,000+ installs :')
for app in dataset_android[1:]:
    if app[-4] == 'Adventure;Action & Adventure' and (app[5] == '100,000,000+'):
        print(app[0], ':', app[5])

the popular app of "Action & Adventure" in Android which is sold out more than 1 hundred thousands :
ROBLOX : 100,000,000+


In [21]:
# List the 'Video Players & Editors' apps in the '1,000,000,000+' install
# bucket. 'Installs' (column 5) is a text label, so the equality test selects
# exactly that bucket. The heading states the same bucket the filter uses; the
# earlier wording ("sold out more than 1 million") did not match it.
print('the popular app of "Video Players & Editors" in Android with 1,000,000,000+ installs :')
for app in dataset_android[1:]:
    if app[-4] == 'Video Players & Editors' and (app[5] == '1,000,000,000+'):
        print(app[0], ':', app[5])

the popular app of "Video Players & Editors" in Android which is sold out more than 1 million :
YouTube : 1,000,000,000+
Google Play Movies & TV : 1,000,000,000+


# Conclusion

 **Table 1: ranking of the three most common genres (by share of apps)**

|Rank| iOs | Android |
|:---|:----:|---:|
|1 | Games|Tools|
|2|Entertainment|Entertainment|
|3|Photo & Video|Education|

 **Table 2: ranking of the three most popular genres (iOS: average number of ratings per app; Android: average number of installs per app)**

|Rank| iOs | Android |
|:---|:----:|---:|
|1 | Navigation|Communication|
|2|Reference|Adventure;Action & Adventure|
|3|Social Networking|Video Players & Editors|