# Android & IOS App Data Set
Project Summary: Finds apps that are popular among users. Specifically, our aim is to help our developers understand what type of apps are likely to attract more users on Google Play and the App Store

In [1]:
from csv import reader
opened_file = open('AppleStore.csv')
read_file = reader(opened_file)
ios = list(read_file)

opened_file = open('googleplaystore.csv')
read_file = reader(opened_file)
android = list(read_file)

def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

explore_data(ios, 0, 3, True)
print("--------------------------------------------------------\n")
explore_data(android, 0,2, True)


['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 7198
Number of columns: 16
--------------------------------------------------------

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


Number of rows: 10842
Number of columns

In [2]:
android[10473] #Error on this row
# So, let's delete this row

del android[10473] #deleted bad row

In [3]:
unique = []
duplicate = []


for rows in android:
    app_name = rows[0]
    rev_count = rows[3]
    if app_name in unique:
        
        duplicate.append(app_name)
    else:
        unique.append(app_name)
if len(duplicate) > 0:
    count = int(len(duplicate))
    print("There are " + str(count) + " duplicate " +" rows in this data set.")
    print("\n Example duplicates: \n")
    print(duplicate[:3])
    print("\n I won't remove the duplicate rows randomly, but rather, will use the latest rating value.")
else: 
    print("There are no duplicate rows in this data set.")


There are 1181 duplicate  rows in this data set.

 Example duplicates: 

['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business']

 I won't remove the duplicate rows randomly, but rather, will use the latest rating value.


In [4]:
reviews_max = {}

for key in android[1:]:
    name = key[0]
    n_reviews = float(key[3])
    
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
    if name not in reviews_max:
        reviews_max[name] = n_reviews

android_clean = []
already_added = []

for row in android[1:]:
    name = row[0]
    n_reviews = float(row[3])
    
    if (reviews_max[name] == n_reviews) and (name not in already_added):
        android_clean.append(row)
        already_added.append(name)

    
explore_data(android_clean, 0,2, True)    
    

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 9659
Number of columns: 13


In [5]:
def non_asci_check(string1):
    count = 0
    for char in string1:
        if ord(char) > 127:
            count +=1
            
    if count > 3:
        return False
            
    else:            
        return True
        
non_asci_check('Instagram')
non_asci_check('爱奇艺PPS -《欢乐颂2》电视剧热播')
non_asci_check('Instachat 😜')

ios_clean = []

for row in ios[1:]:
    ios_clean.append(row)

def asci_filter(data_set):
    clean_set = []
    for row in data_set:
        name = row[0]
        if non_asci_check(name) == True:
            clean_set.append(row)
    return clean_set

def asci_filter_ios(data_set):
    clean_set = []
    for row in data_set:
        name = row[1]
        if non_asci_check(name) == True:
            clean_set.append(row)
    return clean_set

android_clean = asci_filter(android_clean)

ios_clean = asci_filter_ios(ios_clean)

In [6]:
print("ANDROID DATA: ")
explore_data(android_clean, 0, 1, True)
print("\n")
print("IOS DATA: ")
explore_data(ios_clean, 0, 1, True)

ANDROID DATA: 
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


Number of rows: 9614
Number of columns: 13


IOS DATA: 
['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


Number of rows: 6183
Number of columns: 16


In [7]:
free_android = []
for value in android_clean:
    price = value[6]
    if price == 'Free':
        free_android.append(value)
        
free_ios = []
for value in ios_clean:
    price = float(value[4])
    if price <= 0.0:
        free_ios.append(value)

android_clean = free_android        
ios_clean = free_ios

print("ANDROID DATA: ")
explore_data(android_clean, 0, 1, True)
print("\n")
print("IOS DATA: ")
explore_data(ios_clean, 0, 1, True)

ANDROID DATA: 
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


Number of rows: 8863
Number of columns: 13


IOS DATA: 
['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


Number of rows: 3222
Number of columns: 16


As we mentioned in the introduction, our aim is to determine the kinds of apps that are likely to attract more users because our revenue is highly influenced by the number of people using our apps.

To minimize risks and overhead, our validation strategy for an app idea is comprised of three steps:

    Build a minimal Android version of the app, and add it to Google Play.
    If the app has a good response from users, we develop it further.
    If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.



In [8]:
def freq_table(dataset, index):
    freq_list = {}
    count = 0
    for row in dataset:
        count += 1
        column = row[index]
        if column in freq_list:
            freq_list[column] += 1
        else:
            freq_list[column] = 1
            
    table_percent = {}
    for key in freq_list:
        percentage = (freq_list[key]/count)*100
        table_percent[key] = percentage
        
    return table_percent
    
def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])
        
print("ANDROID:")
display_table(android_clean, 1)
display_table(android_clean, 9)
print('\n')
print("IOS:")
display_table(ios_clean, 11)

ANDROID:
FAMILY : 18.898792733837304
GAME : 9.725826469592688
TOOLS : 8.462146000225657
BUSINESS : 4.592124562789123
LIFESTYLE : 3.9038700214374367
PRODUCTIVITY : 3.8925871601038025
FINANCE : 3.7007785174320205
MEDICAL : 3.5315355974275078
SPORTS : 3.396141261423897
PERSONALIZATION : 3.317161232088458
COMMUNICATION : 3.2381812027530184
HEALTH_AND_FITNESS : 3.0802211440821394
PHOTOGRAPHY : 2.944826808078529
NEWS_AND_MAGAZINES : 2.798149610741284
SOCIAL : 2.6627552747376737
TRAVEL_AND_LOCAL : 2.335552296062281
SHOPPING : 2.245289405393208
BOOKS_AND_REFERENCE : 2.1437436533904997
DATING : 1.8616721200496444
VIDEO_PLAYERS : 1.7939749520478394
MAPS_AND_NAVIGATION : 1.399074805370642
FOOD_AND_DRINK : 1.241114746699763
EDUCATION : 1.1621347173643235
ENTERTAINMENT : 0.9590432133589079
LIBRARIES_AND_DEMO : 0.9364774906916393
AUTO_AND_VEHICLES : 0.9251946293580051
HOUSE_AND_HOME : 0.8236488773552973
WEATHER : 0.8010831546880289
EVENTS : 0.7108202640189552
PARENTING : 0.6544059573507841
ART_AND_D

The frequency tables we analyzed on the above cell showed us that the App Store is dominated by apps designed for fun, while Google Play shows a more balanced landscape of both practical and fun apps. 

In [9]:
freq_table(ios_clean, 11)

{'Social Networking': 3.2898820608317814,
 'Photo & Video': 4.9658597144630665,
 'Games': 58.16263190564867,
 'Music': 2.0484171322160147,
 'Reference': 0.5586592178770949,
 'Health & Fitness': 2.0173805090006205,
 'Weather': 0.8690254500310366,
 'Utilities': 2.5139664804469275,
 'Travel': 1.2414649286157666,
 'Shopping': 2.60707635009311,
 'News': 1.3345747982619491,
 'Navigation': 0.186219739292365,
 'Lifestyle': 1.5828677839851024,
 'Entertainment': 7.883302296710118,
 'Food & Drink': 0.8069522036002483,
 'Sports': 2.1415270018621975,
 'Book': 0.4345127250155183,
 'Finance': 1.1173184357541899,
 'Education': 3.662321539416512,
 'Productivity': 1.7380509000620732,
 'Business': 0.5276225946617008,
 'Catalogs': 0.12414649286157665,
 'Medical': 0.186219739292365}

In [10]:
genres_ios = freq_table(ios_clean, 11)
total = 0
len_genre = 0
for genre in genres_ios:
    total += 1
    for app in ios_clean:
        genre_app = app[11]
    
        if genre_app == genre:
            len_genre += 1
            num_ratings = float(app[5])
            total += num_ratings
    avg_rating = total/len_genre
    print(genre, ':', avg_rating)
    
        

Social Networking : 71548.35849056604
Photo & Video : 45619.45112781955
Games : 25626.514953271027
Music : 26574.929283771533
Reference : 26966.39073741007
Health & Fitness : 26862.22149410223
Weather : 27169.38411739318
Utilities : 26882.779399499585
Travel : 26905.109926168992
Shopping : 26905.5959555908
News : 26810.752046783626
Navigation : 26949.094515752626
Lifestyle : 26745.574752097637
Entertainment : 25622.557371349096
Food & Drink : 25691.646450723638
Sports : 25629.34129922585
Book : 25695.60904522613
Finance : 25764.395895398873
Education : 25059.16247212488
Productivity : 24988.514241001565
Business : 24895.90691158157
Catalogs : 24869.92226368159
Medical : 24824.749534450653


In [11]:
android_category = freq_table(android_clean, 1)

for category in android_category:
    total = 0
    len_category = 0
    for app in android_clean:
        category_app = app[1]
        if category_app == category:
            install_num = app[5]
            install_num = install_num.replace('+', '')
            install_num = install_num.replace(',', '')
            install_num = float(install_num)
            total += install_num
            len_category += 1
    avg_installs = total/len_category
    print(category_app,  ' : ', avg_installs)
        

LIFESTYLE  :  1986335.0877192982
LIFESTYLE  :  647317.8170731707
LIFESTYLE  :  513151.88679245283
LIFESTYLE  :  8767811.894736841
LIFESTYLE  :  1712290.1474201474
LIFESTYLE  :  817657.2727272727
LIFESTYLE  :  38456119.167247385
LIFESTYLE  :  854028.8303030303
LIFESTYLE  :  1833495.145631068
LIFESTYLE  :  11640705.88235294
LIFESTYLE  :  253542.22222222222
LIFESTYLE  :  1387692.475609756
LIFESTYLE  :  1924897.7363636363
LIFESTYLE  :  4188821.9853479853
LIFESTYLE  :  1331540.5616438356
LIFESTYLE  :  638503.734939759
LIFESTYLE  :  1437816.2687861272
LIFESTYLE  :  15588015.603248259
LIFESTYLE  :  3697848.1731343283
LIFESTYLE  :  120550.61980830671
LIFESTYLE  :  23253652.127118643
LIFESTYLE  :  7036877.311557789
LIFESTYLE  :  17840110.40229885
LIFESTYLE  :  3638640.1428571427
LIFESTYLE  :  13984077.710144928
LIFESTYLE  :  10801391.298666667
LIFESTYLE  :  5201482.6122448975
LIFESTYLE  :  16787331.344927534
LIFESTYLE  :  542603.6206896552
LIFESTYLE  :  5074486.197183099
LIFESTYLE  :  24727872.

n this project, we analyzed data about the App Store and Google Play mobile apps with the goal of recommending an app profile that can be profitable for both markets.

We concluded that taking a popular book (perhaps a more recent book) and turning it into an app could be profitable for both the Google Play and the App Store markets. The markets are already full of libraries, so we need to add some special features besides the raw version of the book. This might include daily quotes from the book, an audio version of the book, quizzes on the book, a forum where people can discuss the book, etc.