In [62]:
import os.path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup
from collections import Counter

In [42]:
if not os.path.exists("googleplaystore.csv"):
    print ("Missing dataset file")

df_google=pd.read_csv("googleplaystore.csv")
df_google.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [43]:
if not os.path.exists("AppleStore.csv"):
    print ("Missing dataset file")

df_apple=pd.read_csv("AppleStore.csv")
df_apple.head()

Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic,game_enab
0,281656475,PAC-MAN Premium,100788224,USD,3.99,21292,26,4.0,4.5,6.3.5,4+,Games,38,5,10,1,0
1,281796108,Evernote - stay organized,158578688,USD,0.0,161065,26,4.0,3.5,8.2.2,4+,Productivity,37,5,23,1,0
2,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,USD,0.0,188583,2822,3.5,4.5,5.0.0,4+,Weather,37,5,3,1,0
3,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,USD,0.0,262241,649,4.0,4.5,5.10.0,12+,Shopping,37,5,9,1,0
4,282935706,Bible,92774400,USD,0.0,985920,5320,4.5,5.0,7.5.1,4+,Reference,37,5,45,1,0


In [44]:
def clean_app1(x):
    x = str(x)
    pos = x.find('-')
    if pos == -1:
        return x
    return x[:pos].strip()

def clean_app2(x):
    x = str(x)
    pos = x.find('–')
    if pos == -1:
        return x
    return x[:pos].strip()

def clean_app3(x):
    x = str(x)
    pos = x.find(':')
    if pos == -1:
        return x
    return x[:pos].strip()

df_google['App'] = df_google['App'].map(clean_app1).map(clean_app2).map(clean_app3)
df_apple['track_name'] = df_apple['track_name'].map(clean_app1).map(clean_app2).map(clean_app3)

In [45]:
col_n = ['track_name', 'user_rating', 'prime_genre', 'id']
df_apple = pd.DataFrame(df_apple, columns = col_n)

col_n = ['App', 'Rating', 'Genres']
df_google = pd.DataFrame(df_google, columns = col_n)

df_apple.columns = ['App', 'apple_rating', 'apple_genre', 'id']
df_google.columns = ['App', 'google_rating', 'google_genre']

In [None]:
df_apple.isnull().sum()

In [None]:
df_google.isnull().sum()

In [46]:
df_apple.dropna(axis=0, how='any', inplace=True)
df_google.dropna(axis=0, how='any', inplace=True)

print(df_apple.shape)
print(df_google.shape)

(7197, 4)
(9367, 3)


In [47]:
def google_app2developer(title):
    headers = { "Accept":"text/html,application/xhtml+xml,application/xml;",
            "Accept-Encoding":"gzip",
            "Accept-Language":"zh-CN,zh;q=0.8",
            "Referer":"http://www.example.com/",
            "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36" }

    title = str(title)
    search_url = 'https://play.google.com/store/search?q={0}&c=apps'.format(title)
    searchHtml = requests.get(search_url, headers = headers)
    soup = BeautifulSoup(searchHtml.text, features='html5lib')
    
    try:
        developer = soup.find('a',{'class': 'subtitle'}).string.strip()
        #print (developer)
        return developer
    except:
        #print (search_url)
        return None

df_google['developer'] = df_google['App'].copy()
df_google['developer'] = df_google['developer'].map(google_app2developer)

In [48]:
def apple_app2developer(id):
    headers = { "Accept":"text/html,application/xhtml+xml,application/xml;",
            "Accept-Encoding":"gzip",
            "Accept-Language":"zh-CN,zh;q=0.8",
            "Referer":"http://www.example.com/",
            "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36" }

    search_url = 'https://itunes.apple.com/ie/app/id{0}?mt=8'.format(id)
    searchHtml = requests.get(search_url, headers = headers)
    soup = BeautifulSoup(searchHtml.text, features='html5lib')
    
    try:
        #print ('yes')
        #print(div.find_all('div', {'class': 'ember-view'}))
        div = soup.find('div', {'class': 'animation-wrapper is-visible ember-view'})
        section = div.find_all('div', {'class': 'ember-view'})[1].find('section', {'class': 'l-content-width section section--hero product-hero ember-view'})  
        developer = section.find('header').find_all('h2')[1].find('a').string.strip()
        #print (developer)
        return developer
    except:
        #print (search_url)
        return None

df_apple['id'] = df_apple['id'].map(apple_app2developer)
df_apple.columns = ['App', 'apple_rating', 'apple_genre', 'developer']

In [49]:
df_google.to_csv("df_google.csv",index=False,sep=',')
df_apple.to_csv("df_apple.csv",index=False,sep=',')

In [52]:
df_google.isnull().sum()

App                 0
google_rating       0
google_genre        0
developer        8164
dtype: int64

In [53]:
df_apple.isnull().sum()

App                0
apple_rating       0
apple_genre        0
developer       4853
dtype: int64

In [57]:
df_google = df_google.dropna(how='any')
df_apple = df_apple.dropna(how='any')

In [58]:
df_google.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1203 entries, 0 to 1289
Data columns (total 4 columns):
App              1203 non-null object
google_rating    1203 non-null float64
google_genre     1203 non-null object
developer        1203 non-null object
dtypes: float64(1), object(3)
memory usage: 47.0+ KB


In [59]:
df_apple.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2344 entries, 1 to 11023
Data columns (total 4 columns):
App             2344 non-null object
apple_rating    2344 non-null float64
apple_genre     2344 non-null object
developer       2344 non-null object
dtypes: float64(1), object(3)
memory usage: 91.6+ KB


In [81]:
df_google_grouped = df_google.groupby('developer')
df_google_mean = df_google_grouped['google_rating'].mean()
df_google_num = df_google_grouped['google_rating'].count()

df_google["main_cat"] = (df_google.groupby("developer")["google_genre"]
                         .transform(lambda x: Counter(x).most_common(1)[0][0]))
df_google_category = df_google.groupby(['developer']).head(1)['main_cat'].tolist()
df_google_developer = df_google.groupby(['developer']).head(1)['developer'].tolist()

In [82]:
google_developer=pd.concat([df_google_mean,df_google_num],axis=1)
google_developer.head()

Unnamed: 0_level_0,google_rating,google_rating
developer,Unnamed: 1_level_1,Unnamed: 2_level_1
(주)미디어윌 네트웍스,4.0,1
1&1 Mail & Media Inc,4.2,1
2 Date,4.4,1
"2D, Inc",4.1,3
2RedBeans.com,4.0,1


In [93]:
category_series = pd.Series(df_google_category, index=df_google_developer)
google_developer['category'] = category_series
google_developer.columns = ['average_rating', 'apps_num', 'category']
google_developer.head()

Unnamed: 0_level_0,average_rating,apps_num,category
developer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
(주)미디어윌 네트웍스,4.0,1,Business
1&1 Mail & Media Inc,4.2,1,Communication
2 Date,4.4,1,Dating
"2D, Inc",4.1,3,Entertainment
2RedBeans.com,4.0,1,Dating


In [94]:
google_developer['category'].unique()

array(['Business', 'Communication', 'Dating', 'Entertainment',
       'Education', 'Finance', 'Food & Drink', 'Education;Education',
       'Auto & Vehicles', 'Comics', 'Beauty', 'Books & Reference',
       'Art & Design', 'Events', 'Art & Design;Creativity',
       'Health & Fitness', 'Education;Creativity',
       'Entertainment;Creativity', 'Education;Pretend Play',
       'Education;Action & Adventure', 'Entertainment;Music & Video',
       'Education;Brain Games', 'Education;Music & Video',
       'Entertainment;Brain Games', 'Comics;Creativity',
       'Art & Design;Pretend Play'], dtype=object)

In [98]:
google_developer.groupby(['apps_num']).count()

Unnamed: 0_level_0,average_rating,category
apps_num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,575,575
2,118,118
3,42,42
4,23,23
5,11,11
6,6,6
7,2,2
8,1,1
9,1,1
11,1,1


In [99]:
google_developer.info()

<class 'pandas.core.frame.DataFrame'>
Index: 781 entries, (주)미디어윌 네트웍스 to 잡코리아
Data columns (total 3 columns):
average_rating    781 non-null float64
apps_num          781 non-null int64
category          781 non-null object
dtypes: float64(1), int64(1), object(1)
memory usage: 24.4+ KB


In [100]:
google_developer.isnull().sum()

average_rating    0
apps_num          0
category          0
dtype: int64

In [101]:
df_apple_grouped = df_apple.groupby('developer')
df_apple_mean = df_apple_grouped['apple_rating'].mean()
df_apple_num = df_apple_grouped['apple_rating'].count()

df_apple["main_cat"] = (df_apple.groupby("developer")["apple_genre"]
                         .transform(lambda x: Counter(x).most_common(1)[0][0]))
df_apple_category = df_apple.groupby(['developer']).head(1)['main_cat'].tolist()
df_apple_developer = df_apple.groupby(['developer']).head(1)['developer'].tolist()

In [102]:
apple_developer=pd.concat([df_apple_mean,df_apple_num],axis=1)
apple_developer.head()

Unnamed: 0_level_0,apple_rating,apple_rating
developer,Unnamed: 1_level_1,Unnamed: 2_level_1
10tons Ltd,4.75,2
12 Minute Athlete,4.5,1
1337 & Senri LLC,4.5,1
1492 Studio,4.5,1
"1661, Inc.",4.5,1


In [103]:
category_series = pd.Series(df_apple_category, index=df_apple_developer)
apple_developer['category'] = category_series
apple_developer.columns = ['average_rating', 'apps_num', 'category']
apple_developer.head()

Unnamed: 0_level_0,average_rating,apps_num,category
developer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10tons Ltd,4.75,2,Games
12 Minute Athlete,4.5,1,Health & Fitness
1337 & Senri LLC,4.5,1,Games
1492 Studio,4.5,1,Games
"1661, Inc.",4.5,1,Shopping


In [104]:
apple_developer['category'].unique()

array(['Games', 'Health & Fitness', 'Shopping', 'Finance',
       'Photo & Video', 'Education', 'Sports', 'Utilities', 'Medical',
       'Productivity', 'News', 'Business', 'Navigation', 'Entertainment',
       'Book', 'Lifestyle', 'Weather', 'Travel', 'Food & Drink',
       'Social Networking', 'Music', 'Reference', 'Catalogs'],
      dtype=object)

In [105]:
apple_developer.groupby(['apps_num']).count()

Unnamed: 0_level_0,average_rating,category
apps_num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1270,1270
2,152,152
3,55,55
4,28,28
5,16,16
6,10,10
7,8,8
8,5,5
9,2,2
10,1,1


In [106]:
apple_developer.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1560 entries, 10tons Ltd to 驴妈妈旅游网
Data columns (total 3 columns):
average_rating    1560 non-null float64
apps_num          1560 non-null int64
category          1560 non-null object
dtypes: float64(1), int64(1), object(1)
memory usage: 48.8+ KB


In [107]:
apple_developer.isnull().sum()

average_rating    0
apps_num          0
category          0
dtype: int64