#  Title
## Subtitle

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
apple_store = pd.read_csv("AppleStore.csv")
google_store = pd.read_csv("googleplaystore.csv")

## Apple overview

In [3]:
print(apple_store.head())

          id               track_name  size_bytes currency  price  \
0  284882215                 Facebook   389879808      USD    0.0   
1  389801252                Instagram   113954816      USD    0.0   
2  529479190           Clash of Clans   116476928      USD    0.0   
3  420009108               Temple Run    65921024      USD    0.0   
4  284035177  Pandora - Music & Radio   130242560      USD    0.0   

   rating_count_tot  rating_count_ver  user_rating  user_rating_ver      ver  \
0           2974676               212          3.5              3.5     95.0   
1           2161558              1289          4.5              4.0    10.23   
2           2130805               579          4.5              4.5  9.24.12   
3           1724546              3842          4.5              4.0    1.6.2   
4           1126879              3594          4.0              4.5    8.4.1   

  cont_rating        prime_genre  sup_devices.num  ipadSc_urls.num  lang.num  \
0          4+  Social Ne

In [4]:
print(apple_store.describe())

                 id    size_bytes        price  rating_count_tot  \
count  7.197000e+03  7.197000e+03  7197.000000      7.197000e+03   
mean   8.631310e+08  1.991345e+08     1.726218      1.289291e+04   
std    2.712368e+08  3.592069e+08     5.833006      7.573941e+04   
min    2.816565e+08  5.898240e+05     0.000000      0.000000e+00   
25%    6.000937e+08  4.692275e+07     0.000000      2.800000e+01   
50%    9.781482e+08  9.715302e+07     0.000000      3.000000e+02   
75%    1.082310e+09  1.819249e+08     1.990000      2.793000e+03   
max    1.188376e+09  4.025970e+09   299.990000      2.974676e+06   

       rating_count_ver  user_rating  user_rating_ver  sup_devices.num  \
count       7197.000000  7197.000000      7197.000000      7197.000000   
mean         460.373906     3.526956         3.253578        37.361817   
std         3920.455183     1.517948         1.809363         3.737715   
min            0.000000     0.000000         0.000000         9.000000   
25%            1.

## Google overview

In [5]:
print(google_store.head())

                                                 App        Category  Rating  \
0     Photo Editor & Candy Camera & Grid & ScrapBook  ART_AND_DESIGN     4.1   
1                                Coloring book moana  ART_AND_DESIGN     3.9   
2  U Launcher Lite – FREE Live Cool Themes, Hide ...  ART_AND_DESIGN     4.7   
3                              Sketch - Draw & Paint  ART_AND_DESIGN     4.5   
4              Pixel Draw - Number Art Coloring Book  ART_AND_DESIGN     4.3   

  Reviews  Size     Installs  Type Price Content Rating  \
0     159   19M      10,000+  Free     0       Everyone   
1     967   14M     500,000+  Free     0       Everyone   
2   87510  8.7M   5,000,000+  Free     0       Everyone   
3  215644   25M  50,000,000+  Free     0           Teen   
4     967  2.8M     100,000+  Free     0       Everyone   

                      Genres      Last Updated         Current Ver  \
0               Art & Design   January 7, 2018               1.0.0   
1  Art & Design;Pretend 

In [6]:
print(google_store.describe())

            Rating
count  9367.000000
mean      4.193338
std       0.537431
min       1.000000
25%       4.000000
50%       4.300000
75%       4.500000
max      19.000000


## Comparing columns

In [7]:
print(apple_store.columns)

Index(['id', 'track_name', 'size_bytes', 'currency', 'price',
       'rating_count_tot', 'rating_count_ver', 'user_rating',
       'user_rating_ver', 'ver', 'cont_rating', 'prime_genre',
       'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'],
      dtype='object')


In [8]:
print(google_store.columns)

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')


The analysis is going to focus on free apps, and the end of the cleaning process is going to be a merge of these two datasets. So based on the column analysis these are the key <span style="color:green">**properties to keep**</span> in order to have a consistent merged file:
- Origin (Apple or Google)
- App Name
- App Size (bytes)
- Count of total Ratings
- Ratings (total)
- Genre/Prime Genre
- Price

Here's the explanation of why other properties will be <span style="color:red">**dropped**</span>:
- **Category** - That is only present in Google Play Store and its a broader description than Genres, so I keep the last one to be more specific.
- **Installs** - It would be lovely to have this data from Apple Store, but as I only have it for Google, we'll count the users through the ratings which are present in both.
- **Related to Versions** - Apple is more specific in splitting data from previous versions, to current version. But as our objective is to merge, we will just use the Total property.
- **Related to Devices/Softwares** - Although both relatively have info about it, the conclusions we aim are not related to anything technical, so this will be discarted.

## Cleaning Apple Store data

In [9]:
apple_crop = apple_store.copy()[["id", "track_name", "prime_genre", "size_bytes", "price", "rating_count_tot", "user_rating"]]
apple_crop.head()

Unnamed: 0,id,track_name,prime_genre,size_bytes,price,rating_count_tot,user_rating
0,284882215,Facebook,Social Networking,389879808,0.0,2974676,3.5
1,389801252,Instagram,Photo & Video,113954816,0.0,2161558,4.5
2,529479190,Clash of Clans,Games,116476928,0.0,2130805,4.5
3,420009108,Temple Run,Games,65921024,0.0,1724546,4.5
4,284035177,Pandora - Music & Radio,Music,130242560,0.0,1126879,4.0


In [10]:
apple_crop.columns = ["id", "app_name", "genre", "size_bytes", "price", "rating_count", "rating"]
apple_crop.head()

Unnamed: 0,id,app_name,genre,size_bytes,price,rating_count,rating
0,284882215,Facebook,Social Networking,389879808,0.0,2974676,3.5
1,389801252,Instagram,Photo & Video,113954816,0.0,2161558,4.5
2,529479190,Clash of Clans,Games,116476928,0.0,2130805,4.5
3,420009108,Temple Run,Games,65921024,0.0,1724546,4.5
4,284035177,Pandora - Music & Radio,Music,130242560,0.0,1126879,4.0


In [11]:
apple_crop["price"].value_counts()

0.00      4056
0.99       728
2.99       683
1.99       621
4.99       394
3.99       277
6.99       166
9.99        81
5.99        52
7.99        33
14.99       21
19.99       13
8.99         9
24.99        8
11.99        6
29.99        6
13.99        6
12.99        5
15.99        4
17.99        3
59.99        3
27.99        2
23.99        2
22.99        2
20.99        2
49.99        2
16.99        2
39.99        2
249.99       1
74.99        1
18.99        1
99.99        1
21.99        1
34.99        1
299.99       1
47.99        1
Name: price, dtype: int64

In [12]:
apple_free = apple_crop.copy()[apple_crop['price'] == 0]
print(apple_free.shape)
apple_free.head()

(4056, 7)


Unnamed: 0,id,app_name,genre,size_bytes,price,rating_count,rating
0,284882215,Facebook,Social Networking,389879808,0.0,2974676,3.5
1,389801252,Instagram,Photo & Video,113954816,0.0,2161558,4.5
2,529479190,Clash of Clans,Games,116476928,0.0,2130805,4.5
3,420009108,Temple Run,Games,65921024,0.0,1724546,4.5
4,284035177,Pandora - Music & Radio,Music,130242560,0.0,1126879,4.0


In [13]:
apple_free.describe()

Unnamed: 0,id,size_bytes,price,rating_count,rating
count,4056.0,4056.0,4056.0,4056.0,4056.0
mean,899164700.0,147935700.0,0.0,19749.8,3.376726
std,258370900.0,208901400.0,0.0,97744.28,1.644807
min,281796100.0,767126.0,0.0,0.0,0.0
25%,700890800.0,54041340.0,0.0,22.0,3.0
50%,1014135000.0,99600380.0,0.0,466.0,4.0
75%,1091131000.0,161198600.0,0.0,5450.75,4.5
max,1188376000.0,3148421000.0,0.0,2974676.0,5.0


In [14]:
apple_free.isna().describe()

Unnamed: 0,id,app_name,genre,size_bytes,price,rating_count,rating
count,4056,4056,4056,4056,4056,4056,4056
unique,1,1,1,1,1,1,1
top,False,False,False,False,False,False,False
freq,4056,4056,4056,4056,4056,4056,4056


In [15]:
apple_free.duplicated("app_name", keep=False).value_counts()

False    4052
True        4
dtype: int64

In [16]:
apple_free[apple_free.duplicated("app_name", keep=False)]

Unnamed: 0,id,app_name,genre,size_bytes,price,rating_count,rating
2948,1173990889,Mannequin Challenge,Games,109705216,0.0,668,3.0
4442,952877179,VR Roller Coaster,Games,169523200,0.0,107,3.5
4463,1178454060,Mannequin Challenge,Games,59572224,0.0,105,4.0
4831,1089824278,VR Roller Coaster,Games,240964608,0.0,67,3.5


In [17]:
apple_free.sort_values("rating_count", ascending=False, inplace=True)
apple_unique = apple_free.drop_duplicates(subset="app_name", keep="first", inplace=False, ignore_index=True)
apple_unique.duplicated("app_name", keep=False).value_counts()

False    4054
dtype: int64

In [18]:
apple_unique[apple_unique['app_name'] == "VR Roller Coaster"]

Unnamed: 0,id,app_name,genre,size_bytes,price,rating_count,rating
2625,952877179,VR Roller Coaster,Games,169523200,0.0,107,3.5


In [19]:
apple_unique.head()

Unnamed: 0,id,app_name,genre,size_bytes,price,rating_count,rating
0,284882215,Facebook,Social Networking,389879808,0.0,2974676,3.5
1,389801252,Instagram,Photo & Video,113954816,0.0,2161558,4.5
2,529479190,Clash of Clans,Games,116476928,0.0,2130805,4.5
3,420009108,Temple Run,Games,65921024,0.0,1724546,4.5
4,284035177,Pandora - Music & Radio,Music,130242560,0.0,1126879,4.0


In [55]:
apple_final = apple_unique.copy()[["app_name", "genre", "size_bytes", "rating_count", "rating"]]
apple_final.head()

Unnamed: 0,app_name,genre,size_bytes,rating_count,rating
0,Facebook,Social Networking,389879808,2974676,3.5
1,Instagram,Photo & Video,113954816,2161558,4.5
2,Clash of Clans,Games,116476928,2130805,4.5
3,Temple Run,Games,65921024,1724546,4.5
4,Pandora - Music & Radio,Music,130242560,1126879,4.0


In [56]:
apple_final['genre'].unique()

array(['Social Networking', 'Photo & Video', 'Games', 'Music',
       'Reference', 'Health & Fitness', 'Weather', 'Utilities', 'Travel',
       'Shopping', 'News', 'Navigation', 'Lifestyle', 'Entertainment',
       'Food & Drink', 'Sports', 'Book', 'Finance', 'Education',
       'Productivity', 'Business', 'Catalogs', 'Medical'], dtype=object)

## Cleaning Google Play Store data

In [21]:
google_crop = google_store.copy()[["App", "Genres", "Size", "Price", "Reviews", "Rating"]]
google_crop.head()

Unnamed: 0,App,Genres,Size,Price,Reviews,Rating
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,19M,0,159,4.1
1,Coloring book moana,Art & Design;Pretend Play,14M,0,967,3.9
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",Art & Design,8.7M,0,87510,4.7
3,Sketch - Draw & Paint,Art & Design,25M,0,215644,4.5
4,Pixel Draw - Number Art Coloring Book,Art & Design;Creativity,2.8M,0,967,4.3


In [22]:
google_crop.columns = ["app_name", "genre", "size_bytes", "price", "rating_count", "rating"]
google_crop.head()

Unnamed: 0,app_name,genre,size_bytes,price,rating_count,rating
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,19M,0,159,4.1
1,Coloring book moana,Art & Design;Pretend Play,14M,0,967,3.9
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",Art & Design,8.7M,0,87510,4.7
3,Sketch - Draw & Paint,Art & Design,25M,0,215644,4.5
4,Pixel Draw - Number Art Coloring Book,Art & Design;Creativity,2.8M,0,967,4.3


In [23]:
google_crop['price'].value_counts()

0         10040
$0.99       148
$2.99       129
$1.99        73
$4.99        72
          ...  
$1.75         1
$14.00        1
$4.85         1
$46.99        1
$1.04         1
Name: price, Length: 93, dtype: int64

In [24]:
try:
    google_crop['price'] = google_crop['price'].str.replace('$', '', regex=False).astype(float)
except:
    print("There's some price value which is not a price")

There's some price value which is not a price


In [25]:
google_crop.loc[~google_crop['price'].str.startswith("$"), 'price'].value_counts()

0           10040
Everyone        1
Name: price, dtype: int64

In [26]:
google_crop[google_crop['price'] == 'Everyone']

Unnamed: 0,app_name,genre,size_bytes,price,rating_count,rating
10472,Life Made WI-Fi Touchscreen Photo Frame,"February 11, 2018","1,000+",Everyone,3.0M,19.0


In [27]:
google_store.iloc[10472]

App               Life Made WI-Fi Touchscreen Photo Frame
Category                                              1.9
Rating                                               19.0
Reviews                                              3.0M
Size                                               1,000+
Installs                                             Free
Type                                                    0
Price                                            Everyone
Content Rating                                        NaN
Genres                                  February 11, 2018
Last Updated                                       1.0.19
Current Ver                                    4.0 and up
Android Ver                                           NaN
Name: 10472, dtype: object

In [28]:
google_crop.dtypes

app_name         object
genre            object
size_bytes       object
price            object
rating_count     object
rating          float64
dtype: object

Add photo of Google Play Store

In [29]:
google_crop.loc[10472, "genre"] = "Lifestyle"
google_crop.loc[10472, "size_bytes"] = "3.0M"
google_crop.loc[10472, "price"] = "0"
google_crop.loc[10472, "rating_count"] = "19"
google_crop.loc[10472, "rating"] = 1.9
google_crop.loc[10472]

app_name        Life Made WI-Fi Touchscreen Photo Frame
genre                                         Lifestyle
size_bytes                                         3.0M
price                                                 0
rating_count                                         19
rating                                              1.9
Name: 10472, dtype: object

In [30]:
try:
    google_crop['price'] = google_crop['price'].str.replace('$', '', regex=False).astype(float)
    print("Removal of '$' succeded.")
except:
    print("There's some price value which is not a price")

Removal of '$' succeded.


In [31]:
google_crop.dtypes

app_name         object
genre            object
size_bytes       object
price           float64
rating_count     object
rating          float64
dtype: object

In [32]:
google_free = google_crop.copy()[google_crop['price'] == 0]
google_free.isna().describe()

Unnamed: 0,app_name,genre,size_bytes,price,rating_count,rating
count,10041,10041,10041,10041,10041,10041
unique,1,1,1,1,1,2
top,False,False,False,False,False,False
freq,10041,10041,10041,10041,10041,8720


In [33]:
google_free['rating'].isna().value_counts()

False    8720
True     1321
Name: rating, dtype: int64

In [34]:
google_free['rating_count'] = google_free['rating_count'].astype(int)
google_free.loc[google_free['rating'].isna(), "rating_count"].value_counts(normalize=False, dropna=False).sort_index()

0       520
1       183
2       118
3        85
4        56
       ... 
1317      1
1330      1
2221      1
2536      1
3248      1
Name: rating_count, Length: 98, dtype: int64

In [35]:
google_free[(google_free['rating'].isna()) & (google_free['rating_count'] != 0)].head()

Unnamed: 0,app_name,genre,size_bytes,price,rating_count,rating
23,Mcqueen Coloring pages,Art & Design;Action & Adventure,7.0M,0.0,61,
113,Wrinkles and rejuvenation,Beauty,5.7M,0.0,182,
123,Manicure - nail design,Beauty,3.7M,0.0,119,
126,Skin Care and Natural Beauty,Beauty,7.4M,0.0,654,
129,"Secrets of beauty, youth and health",Beauty,2.9M,0.0,77,


In [36]:
google_free.loc[(google_free['rating'].isna()) & (google_free['rating_count'] != 0), "genre"].value_counts()

Business                           77
Entertainment                      63
Tools                              59
Education                          47
Personalization                    41
Sports                             41
Medical                            35
Lifestyle                          35
News & Magazines                   34
Productivity                       29
Finance                            28
Communication                      28
Social                             24
Books & Reference                  20
Dating                             19
Health & Fitness                   18
Libraries & Demo                   17
Shopping                           16
Travel & Local                     16
Puzzle                             15
Events                             13
Video Players & Editors            11
Photography                        10
Beauty                              9
Food & Drink                        9
Maps & Navigation                   9
Trivia      

In [37]:
google_free.duplicated("app_name", keep=False).value_counts()

False    8150
True     1891
dtype: int64

In [38]:
google_free[google_free.duplicated("app_name", keep=False)].sort_values("app_name").head(20)

Unnamed: 0,app_name,genre,size_bytes,price,rating_count,rating
1393,10 Best Foods for You,Health & Fitness,3.8M,0.0,2490,4.0
1407,10 Best Foods for You,Health & Fitness,3.8M,0.0,2490,4.0
2543,1800 Contacts - Lens Store,Medical,26M,0.0,23160,4.7
2322,1800 Contacts - Lens Store,Medical,26M,0.0,23160,4.7
1434,21-Day Meditation Experience,Health & Fitness,15M,0.0,11506,4.4
1337,21-Day Meditation Experience,Health & Fitness,15M,0.0,11506,4.4
3083,365Scores - Live Scores,Sports,25M,0.0,666521,4.6
5415,365Scores - Live Scores,Sports,25M,0.0,666246,4.6
2522,420 BZ Budeze Delivery,Medical,11M,0.0,2,5.0
7035,420 BZ Budeze Delivery,Medical,11M,0.0,2,5.0


In [39]:
google_free.sort_values("rating_count", ascending=False, inplace=True)
google_unique = google_free.drop_duplicates(subset="app_name", keep="first", inplace=False, ignore_index=True)
google_unique.duplicated("app_name", keep=False).value_counts()

False    8906
dtype: int64

In [40]:
google_unique['rating'].isna().value_counts()

False    7595
True     1311
Name: rating, dtype: int64

In [41]:
google_unique[(google_unique['rating'].isna()) & (google_unique['rating_count'] != 0)].head()

Unnamed: 0,app_name,genre,size_bytes,price,rating_count,rating
3924,We learn children's verses in kindergarten,Parenting;Education,6.5M,0.0,3248,
4063,Adivina el Emoji,Trivia,32M,0.0,2536,
4152,Young Speeches,Libraries & Demo,2.4M,0.0,2221,
4461,【Ranobbe complete free】 Novelba - Free app tha...,Comics,22M,0.0,1330,
4470,Em Fuga Brasil,Simulation,60M,0.0,1317,


## Before moving on with Google Play Store cleaning, we need to fix the genres column which in some instances show two genres

In [42]:
google_unique['genre'].value_counts()

Tools                                 750
Entertainment                         542
Education                             480
Business                              408
Lifestyle                             350
                                     ... 
Role Playing;Brain Games                1
Video Players & Editors;Creativity      1
Adventure;Education                     1
Card;Action & Adventure                 1
Trivia;Education                        1
Name: genre, Length: 114, dtype: int64

In [44]:
google_unique.head()

Unnamed: 0,app_name,genre,size_bytes,price,rating_count,rating
0,Facebook,Social,Varies with device,0.0,78158306,4.1
1,WhatsApp Messenger,Communication,Varies with device,0.0,69119316,4.4
2,Instagram,Social,Varies with device,0.0,66577446,4.5
3,Messenger – Text and Video Chat for Free,Communication,Varies with device,0.0,56646578,4.0
4,Clash of Clans,Strategy,98M,0.0,44893888,4.6


In [48]:
google_unique['genre_1'] = google_unique['genre'].str.split(pat=';').str[0]
google_unique['genre_2'] = google_unique['genre'].str.split(pat=';').str[1]
google_unique.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,app_name,genre,size_bytes,price,rating_count,rating,genre_1,genre_2
0,Facebook,Social,Varies with device,0.0,78158306,4.1,Social,
1,WhatsApp Messenger,Communication,Varies with device,0.0,69119316,4.4,Communication,
2,Instagram,Social,Varies with device,0.0,66577446,4.5,Social,
3,Messenger – Text and Video Chat for Free,Communication,Varies with device,0.0,56646578,4.0,Communication,
4,Clash of Clans,Strategy,98M,0.0,44893888,4.6,Strategy,


In [50]:
google_unique[~google_unique['genre_2'].isna()]

Unnamed: 0,app_name,genre,size_bytes,price,rating_count,rating,genre_1,genre_2
27,Minion Rush: Despicable Me Official Game,Casual;Action & Adventure,Varies with device,0.0,10216997,4.5,Casual,Action & Adventure
56,Duolingo: Learn Languages Free,Education;Education,Varies with device,0.0,6297590,4.7,Education,Education
99,ROBLOX,Adventure;Action & Adventure,67M,0.0,4450890,4.5,Adventure,Action & Adventure
225,Frozen Free Fall,Puzzle;Action & Adventure,37M,0.0,1574546,4.3,Puzzle,Action & Adventure
247,Madden NFL Football,Sports;Action & Adventure,Varies with device,0.0,1455952,4.5,Sports,Action & Adventure
...,...,...,...,...,...,...,...,...
7378,Bu Hangi Firma?,Trivia;Education,26M,0.0,8,,Trivia,Education
7998,EF Spelling Bee,Education;Education,9.4M,0.0,2,3.0,Education,Education
8340,Wowkwis aq Ka'qaquj,Education;Education,49M,0.0,1,5.0,Education,Education
8436,Ag Across America,Educational;Education,15M,0.0,0,,Educational,Education


In [51]:
google_unique.loc[~google_unique['genre_2'].isna(), 'genre_1'].unique()

array(['Casual', 'Education', 'Adventure', 'Puzzle', 'Sports', 'Racing',
       'Educational', 'Card', 'Simulation', 'Role Playing', 'Action',
       'Arcade', 'Entertainment', 'Tools', 'Video Players & Editors',
       'Lifestyle', 'Strategy', 'Board', 'Music', 'Health & Fitness',
       'Art & Design', 'Parenting', 'Communication', 'Music & Audio',
       'Travel & Local', 'Comics', 'Books & Reference', 'Trivia'],
      dtype=object)

In [52]:
google_unique.loc[google_unique['genre_1'] == 'Educational', 'genre_1'] = 'Education'
google_unique.loc[google_unique['genre_2'] == 'Educational', 'genre_2'] = 'Education'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [53]:
google_unique.loc[google_unique['genre_1'] == google_unique['genre_2'], 'genre_2'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [64]:
google_unique[~google_unique['genre_2'].isna()]

Unnamed: 0,app_name,genre,size_bytes,price,rating_count,rating,genre_1,genre_2
27,Minion Rush: Despicable Me Official Game,Casual;Action & Adventure,Varies with device,0.0,10216997,4.5,Casual,Action & Adventure
99,ROBLOX,Adventure;Action & Adventure,67M,0.0,4450890,4.5,Adventure,Action & Adventure
225,Frozen Free Fall,Puzzle;Action & Adventure,37M,0.0,1574546,4.3,Puzzle,Action & Adventure
247,Madden NFL Football,Sports;Action & Adventure,Varies with device,0.0,1455952,4.5,Sports,Action & Adventure
261,Where's My Water? Free,Puzzle;Brain Games,57M,0.0,1372013,4.4,Puzzle,Brain Games
...,...,...,...,...,...,...,...,...
6498,Children's Stories 2018 - The Lion Come Lion,Parenting;Music & Video,18M,0.0,39,4.4,Parenting,Music & Video
6640,Learn the letters and words,Parenting;Education,85M,0.0,31,,Parenting,Education
7054,Tricky Bike Stunt Rider DX,Simulation;Education,45M,0.0,16,4.4,Simulation,Education
7378,Bu Hangi Firma?,Trivia;Education,26M,0.0,8,,Trivia,Education


In [61]:
google_unique.dtypes

app_name         object
genre            object
size_bytes       object
price           float64
rating_count      int32
rating          float64
genre_1          object
genre_2          object
dtype: object

In [65]:
google_unique['size_bytes'].value_counts()

Varies with device    1157
13M                    171
12M                    170
11M                    170
14M                    162
                      ... 
454k                     1
153k                     1
785k                     1
647k                     1
904k                     1
Name: size_bytes, Length: 408, dtype: int64

Ok we are going to have to drop size_bytes -.-

In [71]:
google_final = google_unique[['app_name', 'genre_1', 'genre_2', 'rating_count', 'rating']]

## Preparing for merge

In [72]:
apple_final.head()

Unnamed: 0,app_name,genre,size_bytes,rating_count,rating
0,Facebook,Social Networking,389879808,2974676,3.5
1,Instagram,Photo & Video,113954816,2161558,4.5
2,Clash of Clans,Games,116476928,2130805,4.5
3,Temple Run,Games,65921024,1724546,4.5
4,Pandora - Music & Radio,Music,130242560,1126879,4.0


In [80]:
google_final.head()

Unnamed: 0,app_name,genre_1,genre_2,rating_count,rating
0,Facebook,Social,,78158306,4.1
1,WhatsApp Messenger,Communication,,69119316,4.4
2,Instagram,Social,,66577446,4.5
3,Messenger – Text and Video Chat for Free,Communication,,56646578,4.0
4,Clash of Clans,Strategy,,44893888,4.6


In [83]:
apple_final['genre_2'] = np.nan
apple_final.rename(mapper={'genre': 'genre_1'}, axis=1, inplace=True)
apple_final = apple_final[['app_name', 'genre_1', 'genre_2', 'rating_count', 'rating']]
apple_final.head()

Unnamed: 0,app_name,genre_1,genre_2,rating_count,rating
0,Facebook,Social Networking,,2974676,3.5
1,Instagram,Photo & Video,,2161558,4.5
2,Clash of Clans,Games,,2130805,4.5
3,Temple Run,Games,,1724546,4.5
4,Pandora - Music & Radio,Music,,1126879,4.0


In [85]:
apple_final['genre_1'].unique()

array(['Social Networking', 'Photo & Video', 'Games', 'Music',
       'Reference', 'Health & Fitness', 'Weather', 'Utilities', 'Travel',
       'Shopping', 'News', 'Navigation', 'Lifestyle', 'Entertainment',
       'Food & Drink', 'Sports', 'Book', 'Finance', 'Education',
       'Productivity', 'Business', 'Catalogs', 'Medical'], dtype=object)

In [87]:
google_final['genre_1'].unique()

array(['Social', 'Communication', 'Strategy', 'Tools', 'Arcade',
       'Video Players & Editors', 'Casual', 'Sports', 'News & Magazines',
       'Action', 'Photography', 'Adventure', 'Travel & Local', 'Racing',
       'Personalization', 'Maps & Navigation', 'Entertainment', 'Trivia',
       'Education', 'Shopping', 'Productivity', 'Health & Fitness',
       'Simulation', 'Books & Reference', 'Lifestyle', 'Role Playing',
       'Weather', 'Puzzle', 'Casino', 'Card', 'Word', 'Board', 'Finance',
       'Business', 'Food & Drink', 'Comics', 'Music', 'Parenting',
       'Dating', 'House & Home', 'Libraries & Demo', 'Art & Design',
       'Auto & Vehicles', 'Medical', 'Beauty', 'Events', 'Music & Audio'],
      dtype=object)

In [158]:
genres_map = {
    'Social Networking':['Social', 'Communication', 'Dating'],
    'Photo & Video':['Video Players & Editors', 'Photography', 'Art & Design', 'Beauty'],
    'Games':['Strategy', 'Arcade', 'Action', 'Adventure', 'Racing', 'Trivia', 'Simulation', 'Role Playing', 'Puzzle', 'Casino', 'Card', 'Word', 'Board', 'Casual'],
    'Music':['Music', 'Music & Audio'],
    'Reference':['Libraries & Demo'],
    'Health & Fitness':['Health & Fitness'],
    'Weather':['Weather'],
    'Utilities':['Tools', 'Parenting', 'Auto & Vehicles'],
    'Travel':['Travel & Local'],
    'Shopping':['Shopping'],
    'News':['News & Magazines'],
    'Navigation':['Maps & Navigation'],
    'Lifestyle':['Lifestyle'],
    'Entertainment':['Entertainment', 'Personalization', 'Events'],
    'Food & Drink':['Food & Drink'],
    'Sports':['Sports'],
    'Book':['Books & Reference', 'Comics'],
    'Finance':['Finance'],
    'Education':['Education'],
    'Productivity':['Productivity'],
    'Business':['Business'],
    'Catalogs':['House & Home'],
    'Medical':['Medical']
}


In [159]:
google_final[google_final['genre_1'] == 'Events'].head(10)

Unnamed: 0,app_name,genre_1,genre_2,rating_count,rating
2117,Ticketmaster Event Tickets,Events,,40113,4.0
2484,"StubHub - Tickets to Sports, Concerts & Events",Events,,26089,4.0
2683,Fever,Events,,20611,4.0
2886,"SeatGeek – Tickets to Sports, Concerts, Broadway",Events,,15558,4.4
3265,"Gametime - Tickets to Sports, Concerts, Theater",Events,,8800,4.5
3310,Vivid Seats – Event Tickets,Events,,8232,4.6
3415,Reminder,Events,,7074,4.5
3554,vide-greniers.org,Events,,5839,4.3
3730,KudaGo - things to do in NY,Events,,4298,4.4
3796,LBB - Find New & Unique Things To Do Around You,Events,,3874,4.6


In [160]:
apple_final[apple_final['app_name'].str.contains(pat='tickets', case=False, regex=False)].head(20)

Unnamed: 0,app_name,genre_1,genre_2,rating_count,rating
54,Fandango Movies - Times + Tickets,Entertainment,,291787,4.0
832,"StubHub - Tickets to Sports, Concerts and Theatre",Entertainment,,9011,3.5
971,"SeatGeek – Tickets to Sports, Concerts & Broadway",Entertainment,,6088,4.5
1017,"Ticketmaster - Tickets for Concerts, Sports, S...",Entertainment,,5356,3.0
1123,Atom – Movie Tickets and Showtimes,Entertainment,,4105,5.0
2267,Voyages-sncf.com : book train and bus tickets,Travel,,268,3.5
2293,"Trainline UK: Live Train Times, Tickets & Planner",Travel,,248,4.0
3820,skyticket - Reserve Best Valued Air Tickets,Travel,,0,0.0


In [161]:
apple_final[apple_final['genre_1'] == 'Reference'].head(20)

Unnamed: 0,app_name,genre_1,genre_2,rating_count,rating
6,Bible,Reference,,985920,4.5
80,Dictionary.com Dictionary & Thesaurus,Reference,,200047,4.0
304,Dictionary.com Dictionary & Thesaurus for iPad,Reference,,54175,4.5
474,Google Translate,Reference,,26786,3.5
597,"Muslim Pro: Ramadan 2017 Prayer Times, Azan, Q...",Reference,,18418,4.5
612,New Furniture Mods - Pocket Wiki & Game Tools ...,Reference,,17588,4.5
628,Merriam-Webster Dictionary,Reference,,16849,4.5
733,Night Sky,Reference,,12122,4.5
851,City Maps for Minecraft PE - The Best Maps for...,Reference,,8535,4.0
1067,LUCKY BLOCK MOD ™ for Minecraft PC Edition - T...,Reference,,4693,4.0


In [177]:
google_final[google_final['app_name'].str.contains(pat='Faceswap', case=False, regex=False)].head(20)

Unnamed: 0,app_name,genre_1,genre_2,rating_count,rating


In [182]:
genres_map_inv = {}
for key in genres_map: 
    # Go through the list that is saved in the dict:
    for item in genres_map[key]:
        # Check if in the inverted dict the key exists
        if item not in genres_map_inv: 
            # If not create a new list
            genres_map_inv[item] = key
        else: 
            print(item, key, 'has failed.')

for k in genres_map_inv:
    print(k, '\t', genres_map_inv[k])

Social 	 Social Networking
Communication 	 Social Networking
Dating 	 Social Networking
Video Players & Editors 	 Photo & Video
Photography 	 Photo & Video
Art & Design 	 Photo & Video
Beauty 	 Photo & Video
Strategy 	 Games
Arcade 	 Games
Action 	 Games
Adventure 	 Games
Racing 	 Games
Trivia 	 Games
Simulation 	 Games
Role Playing 	 Games
Puzzle 	 Games
Casino 	 Games
Card 	 Games
Word 	 Games
Board 	 Games
Casual 	 Games
Music 	 Music
Music & Audio 	 Music
Libraries & Demo 	 Reference
Health & Fitness 	 Health & Fitness
Weather 	 Weather
Tools 	 Utilities
Parenting 	 Utilities
Auto & Vehicles 	 Utilities
Travel & Local 	 Travel
Shopping 	 Shopping
News & Magazines 	 News
Maps & Navigation 	 Navigation
Lifestyle 	 Lifestyle
Entertainment 	 Entertainment
Personalization 	 Entertainment
Events 	 Entertainment
Food & Drink 	 Food & Drink
Sports 	 Sports
Books & Reference 	 Book
Comics 	 Book
Finance 	 Finance
Education 	 Education
Productivity 	 Productivity
Business 	 Business
House & 