#  Title
## Subtitle

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [31]:
apple_store = pd.read_csv("AppleStore.csv")
google_store = pd.read_csv("googleplaystore.csv")

## Apple overview

In [32]:
print(apple_store.head())

          id               track_name  size_bytes currency  price  \
0  284882215                 Facebook   389879808      USD    0.0   
1  389801252                Instagram   113954816      USD    0.0   
2  529479190           Clash of Clans   116476928      USD    0.0   
3  420009108               Temple Run    65921024      USD    0.0   
4  284035177  Pandora - Music & Radio   130242560      USD    0.0   

   rating_count_tot  rating_count_ver  user_rating  user_rating_ver      ver  \
0           2974676               212          3.5              3.5     95.0   
1           2161558              1289          4.5              4.0    10.23   
2           2130805               579          4.5              4.5  9.24.12   
3           1724546              3842          4.5              4.0    1.6.2   
4           1126879              3594          4.0              4.5    8.4.1   

  cont_rating        prime_genre  sup_devices.num  ipadSc_urls.num  lang.num  \
0          4+  Social Ne

In [33]:
print(apple_store.describe())

                 id    size_bytes        price  rating_count_tot  \
count  7.197000e+03  7.197000e+03  7197.000000      7.197000e+03   
mean   8.631310e+08  1.991345e+08     1.726218      1.289291e+04   
std    2.712368e+08  3.592069e+08     5.833006      7.573941e+04   
min    2.816565e+08  5.898240e+05     0.000000      0.000000e+00   
25%    6.000937e+08  4.692275e+07     0.000000      2.800000e+01   
50%    9.781482e+08  9.715302e+07     0.000000      3.000000e+02   
75%    1.082310e+09  1.819249e+08     1.990000      2.793000e+03   
max    1.188376e+09  4.025970e+09   299.990000      2.974676e+06   

       rating_count_ver  user_rating  user_rating_ver  sup_devices.num  \
count       7197.000000  7197.000000      7197.000000      7197.000000   
mean         460.373906     3.526956         3.253578        37.361817   
std         3920.455183     1.517948         1.809363         3.737715   
min            0.000000     0.000000         0.000000         9.000000   
25%            1.

## Google overview

In [34]:
print(google_store.head())

                                                 App        Category  Rating  \
0     Photo Editor & Candy Camera & Grid & ScrapBook  ART_AND_DESIGN     4.1   
1                                Coloring book moana  ART_AND_DESIGN     3.9   
2  U Launcher Lite – FREE Live Cool Themes, Hide ...  ART_AND_DESIGN     4.7   
3                              Sketch - Draw & Paint  ART_AND_DESIGN     4.5   
4              Pixel Draw - Number Art Coloring Book  ART_AND_DESIGN     4.3   

  Reviews  Size     Installs  Type Price Content Rating  \
0     159   19M      10,000+  Free     0       Everyone   
1     967   14M     500,000+  Free     0       Everyone   
2   87510  8.7M   5,000,000+  Free     0       Everyone   
3  215644   25M  50,000,000+  Free     0           Teen   
4     967  2.8M     100,000+  Free     0       Everyone   

                      Genres      Last Updated         Current Ver  \
0               Art & Design   January 7, 2018               1.0.0   
1  Art & Design;Pretend 

In [35]:
print(google_store.describe())

            Rating
count  9367.000000
mean      4.193338
std       0.537431
min       1.000000
25%       4.000000
50%       4.300000
75%       4.500000
max      19.000000


## Comparing columns

In [36]:
print(apple_store.columns)

Index(['id', 'track_name', 'size_bytes', 'currency', 'price',
       'rating_count_tot', 'rating_count_ver', 'user_rating',
       'user_rating_ver', 'ver', 'cont_rating', 'prime_genre',
       'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'],
      dtype='object')


In [37]:
print(google_store.columns)

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')


The analysis is going to focus on free apps, and the end of the cleaning process is going to be a merge of these two datasets. So based on the column analysis these are the key <span style="color:green">**properties to keep**</span> in order to have a consistent merged file:
- Origin (Apple or Google)
- App Name
- App Size (bytes)
- Count of total Ratings
- Ratings (total)
- Genre/Prime Genre
- Price

Here's the explanation of why other properties will be <span style="color:red">**dropped**</span>:
- **Category** - That is only present in Google Play Store and its a broader description than Genres, so I keep the last one to be more specific.
- **Installs** - It would be lovely to have this data from Apple Store, but as I only have it for Google, we'll count the users through the ratings which are present in both.
- **Related to Versions** - Apple is more specific in splitting data from previous versions, to current version. But as our objective is to merge, we will just use the Total property.
- **Related to Devices/Softwares** - Although both relatively have info about it, the conclusions we aim are not related to anything technical, so this will be discarted.

## Cleaning Apple Store data

In [38]:
apple_crop = apple_store.copy()[["id", "track_name", "prime_genre", "size_bytes", "price", "rating_count_tot", "user_rating"]]
apple_crop.head()

Unnamed: 0,id,track_name,prime_genre,size_bytes,price,rating_count_tot,user_rating
0,284882215,Facebook,Social Networking,389879808,0.0,2974676,3.5
1,389801252,Instagram,Photo & Video,113954816,0.0,2161558,4.5
2,529479190,Clash of Clans,Games,116476928,0.0,2130805,4.5
3,420009108,Temple Run,Games,65921024,0.0,1724546,4.5
4,284035177,Pandora - Music & Radio,Music,130242560,0.0,1126879,4.0


In [39]:
apple_crop.columns = ["id", "app_name", "genre", "size_bytes", "price", "rating_count", "rating"]
apple_crop.head()

Unnamed: 0,id,app_name,genre,size_bytes,price,rating_count,rating
0,284882215,Facebook,Social Networking,389879808,0.0,2974676,3.5
1,389801252,Instagram,Photo & Video,113954816,0.0,2161558,4.5
2,529479190,Clash of Clans,Games,116476928,0.0,2130805,4.5
3,420009108,Temple Run,Games,65921024,0.0,1724546,4.5
4,284035177,Pandora - Music & Radio,Music,130242560,0.0,1126879,4.0


In [40]:
apple_crop["price"].value_counts()

0.00      4056
0.99       728
2.99       683
1.99       621
4.99       394
3.99       277
6.99       166
9.99        81
5.99        52
7.99        33
14.99       21
19.99       13
8.99         9
24.99        8
11.99        6
29.99        6
13.99        6
12.99        5
15.99        4
17.99        3
59.99        3
27.99        2
23.99        2
22.99        2
20.99        2
49.99        2
16.99        2
39.99        2
249.99       1
74.99        1
18.99        1
99.99        1
21.99        1
34.99        1
299.99       1
47.99        1
Name: price, dtype: int64

In [41]:
apple_free = apple_crop.copy()[apple_crop['price'] == 0]
print(apple_free.shape)
apple_free.head()

(4056, 7)


Unnamed: 0,id,app_name,genre,size_bytes,price,rating_count,rating
0,284882215,Facebook,Social Networking,389879808,0.0,2974676,3.5
1,389801252,Instagram,Photo & Video,113954816,0.0,2161558,4.5
2,529479190,Clash of Clans,Games,116476928,0.0,2130805,4.5
3,420009108,Temple Run,Games,65921024,0.0,1724546,4.5
4,284035177,Pandora - Music & Radio,Music,130242560,0.0,1126879,4.0


In [42]:
apple_free.describe()

Unnamed: 0,id,size_bytes,price,rating_count,rating
count,4056.0,4056.0,4056.0,4056.0,4056.0
mean,899164700.0,147935700.0,0.0,19749.8,3.376726
std,258370900.0,208901400.0,0.0,97744.28,1.644807
min,281796100.0,767126.0,0.0,0.0,0.0
25%,700890800.0,54041340.0,0.0,22.0,3.0
50%,1014135000.0,99600380.0,0.0,466.0,4.0
75%,1091131000.0,161198600.0,0.0,5450.75,4.5
max,1188376000.0,3148421000.0,0.0,2974676.0,5.0


In [43]:
apple_free.isna().describe()

Unnamed: 0,id,app_name,genre,size_bytes,price,rating_count,rating
count,4056,4056,4056,4056,4056,4056,4056
unique,1,1,1,1,1,1,1
top,False,False,False,False,False,False,False
freq,4056,4056,4056,4056,4056,4056,4056


In [44]:
apple_free.duplicated("app_name", keep=False).value_counts()

False    4052
True        4
dtype: int64

In [45]:
apple_free[apple_free.duplicated("app_name", keep=False)]

Unnamed: 0,id,app_name,genre,size_bytes,price,rating_count,rating
2948,1173990889,Mannequin Challenge,Games,109705216,0.0,668,3.0
4442,952877179,VR Roller Coaster,Games,169523200,0.0,107,3.5
4463,1178454060,Mannequin Challenge,Games,59572224,0.0,105,4.0
4831,1089824278,VR Roller Coaster,Games,240964608,0.0,67,3.5


In [46]:
apple_free.sort_values("rating_count", ascending=False, inplace=True)
apple_unique = apple_free.drop_duplicates(subset="app_name", keep="first", inplace=False, ignore_index=True)
apple_unique.duplicated("app_name", keep=False).value_counts()

False    4054
dtype: int64

In [47]:
apple_unique[apple_unique['app_name'] == "VR Roller Coaster"]

Unnamed: 0,id,app_name,genre,size_bytes,price,rating_count,rating
2625,952877179,VR Roller Coaster,Games,169523200,0.0,107,3.5


In [48]:
apple_unique.head()

Unnamed: 0,id,app_name,genre,size_bytes,price,rating_count,rating
0,284882215,Facebook,Social Networking,389879808,0.0,2974676,3.5
1,389801252,Instagram,Photo & Video,113954816,0.0,2161558,4.5
2,529479190,Clash of Clans,Games,116476928,0.0,2130805,4.5
3,420009108,Temple Run,Games,65921024,0.0,1724546,4.5
4,284035177,Pandora - Music & Radio,Music,130242560,0.0,1126879,4.0


In [49]:
apple_final = apple_unique.copy()[["app_name", "genre", "size_bytes", "rating_count", "rating"]]
apple_final.head()

Unnamed: 0,app_name,genre,size_bytes,rating_count,rating
0,Facebook,Social Networking,389879808,2974676,3.5
1,Instagram,Photo & Video,113954816,2161558,4.5
2,Clash of Clans,Games,116476928,2130805,4.5
3,Temple Run,Games,65921024,1724546,4.5
4,Pandora - Music & Radio,Music,130242560,1126879,4.0


## Cleaning Google Play Store data

In [50]:
google_crop = google_store.copy()[["App", "Genres", "Size", "Price", "Reviews", "Rating"]]
google_crop.head()

Unnamed: 0,App,Genres,Size,Price,Reviews,Rating
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,19M,0,159,4.1
1,Coloring book moana,Art & Design;Pretend Play,14M,0,967,3.9
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",Art & Design,8.7M,0,87510,4.7
3,Sketch - Draw & Paint,Art & Design,25M,0,215644,4.5
4,Pixel Draw - Number Art Coloring Book,Art & Design;Creativity,2.8M,0,967,4.3


In [51]:
google_crop.columns = ["app_name", "genre", "size_bytes", "price", "rating_count", "rating"]
google_crop.head()

Unnamed: 0,app_name,genre,size_bytes,price,rating_count,rating
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design,19M,0,159,4.1
1,Coloring book moana,Art & Design;Pretend Play,14M,0,967,3.9
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",Art & Design,8.7M,0,87510,4.7
3,Sketch - Draw & Paint,Art & Design,25M,0,215644,4.5
4,Pixel Draw - Number Art Coloring Book,Art & Design;Creativity,2.8M,0,967,4.3


In [52]:
google_crop['price'].value_counts()

0         10040
$0.99       148
$2.99       129
$1.99        73
$4.99        72
          ...  
$1.75         1
$14.00        1
$4.85         1
$46.99        1
$1.04         1
Name: price, Length: 93, dtype: int64

In [53]:
try:
    google_crop['price'] = google_crop['price'].str.replace('$', '').astype(float)
except:
    print("There's some price value which is not a price")

There's some price value which is not a price


  google_crop['price'] = google_crop['price'].str.replace('$', '').astype(float)


In [54]:
google_crop.loc[~google_crop['price'].str.startswith("$"), 'price'].value_counts()

0           10040
Everyone        1
Name: price, dtype: int64

In [55]:
google_crop[google_crop['price'] == 'Everyone']

Unnamed: 0,app_name,genre,size_bytes,price,rating_count,rating
10472,Life Made WI-Fi Touchscreen Photo Frame,"February 11, 2018","1,000+",Everyone,3.0M,19.0


In [56]:
google_store.iloc[10472]

App               Life Made WI-Fi Touchscreen Photo Frame
Category                                              1.9
Rating                                               19.0
Reviews                                              3.0M
Size                                               1,000+
Installs                                             Free
Type                                                    0
Price                                            Everyone
Content Rating                                        NaN
Genres                                  February 11, 2018
Last Updated                                       1.0.19
Current Ver                                    4.0 and up
Android Ver                                           NaN
Name: 10472, dtype: object

In [58]:
google_crop.dtypes

app_name         object
genre            object
size_bytes       object
price            object
rating_count     object
rating          float64
dtype: object

Add photo of Google Play Store

In [60]:
google_crop.loc[10472, "genre"] = "Lifestyle"
google_crop.loc[10472, "size_bytes"] = "3.0M"
google_crop.loc[10472, "price"] = "0"
google_crop.loc[10472, "rating_count"] = "19"
google_crop.loc[10472, "rating"] = 1.9
google_crop.loc[10472]

app_name        Life Made WI-Fi Touchscreen Photo Frame
genre                                         Lifestyle
size_bytes                                         3.0M
price                                                 0
rating_count                                         19
rating                                              1.9
Name: 10472, dtype: object

In [61]:
try:
    google_crop['price'] = google_crop['price'].str.replace('$', '').astype(float)
except:
    print("There's some price value which is not a price")

  google_crop['price'] = google_crop['price'].str.replace('$', '').astype(float)


In [62]:
google_crop.isna().describe()

Unnamed: 0,app_name,genre,size_bytes,price,rating_count,rating
count,10841,10841,10841,10841,10841,10841
unique,1,1,1,1,1,2
top,False,False,False,False,False,False
freq,10841,10841,10841,10841,10841,9367


In [66]:
google_crop['rating'].isna().value_counts()

False    9367
True     1474
Name: rating, dtype: int64

next steps:

make a graph to find out where are the ratings NaN (genre)
sum the rating count for the NaN ratings
take a decision of what to do with those 1.4k values