In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff
import numpy as np
import seaborn as sns
import datetime as dt
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest


In [75]:
df = pd.read_csv("googleplaystore.csv")
df.sample(10)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
2337,Delta Dental,MEDICAL,3.0,914,Varies with device,"100,000+",Free,0,Everyone,Medical,"July 17, 2018",4.6.0,5.0 and up
8939,DV-4036 by Somikon,PHOTOGRAPHY,,17,44M,"1,000+",Free,0,Everyone,Photography,"January 19, 2017",0.0.5,4.3 and up
7640,Supermarket Cashier Kids Games,FAMILY,3.6,32416,Varies with device,"1,000,000+",Free,0,Everyone,Educational,"July 10, 2017",Varies with device,Varies with device
628,Video chat live advices,DATING,,0,8.0M,100+,Free,0,Everyone,Dating,"July 10, 2018",1.0,3.0 and up
9900,Schengen/EU App,TOOLS,4.0,24,11M,"1,000+",Free,0,Everyone,Tools,"February 27, 2018",1.0,4.1 and up
3113,Hostelworld: Hostels & Cheap Hotels Travel App,TRAVEL_AND_LOCAL,4.4,17878,28M,"1,000,000+",Free,0,Everyone,Travel & Local,"July 16, 2018",6.7.1,4.1 and up
5583,The Aether: Life as a God,FAMILY,4.1,1407,3.4M,"100,000+",Free,0,Everyone 10+,Role Playing,"March 12, 2018",1.0.5,4.0 and up
6384,BK Dinos,FAMILY,4.4,30,99M,"5,000+",Free,0,Everyone,Entertainment,"March 31, 2018",0.1,4.0 and up
3944,Four In A Line,GAME,3.8,22191,3.0M,"1,000,000+",Free,0,Everyone,Board,"May 4, 2015",1.10,2.3 and up
5074,Gun Strike Shoot,GAME,4.1,94761,17M,"10,000,000+",Free,0,Teen,Action,"February 21, 2017",1.1.4,2.3 and up


# 1. Data Cleaning

## 1.1 Data Investigation

In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


### Notes
- Should check for duplicates
- should change into float
    - Reviews
    - Size
    - installs
    - Price
- Ordinal Data
    - Content Rating
    - Android Ver

- Nomial Data
    - Category
    - Type
    - Genres
    - Current Ver

- Date Time
    - Last Updated


In [77]:
df.describe()

Unnamed: 0,Rating
count,9367.0
mean,4.193338
std,0.537431
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,19.0


In [78]:
df.shape

(10841, 13)

In [79]:
df.duplicated().sum()

np.int64(483)

In [80]:
df[df.duplicated()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
229,Quick PDF Scanner + OCR FREE,BUSINESS,4.2,80805,Varies with device,"5,000,000+",Free,0,Everyone,Business,"February 26, 2018",Varies with device,4.0.3 and up
236,Box,BUSINESS,4.2,159872,Varies with device,"10,000,000+",Free,0,Everyone,Business,"July 31, 2018",Varies with device,Varies with device
239,Google My Business,BUSINESS,4.4,70991,Varies with device,"5,000,000+",Free,0,Everyone,Business,"July 24, 2018",2.19.0.204537701,4.4 and up
256,ZOOM Cloud Meetings,BUSINESS,4.4,31614,37M,"10,000,000+",Free,0,Everyone,Business,"July 20, 2018",4.1.28165.0716,4.0 and up
261,join.me - Simple Meetings,BUSINESS,4.0,6989,Varies with device,"1,000,000+",Free,0,Everyone,Business,"July 16, 2018",4.3.0.508,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8643,Wunderlist: To-Do List & Tasks,PRODUCTIVITY,4.6,404610,Varies with device,"10,000,000+",Free,0,Everyone,Productivity,"April 6, 2018",Varies with device,Varies with device
8654,"TickTick: To Do List with Reminder, Day Planner",PRODUCTIVITY,4.6,25370,Varies with device,"1,000,000+",Free,0,Everyone,Productivity,"August 6, 2018",Varies with device,Varies with device
8658,ColorNote Notepad Notes,PRODUCTIVITY,4.6,2401017,Varies with device,"100,000,000+",Free,0,Everyone,Productivity,"June 27, 2018",Varies with device,Varies with device
10049,Airway Ex - Intubate. Anesthetize. Train.,MEDICAL,4.3,123,86M,"10,000+",Free,0,Everyone,Medical,"June 1, 2018",0.6.88,5.0 and up


In [81]:
df[df["App"].isin(["AAFP", "Box", "Google My Business"])]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
193,Google My Business,BUSINESS,4.4,70991,Varies with device,"5,000,000+",Free,0,Everyone,Business,"July 24, 2018",2.19.0.204537701,4.4 and up
204,Box,BUSINESS,4.2,159872,Varies with device,"10,000,000+",Free,0,Everyone,Business,"July 31, 2018",Varies with device,Varies with device
236,Box,BUSINESS,4.2,159872,Varies with device,"10,000,000+",Free,0,Everyone,Business,"July 31, 2018",Varies with device,Varies with device
239,Google My Business,BUSINESS,4.4,70991,Varies with device,"5,000,000+",Free,0,Everyone,Business,"July 24, 2018",2.19.0.204537701,4.4 and up
265,Box,BUSINESS,4.2,159872,Varies with device,"10,000,000+",Free,0,Everyone,Business,"July 31, 2018",Varies with device,Varies with device
268,Google My Business,BUSINESS,4.4,70991,Varies with device,"5,000,000+",Free,0,Everyone,Business,"July 24, 2018",2.19.0.204537701,4.4 and up
2515,AAFP,MEDICAL,3.8,63,24M,"10,000+",Free,0,Everyone,Medical,"June 22, 2018",2.3.1,5.0 and up
10768,AAFP,MEDICAL,3.8,63,24M,"10,000+",Free,0,Everyone,Medical,"June 22, 2018",2.3.1,5.0 and up


## 1.2 Working with Data Issues

### Dropping duplicates

In [82]:
df.drop_duplicates(inplace = True)
df.duplicated().sum()

np.int64(0)

In [83]:
df.shape

(10358, 13)

### Changing the following into floats, making splits as necssary:
    - Reviews
    - Size
    - installs
    - Price

In [84]:
def remove_plus (text):
    if text[-1] == "+":
        return text[:-1]
    else:
        return text

df["Installs"] = df["Installs"].apply(remove_plus)
df.sample(3)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
6908,BW COMPANY FINDER,BUSINESS,4.6,48,7.4M,1000,Free,0,Everyone,Business,"August 21, 2017",1.5,4.0 and up
4553,R File Manager,TOOLS,4.8,17,6.7M,50,Free,0,Everyone,Tools,"May 5, 2018",v2,4.4 and up
10706,Neon Blue Gaming Wallpaper&Theme fo Lenovo K8 ...,BUSINESS,4.6,7,2.0M,500,Free,0,Everyone,Business,"August 24, 2017",1.0.0,2.3.3 and up


In [85]:
def split_size (text):
    if text[-1] in ["M", "k"]:
        return text[:-1], text[-1]
    else:
        return pd.NA, text

df[["Ts", "Tu"]] = df["Size"].apply(split_size).apply(pd.Series)
df.sample(5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Ts,Tu
4397,Guns'n'Glory Premium,FAMILY,4.4,3570,21M,100000,Paid,$2.99,Everyone 10+,Strategy,"September 22, 2017",1.8.1,2.3.3 and up,21.0,M
1236,Foursquare City Guide,FOOD_AND_DRINK,4.1,483960,Varies with device,10000000,Free,0,Teen,Food & Drink,"July 23, 2018",Varies with device,Varies with device,,Varies with device
8012,Morse Machine for Ham Radio,COMMUNICATION,4.8,341,9.6M,5000,Paid,$0.99,Everyone,Communication,"February 20, 2015",iu4,2.0 and up,9.6,M
9645,EO.TRADE - Coin sale,FINANCE,4.3,95,1.5M,10000,Free,0,Everyone,Finance,"June 5, 2018",0.3,4.0.3 and up,1.5,M
4177,G-NetReport Pro,TOOLS,,0,1.6M,10,Paid,$25.99,Everyone,Tools,"June 14, 2018",5.0,4.0 and up,1.6,M


In [86]:
df["Tu"][:-1].unique()

array(['M', 'Varies with device', 'k', '1,000+'], dtype=object)

In [87]:
df["Tu"][:-1].value_counts()

Tu
M                     8515
Varies with device    1526
k                      315
1,000+                   1
Name: count, dtype: int64

In [88]:
df[df["Tu"] == "1,000+"]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Ts,Tu
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up,,,"1,000+"


Fixing rowid = 10472
can be dropped, but I will change the fix the cell values for practice

In [89]:
df.loc[10472, "Android Ver"] = df.loc[10472, "Current Ver"]
df.loc[10472, "Current Ver"] = df.loc[10472, "Last Updated"]
df.loc[10472, "Last Updated"] = df.loc[10472, "Genres"]
df.loc[10472, "Genres"] = df.loc[10472, "Content Rating"]
df.loc[10472, "Content Rating"] = df.loc[10472, "Price"]
df.loc[10472, "Price"] = df.loc[10472, "Type"]
df.loc[10472, "Type"] = df.loc[10472, "Installs"]
df.loc[10472, "Installs"] = df.loc[10472, "Size"]
df.loc[10472, "Size"] = df.loc[10472, "Reviews"]
df.loc[10472, "Reviews"] = df.loc[10472, "Rating"]
df.loc[10472, "Rating"] = df.loc[10472, "Category"]
df.loc[10472, "Category"] = 'PHOTOGRAPHY'

df.loc[10472]


  df.loc[10472, "Rating"] = df.loc[10472, "Category"]


App               Life Made WI-Fi Touchscreen Photo Frame
Category                                      PHOTOGRAPHY
Rating                                                1.9
Reviews                                              19.0
Size                                                 3.0M
Installs                                           1,000+
Type                                                 Free
Price                                                   0
Content Rating                                   Everyone
Genres                                                NaN
Last Updated                            February 11, 2018
Current Ver                                        1.0.19
Android Ver                                    4.0 and up
Ts                                                   <NA>
Tu                                                 1,000+
Name: 10472, dtype: object

In [90]:
df.loc[10472, "Installs"] = "1,000"

### Fixing Price

In [91]:
df["Price"].value_counts()

Price
0          9593
$0.99       146
$2.99       125
$1.99        73
$4.99        70
           ... 
$3.61         1
$394.99       1
$1.26         1
$1.20         1
$1.04         1
Name: count, Length: 92, dtype: int64

In [92]:
def split_price (text):
    if text[0] == "$":
        return text[1:], text[0]
    elif text == "0":
        return "0", "$"
    else:
        return pd.NA, text

df[["Ps", "Pu"]] = df["Price"].apply(split_price).apply(pd.Series)
df.sample(5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Ts,Tu,Ps,Pu
5065,Camera Pro,PHOTOGRAPHY,4.1,7479,13M,1000000,Free,0,Everyone,Photography,"April 16, 2018",5.4,4.0 and up,13.0,M,0.0,$
3978,CARDI B WALLPAPERS,PERSONALIZATION,4.1,8,4.5M,1000,Free,0,Everyone,Personalization,"May 21, 2018",3.1,4.0.3 and up,4.5,M,0.0,$
6602,Wireless BP,MEDICAL,1.8,30,887k,500,Free,0,Everyone,Medical,"May 27, 2016",1.1,4.3 and up,887.0,k,0.0,$
7364,Pixxy KWGT,PERSONALIZATION,4.8,60,78M,1000,Paid,$0.99,Everyone,Personalization,"July 29, 2018",4.0.1,5.0 and up,78.0,M,0.99,$
10574,Lottery Results: Florida,FAMILY,4.2,582,3.2M,100000,Free,0,Teen,Entertainment,"January 22, 2018",4.0,4.0 and up,3.2,M,0.0,$


In [93]:
df["Pu"].value_counts()

Pu
$    10358
Name: count, dtype: int64

In [94]:
df[["Size", "Tu"]] = df["Size"].apply(split_size).apply(pd.Series)
df[["Price", "Pu"]] = df["Price"].apply(split_price).apply(pd.Series)


In [95]:
columns = ["Size", "Price", "Reviews", "Rating"]


for column in columns:
    df[column] = pd.to_numeric(df[column], errors = "coerce")

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10358 entries, 0 to 10840
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10358 non-null  object 
 1   Category        10358 non-null  object 
 2   Rating          8893 non-null   float64
 3   Reviews         10358 non-null  float64
 4   Size            8832 non-null   float64
 5   Installs        10358 non-null  object 
 6   Type            10357 non-null  object 
 7   Price           10358 non-null  float64
 8   Content Rating  10358 non-null  object 
 9   Genres          10357 non-null  object 
 10  Last Updated    10358 non-null  object 
 11  Current Ver     10350 non-null  object 
 12  Android Ver     10356 non-null  object 
 13  Ts              8831 non-null   object 
 14  Tu              10358 non-null  object 
 15  Ps              10358 non-null  object 
 16  Pu              10358 non-null  object 
dtypes: float64(4), object(13)
memory usa

In [96]:
df["Installs"] = pd.to_numeric(df["Installs"].str.replace(",",""), errors = "coerce")
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10358 entries, 0 to 10840
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10358 non-null  object 
 1   Category        10358 non-null  object 
 2   Rating          8893 non-null   float64
 3   Reviews         10358 non-null  float64
 4   Size            8832 non-null   float64
 5   Installs        10358 non-null  int64  
 6   Type            10357 non-null  object 
 7   Price           10358 non-null  float64
 8   Content Rating  10358 non-null  object 
 9   Genres          10357 non-null  object 
 10  Last Updated    10358 non-null  object 
 11  Current Ver     10350 non-null  object 
 12  Android Ver     10356 non-null  object 
 13  Ts              8831 non-null   object 
 14  Tu              10358 non-null  object 
 15  Ps              10358 non-null  object 
 16  Pu              10358 non-null  object 
dtypes: float64(4), int64(1), object(12)


### Fixing Size, unifying all the units to Megas rather than kilos. If varies by device, it will be "Null"

In [97]:
# Checking that all NAN in the Size are "Varies with device"
df[df["Size"].isnull()]["Tu"].value_counts()

Tu
Varies with device    1526
Name: count, dtype: int64

In [98]:
# modifiing size in K to be in M (to unify the unit)
df.loc[df["Tu"] == "k", "Size"] = df["Size"] / 1024
df[df["Tu"]=="k"].sample(5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Ts,Tu,Ps,Pu
9696,EP RSS Reader,COMMUNICATION,3.8,4.0,0.871094,100,Free,0.0,Everyone,Communication,"July 16, 2018",0.99,4.0.3 and up,892,k,0,$
8030,Learn Morse Code - G0HYN Learn Morse,COMMUNICATION,4.0,27.0,0.585938,5000,Free,0.0,Everyone,Communication,"May 8, 2018",5.0,2.2 and up,600,k,0,$
6515,BN Pro RobotoXL-b HD Text,LIBRARIES_AND_DEMO,4.2,86.0,0.448242,10000,Free,0.0,Everyone,Libraries & Demo,"February 5, 2017",2.3.2,1.6 and up,459,k,0,$
8312,Satellite Director,TOOLS,4.1,45610.0,0.171875,10000000,Free,0.0,Everyone,Tools,"January 2, 2018",1.56,1.6 and up,176,k,0,$
5130,Denis Brogniart - AH !,FAMILY,4.5,2931.0,0.798828,100000,Free,0.0,Everyone,Entertainment,"August 4, 2017",1.1,4.2 and up,818,k,0,$


In [99]:
df = df.drop(["Ts", "Tu", "Ps", "Pu"], axis = 1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10358 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10358 non-null  object 
 1   Category        10358 non-null  object 
 2   Rating          8893 non-null   float64
 3   Reviews         10358 non-null  float64
 4   Size            8832 non-null   float64
 5   Installs        10358 non-null  int64  
 6   Type            10357 non-null  object 
 7   Price           10358 non-null  float64
 8   Content Rating  10358 non-null  object 
 9   Genres          10357 non-null  object 
 10  Last Updated    10358 non-null  object 
 11  Current Ver     10350 non-null  object 
 12  Android Ver     10356 non-null  object 
dtypes: float64(4), int64(1), object(8)
memory usage: 1.4+ MB


In [100]:
df.duplicated().sum()

np.int64(0)

### Checking Non - Numeric columns
- Ordinal Data
    - Content Rating
    - Android Ver

- Nomial Data
    - Category
    - Type
    - Genres
    - Current Ver

- Date Time
    - Last Updated
    

### Fixing Time Format

In [101]:
# changing Last updated to date time
df["Last Updated"] = pd.to_datetime(df["Last Updated"], format = "%B %d, %Y", errors= "coerce")


In [102]:
df["Last Updated"]

0       2018-01-07
1       2018-01-15
2       2018-08-01
3       2018-06-08
4       2018-06-20
           ...    
10836   2017-07-25
10837   2018-07-06
10838   2017-01-20
10839   2015-01-19
10840   2018-07-25
Name: Last Updated, Length: 10358, dtype: datetime64[ns]

In [103]:
df["Content Rating"].value_counts()

Content Rating
Everyone           8383
Teen               1146
Mature 17+          447
Everyone 10+        377
Adults only 18+       3
Unrated               2
Name: count, dtype: int64

No Fixing required for "Content Rating"

In [104]:
df["Android Ver"].value_counts()

Android Ver
4.1 and up            2379
4.0.3 and up          1451
4.0 and up            1338
Varies with device    1221
4.4 and up             894
2.3 and up             643
5.0 and up             546
4.2 and up             387
2.3.3 and up           279
2.2 and up             239
3.0 and up             237
4.3 and up             235
2.1 and up             133
1.6 and up             116
6.0 and up              58
7.0 and up              42
3.2 and up              36
2.0 and up              32
5.1 and up              22
1.5 and up              20
4.4W and up             11
3.1 and up              10
2.0.1 and up             7
8.0 and up               6
7.1 and up               3
5.0 - 8.0                2
4.0.3 - 7.1.1            2
1.0 and up               2
7.0 - 7.1.1              1
4.1 - 7.1.1              1
5.0 - 6.0                1
2.2 - 7.1.1              1
5.0 - 7.1.1              1
Name: count, dtype: int64

Cleaning is required for "Android Version." It requires to be split to two columns "From" and "To"


In [105]:
df[df["Android Ver"].isna()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
4453,[substratum] Vacuum: P,PERSONALIZATION,4.4,230.0,11.0,1000,Paid,1.49,Everyone,Personalization,2018-07-20,4.4,
4490,Pi Dark [substratum],PERSONALIZATION,4.5,189.0,2.1,10000,Free,0.0,Everyone,Personalization,2018-03-27,1.1,


### Fixing Android version: splitting into two columns (Android From) and (Android To)

In [106]:
def clean_android(text):
    if not isinstance(text, str) or text.strip() == "":
        return "Varies with device", "Varies with device"

    if text.strip() == "Varies with device":
        return "Varies with device", "Varies with device"
    elif text[-2:].strip() == "up":
        return text.split(" ")[0], "Present"
    else:
        return text.split(" - ")[0], text.split(" - ")[1]
    
df[["Android From", "Android To"]] = df["Android Ver"].apply(clean_android).tolist()
df.sample(10)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Android From,Android To
8411,DH-UFO,FAMILY,5.0,1.0,59.0,100,Free,0.0,Everyone,Entertainment,2018-05-24,1.0.9,4.4 and up,4.4,Present
8674,All Type DP,FAMILY,4.3,1853.0,5.8,500000,Free,0.0,Everyone,Entertainment,2018-05-10,1.8,4.0 and up,4.0,Present
9853,Amino: Communities and Chats,SOCIAL,4.8,1264084.0,62.0,10000000,Free,0.0,Teen,Social,2018-08-07,1.8.19179,4.0.3 and up,4.0.3,Present
3199,Hotwire Hotel & Car Rental App,TRAVEL_AND_LOCAL,4.3,10323.0,,1000000,Free,0.0,Everyone,Travel & Local,2018-07-19,Varies with device,Varies with device,Varies with device,Varies with device
4864,Angry Birds 2,FAMILY,4.6,3881752.0,57.0,100000000,Free,0.0,Everyone,Casual,2018-07-26,2.21.1,4.1 and up,4.1,Present
9923,CALIOPE EU: Air Quality,HEALTH_AND_FITNESS,3.9,21.0,2.2,1000,Free,0.0,Everyone,Health & Fitness,2015-10-30,1.1.2,4.0 and up,4.0,Present
6781,BT Share It,BUSINESS,4.7,12.0,13.0,500,Free,0.0,Everyone,Business,2018-05-16,3.4.2,4.4 and up,4.4,Present
10831,payermonstationnement.fr,MAPS_AND_NAVIGATION,,38.0,9.8,5000,Free,0.0,Everyone,Maps & Navigation,2018-06-13,2.0.148.0,4.0 and up,4.0,Present
1645,Relax Ocean ~ Nature Sounds,LIFESTYLE,4.5,9464.0,,500000,Free,0.0,Everyone,Lifestyle,2017-07-18,Varies with device,Varies with device,Varies with device,Varies with device
5906,Barista Coffee Dictionary A-Z,LIFESTYLE,4.3,127.0,5.5,10000,Free,0.0,Everyone,Lifestyle,2016-06-03,3.0.0,4.1 and up,4.1,Present


In [107]:
df["Android From"].value_counts()

Android From
4.1                   2380
4.0.3                 1453
4.0                   1338
Varies with device    1223
4.4                    894
2.3                    643
5.0                    550
4.2                    387
2.3.3                  279
2.2                    240
3.0                    237
4.3                    235
2.1                    133
1.6                    116
6.0                     58
7.0                     43
3.2                     36
2.0                     32
5.1                     22
1.5                     20
4.4W                    11
3.1                     10
2.0.1                    7
8.0                      6
7.1                      3
1.0                      2
Name: count, dtype: int64

In [108]:
df["Android To"].value_counts()

Android To
Present               9126
Varies with device    1223
7.1.1                    6
8.0                      2
6.0                      1
Name: count, dtype: int64

In [143]:
df.drop("Android Ver", axis = 1)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Android From,Android To,Genres_1,Genres_2
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159.0,19.0,10000,Free,0.0,Everyone,Art & Design,2018-01-07,4.0.3,Present,Art & Design,
1,Coloring book moana,ART_AND_DESIGN,3.9,967.0,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,4.0.3,Present,Art & Design,Pretend Play
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510.0,8.7,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,4.0.3,Present,Art & Design,
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644.0,25.0,50000000,Free,0.0,Teen,Art & Design,2018-06-08,4.2,Present,Art & Design,
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967.0,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,4.4,Present,Art & Design,Creativity
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.5,38.0,53.0,5000,Free,0.0,Everyone,Education,2017-07-25,4.1,Present,Education,
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4.0,3.6,100,Free,0.0,Everyone,Education,2018-07-06,4.1,Present,Education,
10838,Parkinson Exercices FR,MEDICAL,,3.0,9.5,1000,Free,0.0,Everyone,Medical,2017-01-20,2.2,Present,Medical,
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114.0,,1000,Free,0.0,Mature 17+,Books & Reference,2015-01-19,Varies with device,Varies with device,Books & Reference,


In [109]:
df["Category"].value_counts()

Category
FAMILY                 1943
GAME                   1121
TOOLS                   843
BUSINESS                427
MEDICAL                 408
PRODUCTIVITY            407
PERSONALIZATION         388
LIFESTYLE               373
COMMUNICATION           366
FINANCE                 360
SPORTS                  351
PHOTOGRAPHY             323
HEALTH_AND_FITNESS      306
SOCIAL                  280
NEWS_AND_MAGAZINES      264
TRAVEL_AND_LOCAL        237
BOOKS_AND_REFERENCE     230
SHOPPING                224
DATING                  196
VIDEO_PLAYERS           175
MAPS_AND_NAVIGATION     137
EDUCATION               130
FOOD_AND_DRINK          124
ENTERTAINMENT           111
LIBRARIES_AND_DEMO       85
AUTO_AND_VEHICLES        85
WEATHER                  82
HOUSE_AND_HOME           80
ART_AND_DESIGN           65
EVENTS                   64
COMICS                   60
PARENTING                60
BEAUTY                   53
Name: count, dtype: int64

Category column is clean

In [110]:
df["Type"].value_counts()

Type
Free    9592
Paid     765
Name: count, dtype: int64

"Type" Column is clean

In [111]:
df["Genres"].value_counts()

Genres
Tools                       842
Entertainment               588
Education                   527
Business                    427
Medical                     408
                           ... 
Role Playing;Brain Games      1
Strategy;Education            1
Racing;Pretend Play           1
Communication;Creativity      1
Strategy;Creativity           1
Name: count, Length: 119, dtype: int64

In [112]:
df["Genres"].unique()

array(['Art & Design', 'Art & Design;Pretend Play',
       'Art & Design;Creativity', 'Art & Design;Action & Adventure',
       'Auto & Vehicles', 'Beauty', 'Books & Reference', 'Business',
       'Comics', 'Comics;Creativity', 'Communication', 'Dating',
       'Education;Education', 'Education', 'Education;Creativity',
       'Education;Music & Video', 'Education;Action & Adventure',
       'Education;Pretend Play', 'Education;Brain Games', 'Entertainment',
       'Entertainment;Music & Video', 'Entertainment;Brain Games',
       'Entertainment;Creativity', 'Events', 'Finance', 'Food & Drink',
       'Health & Fitness', 'House & Home', 'Libraries & Demo',
       'Lifestyle', 'Lifestyle;Pretend Play',
       'Adventure;Action & Adventure', 'Arcade', 'Casual', 'Card',
       'Casual;Pretend Play', 'Action', 'Strategy', 'Puzzle', 'Sports',
       'Music', 'Word', 'Racing', 'Casual;Creativity',
       'Casual;Action & Adventure', 'Simulation', 'Adventure', 'Board',
       'Trivia', 'Role 

### "Genres" need to be cleaned as some columns have more than two types. It has 119 entries.

In [113]:
split_columns = df["Genres"].str.split(";", expand= True)
split_columns.columns = [f"Genres_{i+1}" for i in range (split_columns.shape[1])]
df = pd.concat([df, split_columns], axis=1)

df.sample(5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Android From,Android To,Genres_1,Genres_2
7152,ARK: Survival Evolved,GAME,3.8,51523.0,36.0,500000,Free,0.0,Teen,Adventure,2018-08-02,1.0.90,7.0 and up,7.0,Present,Adventure,
10216,MB Notifications for FB (Free),SOCIAL,3.9,3047.0,1.5,100000,Free,0.0,Teen,Social,2015-04-29,1.17.0,2.3 and up,2.3,Present,Social,
10727,Fon WiFi App – WiFi Connect,TOOLS,4.1,222.0,16.0,50000,Free,0.0,Everyone,Tools,2018-07-26,2.2.4,4.4 and up,4.4,Present,Tools,
8423,Bike Unchained,SPORTS,4.3,83545.0,23.0,5000000,Free,0.0,Everyone,Sports,2018-07-31,1.193,4.0 and up,4.0,Present,Sports,
6808,BT Speed,SPORTS,,3.0,0.925781,10,Paid,4.8,Everyone,Sports,2015-05-21,1.0.1,4.0.3 and up,4.0.3,Present,Sports,


In [114]:
df["Genres_1"].value_counts().shape

(48,)

In [115]:
df["Genres_1"].value_counts().sort_index()

Genres_1
Action                     371
Adventure                   91
Arcade                     234
Art & Design                69
Auto & Vehicles             85
Beauty                      53
Board                       63
Books & Reference          233
Business                   427
Card                        51
Casino                      39
Casual                     263
Comics                      60
Communication              367
Dating                     196
Education                  610
Educational                106
Entertainment              628
Events                      64
Finance                    360
Food & Drink               124
Health & Fitness           308
House & Home                80
Libraries & Demo            85
Lifestyle                  374
Maps & Navigation          137
Medical                    408
Music                       24
Music & Audio                1
News & Magazines           264
Parenting                   60
Personalization            388

In [116]:
df[df["Genres_1"] == "Educational"]["Genres_1"] = "Education"
df[df["Genres_1"] == "Education"].value_counts()

App                                   Category   Rating  Reviews  Size  Installs  Type  Price  Content Rating  Genres                  Last Updated  Current Ver  Android Ver   Android From  Android To  Genres_1   Genres_2    
ABC Preschool Free                    EDUCATION  3.8     27572.0  25.0  5000000   Free  0.00   Everyone        Education;Education     2017-10-25    3.0          2.3 and up    2.3           Present     Education  Education       1
ABCmouse.com                          FAMILY     4.3     50887.0  91.0  5000000   Free  0.00   Everyone        Education;Education     2018-07-03    7.2.0        4.4 and up    4.4           Present     Education  Education       1
Arabic Alif Ba Ta For Kids            FAMILY     4.5     226.0    26.0  100000    Free  0.00   Everyone        Education;Education     2017-05-30    1.0.0        2.3 and up    2.3           Present     Education  Education       1
Avokiddo Emotions                     FAMILY     4.6     73.0     12.0  1000     

In [117]:
df["Genres_1"].value_counts()

Genres_1
Tools                      843
Entertainment              628
Education                  610
Business                   427
Medical                    408
Productivity               407
Personalization            388
Lifestyle                  374
Action                     371
Sports                     368
Communication              367
Finance                    360
Photography                322
Health & Fitness           308
Social                     280
News & Magazines           264
Casual                     263
Travel & Local             237
Arcade                     234
Books & Reference          233
Shopping                   224
Simulation                 216
Dating                     196
Video Players & Editors    178
Puzzle                     162
Maps & Navigation          137
Food & Drink               124
Role Playing               120
Racing                     119
Strategy                   109
Educational                106
Adventure                   91

In [118]:
df["Genres_2"].value_counts()

Genres_2
Action & Adventure    128
Education             107
Pretend Play           80
Brain Games            69
Music & Video          41
Creativity             37
Name: count, dtype: int64

In [119]:
df["Genres_1"] = df["Genres_1"].replace(["Music", "Music & Audio"], "Music & Video")

df["Genres_1"].value_counts()

Genres_1
Tools                      843
Entertainment              628
Education                  610
Business                   427
Medical                    408
Productivity               407
Personalization            388
Lifestyle                  374
Action                     371
Sports                     368
Communication              367
Finance                    360
Photography                322
Health & Fitness           308
Social                     280
News & Magazines           264
Casual                     263
Travel & Local             237
Arcade                     234
Books & Reference          233
Shopping                   224
Simulation                 216
Dating                     196
Video Players & Editors    178
Puzzle                     162
Maps & Navigation          137
Food & Drink               124
Role Playing               120
Racing                     119
Strategy                   109
Educational                106
Adventure                   91

In [120]:
df[df["Genres_1"].isnull()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Android From,Android To,Genres_1,Genres_2
10472,Life Made WI-Fi Touchscreen Photo Frame,PHOTOGRAPHY,1.9,19.0,3.0,1000,Free,0.0,Everyone,,2018-02-11,1.0.19,4.0 and up,4.0,Present,,


In [121]:
df.loc[10472, "Genres_1"] = "Photography"
df.loc[10472]

App               Life Made WI-Fi Touchscreen Photo Frame
Category                                      PHOTOGRAPHY
Rating                                                1.9
Reviews                                              19.0
Size                                                  3.0
Installs                                             1000
Type                                                 Free
Price                                                 0.0
Content Rating                                   Everyone
Genres                                                NaN
Last Updated                          2018-02-11 00:00:00
Current Ver                                        1.0.19
Android Ver                                    4.0 and up
Android From                                          4.0
Android To                                        Present
Genres_1                                      Photography
Genres_2                                              NaN
Name: 10472, d

In [132]:
df[df["Genres_1"] == df["Genres_2"]].count()

App               48
Category          48
Rating            48
Reviews           48
Size              41
Installs          48
Type              48
Price             48
Content Rating    48
Genres            48
Last Updated      48
Android Ver       48
Android From      48
Android To        48
Genres_1          48
Genres_2          48
dtype: int64

In [138]:
df.loc[df["Genres_1"] == df["Genres_2"],"Genres_2"] = pd.NA
df[df["Genres_1"] == df["Genres_2"]].head(5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Android Ver,Android From,Android To,Genres_1,Genres_2


In [139]:
df["Genres_2"].value_counts()

Genres_2
Action & Adventure    128
Pretend Play           80
Brain Games            69
Education              63
Creativity             37
Music & Video          37
Name: count, dtype: int64

In [146]:
df.drop(["Genres", "Android Ver"], axis= 1, inplace= True)

### Genres fixed into two columns with no duplicates

In [124]:
df["Current Ver"].value_counts()

Current Ver
Varies with device    1302
1.0                    802
1.1                    260
1.2                    177
2.0                    149
                      ... 
Android 3.0 - 2017       1
3.36                     1
2.9.2                    1
6.1.61.1                 1
2.0.148.0                1
Name: count, Length: 2831, dtype: int64

In [125]:
# checking whether there are lines that start with alphbetical letter (rather than number)
df[df["Current Ver"].str[0].str.isalpha().fillna(False)]["Current Ver"].value_counts()

  df[df["Current Ver"].str[0].str.isalpha().fillna(False)]["Current Ver"].value_counts()


Current Ver
Varies with device    1302
Android 3.0 - 2015       7
Cow V3.15                5
v1.0                     3
v3.7.93                  2
                      ... 
V4.1.0                   1
v2.7.11.16               1
FH CODE 1.0              1
v4.30.0.9                1
Android 3.0 - 2017       1
Name: count, Length: 125, dtype: int64

### Too many values, not required to be used for any analytics ... to be dropped

In [126]:
df = df.drop("Current Ver", axis = 1)

In [127]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10358 entries, 0 to 10840
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   App             10358 non-null  object        
 1   Category        10358 non-null  object        
 2   Rating          8893 non-null   float64       
 3   Reviews         10358 non-null  float64       
 4   Size            8832 non-null   float64       
 5   Installs        10358 non-null  int64         
 6   Type            10357 non-null  object        
 7   Price           10358 non-null  float64       
 8   Content Rating  10358 non-null  object        
 9   Genres          10357 non-null  object        
 10  Last Updated    10358 non-null  datetime64[ns]
 11  Android Ver     10356 non-null  object        
 12  Android From    10358 non-null  object        
 13  Android To      10358 non-null  object        
 14  Genres_1        10358 non-null  object        
 15  Genres_

# Working on the requirments

### Fixing Rating

In [128]:
df[df["Rating"].isna()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Android Ver,Android From,Android To,Genres_1,Genres_2
23,Mcqueen Coloring pages,ART_AND_DESIGN,,61.0,7.0,100000,Free,0.0,Everyone,Art & Design;Action & Adventure,2018-03-07,4.1 and up,4.1,Present,Art & Design,Action & Adventure
113,Wrinkles and rejuvenation,BEAUTY,,182.0,5.7,100000,Free,0.0,Everyone 10+,Beauty,2017-09-20,3.0 and up,3.0,Present,Beauty,
123,Manicure - nail design,BEAUTY,,119.0,3.7,50000,Free,0.0,Everyone,Beauty,2018-07-23,4.1 and up,4.1,Present,Beauty,
126,Skin Care and Natural Beauty,BEAUTY,,654.0,7.4,100000,Free,0.0,Teen,Beauty,2018-07-17,4.1 and up,4.1,Present,Beauty,
129,"Secrets of beauty, youth and health",BEAUTY,,77.0,2.9,10000,Free,0.0,Mature 17+,Beauty,2017-08-08,2.3 and up,2.3,Present,Beauty,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10824,Cardio-FR,MEDICAL,,67.0,82.0,10000,Free,0.0,Everyone,Medical,2018-07-31,4.4 and up,4.4,Present,Medical,
10825,Naruto & Boruto FR,SOCIAL,,7.0,7.7,100,Free,0.0,Teen,Social,2018-02-02,4.0 and up,4.0,Present,Social,
10831,payermonstationnement.fr,MAPS_AND_NAVIGATION,,38.0,9.8,5000,Free,0.0,Everyone,Maps & Navigation,2018-06-13,4.0 and up,4.0,Present,Maps & Navigation,
10835,FR Forms,BUSINESS,,0.0,9.6,10,Free,0.0,Everyone,Business,2016-09-29,4.0 and up,4.0,Present,Business,


### Nothing to be done with "Rating" until it is know what to be done with rating.

In [147]:
df.isna().sum()

App                  0
Category             0
Rating            1465
Reviews              0
Size              1526
Installs             0
Type                 1
Price                0
Content Rating       0
Last Updated         0
Android From         0
Android To           0
Genres_1             0
Genres_2          9944
dtype: int64

### checking and fixing "Type"

In [148]:
df[df["Type"].isnull()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Last Updated,Android From,Android To,Genres_1,Genres_2
9148,Command & Conquer: Rivals,FAMILY,,0.0,,0,,0.0,Everyone 10+,2018-06-28,Varies with device,Varies with device,Strategy,


In [153]:
df["Type"].fillna("Free", inplace= True) # As price is 0
df["Type"].value_counts()

Type
Free    9593
Paid     765
Name: count, dtype: int64

# Analytical Questions

## 1.	What is the most expensive app on the Play Store?
I'm Rich - Trump Edition 
400 USD

In [164]:
App = df[df["Price"] == df["Price"].max()]["App"]
Price = df[df["Price"] == df["Price"].max()]["Price"]
print(
    f"""
App Name: {str(App)}
App Price: {str(Price)}
"""
)


App Name: 4367    I'm Rich - Trump Edition
Name: App, dtype: object
App Price: 4367    400.0
Name: Price, dtype: float64



## 2.	Which genre has the highest number of apps?
Tools with 843 of listed Apps


In [171]:
pd.concat([df["Genres_1"],df["Genres_2"]]).dropna().value_counts()

Tools                      843
Education                  673
Entertainment              628
Business                   427
Medical                    408
Productivity               407
Personalization            388
Lifestyle                  374
Action                     371
Sports                     368
Communication              367
Finance                    360
Photography                323
Health & Fitness           308
Social                     280
News & Magazines           264
Casual                     263
Travel & Local             237
Arcade                     234
Books & Reference          233
Shopping                   224
Simulation                 216
Dating                     196
Video Players & Editors    178
Puzzle                     162
Maps & Navigation          137
Action & Adventure         128
Food & Drink               124
Role Playing               120
Racing                     119
Strategy                   109
Educational                106
Adventur

## 3.	What is the average size of free vs. paid apps?
Paid: 19.12 M
Free: 21.47 M

In [177]:
# Average size of paid apps
df[df["Type"] =="Paid"]["Size"].mean()

np.float64(19.12365570730825)

In [178]:
df[df["Type"] =="Free"]["Size"].mean()

np.float64(21.468824384769224)

## 4.	What are the top 5 most expensive apps with a perfect rating (5)?
AP Art History Flashcards
USMLE Step 2 CK Flashcards
meStudying: AP English Lit
TI-84 CE Graphing Calculator Manual TI 84
Hey AJ! It's Bedtime!

In [185]:
df[df["Rating"] == 5].sort_values(by = "Price", ascending=False).head(5)


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Last Updated,Android From,Android To,Genres_1,Genres_2
5489,AP Art History Flashcards,FAMILY,5.0,1.0,96.0,10,Paid,29.99,Mature 17+,2016-01-19,4.0,Present,Education,
7477,USMLE Step 2 CK Flashcards,FAMILY,5.0,1.0,40.0,10,Paid,19.99,Everyone,2014-05-26,2.2,Present,Education,
5482,meStudying: AP English Lit,FAMILY,5.0,1.0,0.639648,10,Paid,4.99,Everyone,2013-08-31,2.0.1,Present,Education,
7204,TI-84 CE Graphing Calculator Manual TI 84,FAMILY,5.0,1.0,27.0,100,Paid,4.99,Everyone,2018-03-28,4.1,Present,Education,
5246,Hey AJ! It's Bedtime!,FAMILY,5.0,1.0,63.0,10,Paid,4.99,Everyone,2018-04-04,2.3,Present,Education,


## 5.	How many apps have received more than 50K reviews?
2525 Apps

In [189]:
df[df["Reviews"] >= 50000]["Reviews"].count()

np.int64(2525)

## 6.	What is the average price of apps, grouped by genre and number of installs?
below are the results

In [None]:
# Paid Apps Group by both genres and installs
df[df["Type"] == "Paid"].groupby(["Genres_1", "Installs"])["Price"].mean()

Genres_1  Installs
Action    50          1.990000
          100         2.740000
          500         1.990000
          1000        1.490000
          10000       5.275714
                        ...   
Weather   500         2.490000
          1000        6.990000
          10000       2.990000
          100000      3.823333
          500000      5.990000
Name: Price, Length: 271, dtype: float64

In [195]:
# Paid Apps Group by genres
df[df["Type"] == "Paid"].groupby("Genres_1")["Price"].mean()

Genres_1
Action                       3.415926
Adventure                    4.456667
Arcade                       2.781667
Art & Design                 1.990000
Auto & Vehicles              4.490000
Board                        3.052222
Books & Reference            4.225000
Business                    14.607500
Card                         2.427500
Casino                      14.000000
Casual                       3.302500
Communication                3.079259
Dating                       4.490000
Education                    5.250000
Educational                  3.354286
Entertainment               87.635789
Events                     109.990000
Finance                    170.637059
Food & Drink                 4.240000
Health & Fitness             4.290000
Libraries & Demo             0.990000
Lifestyle                  124.256316
Maps & Navigation            5.390000
Medical                     12.151071
Music & Video                1.990000
News & Magazines             1.990000
Par

In [196]:
# Paid Apps Group by installs
df[df["Type"] == "Paid"].groupby( "Installs")["Price"].mean()

Installs
0           120.541000
1             8.537619
5             5.948333
10            6.396571
50            3.227059
100           8.508298
500           3.365500
1000         17.429209
5000         21.799692
10000        20.155932
50000        14.338293
100000        9.582235
500000        3.198333
1000000       2.585238
10000000      4.990000
Name: Price, dtype: float64

## 7.	How many apps have a rating higher than 4.7, and what is their average price?
Apps with rating more than 4.7 are 586
- Average Price of Apps with rating more than 4.7 is 0.455 USD
- Average Price of paid Apps with rating more than 4.7 is 3.81 USD

In [198]:
df[df["Rating"] > 4.7]["Rating"].count()

np.int64(586)

In [199]:
df[df["Rating"] > 4.7]["Price"].mean()

np.float64(0.455)

In [200]:
df[(df["Rating"] > 4.7) & (df["Type"]=="Paid")]["Price"].mean()

np.float64(3.8090000000000006)

## 8.	What is Google's estimated revenue from apps with 5,000,000+ installs?
(Assuming Google takes a 30% cut from app sales)
- For Apps with installs over 5,000,000 and less than 10,000,000 : 0 USD
- For Apps with installs over 5,000,000: 44,910,000 USD


In [222]:
# For installs over 5,000,000 and less than 10,000,000 - assuming all are 5,000,000
f"{float(df[df["Installs"] == 5000000].eval("(Price * Installs)").sum()*.3):,.2f}"


'0.00'

In [221]:
df[df["Installs"] == 5000000]["Type"].unique()

array(['Free'], dtype=object)

all the apps having 5,000,000 installs are free - 0 USD

In [223]:
# For installs over 5,000,000 - assuming the quantity of installs are the same  as written (not more)
f"{float(df[df["Installs"] > 5000000].eval("(Price * Installs)").sum()*.3):,.2f}"


'44,910,000.00'

## 9.	What are the maximum and minimum sizes of free vs. paid apps?
- Maximum Free: 100 MB
- Minimum Free: 8.5 KB
- Maximum Paid: 100 MB
- Minimum Paid: 14 KB

In [224]:
df[df["Type"] == "Free"]["Size"].max()

np.float64(100.0)

In [228]:
df[df["Type"] == "Free"]["Size"].min()*1024

np.float64(8.5)

In [226]:
df[df["Type"] == "Paid"]["Size"].max()

np.float64(100.0)

In [229]:
df[df["Type"] == "Paid"]["Size"].min()*1024

np.float64(14.0)

## 10.	Is there a correlation between an app’s rating, number of reviews, size, and its price?

No, weak correlation at all as the correlation between them is less than 0.25

In [273]:
df_num = df.select_dtypes(["float", "int"])
df_num.sample(5)

Unnamed: 0,Rating,Reviews,Size,Installs,Price
3852,4.2,301413.0,48.0,5000000,0.0
4321,4.6,2694969.0,78.0,100000000,0.0
6498,3.9,8185.0,45.0,500000,0.0
446,4.2,15287.0,17.0,1000000,0.0
2536,4.4,65.0,37.0,10000,0.0


In [275]:
df_num.corr()

Unnamed: 0,Rating,Reviews,Size,Installs,Price
Rating,1.0,0.068738,0.082339,0.050925,-0.022301
Reviews,0.068738,1.0,0.237896,0.634998,-0.009416
Size,0.082339,0.237896,1.0,0.16888,-0.023762
Installs,0.050925,0.634998,0.16888,1.0,-0.011146
Price,-0.022301,-0.009416,-0.023762,-0.011146,1.0


## 11.	How many apps exist for each type (free/paid) across different content ratings?


In [None]:
df.groupby("Type")["Content Rating"].value_counts()

Type  Content Rating 
Free  Everyone           7721
      Teen               1094
      Mature 17+          428
      Everyone 10+        345
      Adults only 18+       3
      Unrated               2
Paid  Everyone            662
      Teen                 52
      Everyone 10+         32
      Mature 17+           19
Name: count, dtype: int64

## 12.	How many apps are compatible with Android version 4.x?
8453


In [267]:
def check_version_before(text):
    if text[0].isnumeric():
        if int(text[0]) <= 4:
            return True
        else:
            return False
    else:
        return False

def check_version_after(text):
    if text[0].isnumeric():
        if int(text[0]) >= 4:
            return True
        else:
            return False
    elif text.strip() == "Present":
        return True
    else:
        return False
    
df["Android4"] = (
    df["Android From"].apply(check_version_before) &
    df["Android To"].apply(check_version_after)
)

df[df["Android4"]==True]["Android4"].count()

np.int64(8453)