In [None]:


import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno

# Load the raw Play Store export and preview five randomly chosen rows.
df = pd.read_csv('googleplaystore.csv')
df.sample(n=5)
App	Category	Rating	Reviews	Size	Installs	Type	Price	Content Rating	Genres	Last Updated	Current Ver	Android Ver
78	Best Car Wallpapers	AUTO_AND_VEHICLES	4.5	994	2.9M	100,000+	Free	0	Everyone	Auto & Vehicles	27-Mar-18	1.6	4.1 and up
7154	Design innovation CD Cassette	LIFESTYLE	NaN	5	8.3M	500+	Free	0	Everyone	Lifestyle	6-Feb-17	1	2.3 and up
4395	Guns'n'Glory Zombies Premium	FAMILY	4.1	313	34M	5,000+	Paid	$2.99	Everyone 10+	Strategy	15-Jul-16	1.1.5	2.3.3 and up
3179	Southwest Airlines	TRAVEL_AND_LOCAL	3.9	24781	8.3M	5,000,000+	Free	0	Everyone	Travel & Local	11-Jul-18	5.6.1	6.0 and up
5083	AG Contacts, Premium edition	COMMUNICATION	4.5	88	3.7M	500+	Paid	$4.99	Everyone	Communication	18-May-18	8.05.18.14250	4.0.3 and up
Data Cleaning
1. Which of the following column(s) has/have null values?

# Per-column count of missing values.
df.isna().sum()
App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64
2. Clean the Rating column and the other columns containing null values

# Ratings outside the 0-5 range are treated as missing.
df.loc[(df['Rating'] < 0) | (df['Rating'] > 5), 'Rating'] = np.nan
# Fill missing ratings with the mean of the remaining ratings.  The result is
# assigned back to the column: calling fillna(inplace=True) on `df.Rating`
# acts on an intermediate object and does not reliably update `df`.
df['Rating'] = df['Rating'].fillna(df['Rating'].mean())
# Rows missing any of these columns are dropped rather than imputed.
df.dropna(subset=['Type', 'Content Rating', 'Current Ver', 'Android Ver'], inplace=True)
3. Clean the column Reviews and make it numeric

def reviews(val):
    """Expand a review count written with an 'M' or 'k' suffix.

    A value containing 'M' is converted to a float and multiplied by
    1,000,000; a value containing 'k' is converted to a float and
    multiplied by 1,000.  Any other value is returned unchanged.
    """
    # 'M' is checked before 'k', in that order.
    for suffix, factor in (('M', 1_000_000), ('k', 1000)):
        if suffix in val:
            return float(val.strip(suffix)) * factor
    return val
# Expand the 'M'/'k' suffixes, then convert to numeric; anything that still
# cannot be parsed becomes NaN.
df['Reviews'] = pd.to_numeric(df['Reviews'].apply(reviews), errors='coerce')
4. How many duplicated apps are there?

# keep=False marks every row whose App name occurs more than once, so this
# counts all rows involved in a duplication, not just the extra copies.
df.duplicated(subset=['App'], keep=False).sum()
1979
5. Drop duplicated apps keeping the ones with the greatest number of reviews

# Sort so that, within each app, the row with the most reviews comes last.
# NaN review counts are placed first so that keep='last' never prefers a row
# with a missing count over one with a real count.
df_sorted = df.sort_values(by=['App', 'Reviews'], na_position='first')
# Keep only the last (most-reviewed) row for each app name.
df_sorted = df_sorted.drop_duplicates(subset=['App'], keep='last')
df = df_sorted
6. Format the Category column

# 'AUTO_AND_VEHICLES' -> 'Auto and vehicles': underscores become spaces and
# only the first letter stays upper-case.
df['Category'] = df['Category'].map(lambda name: name.replace('_', ' ').capitalize())
​
7. Clean and convert the Installs column to numeric type

# '1,000,000+' -> 1000000: drop the '+' marker and the thousands separators,
# then cast to a 64-bit integer.
df['Installs'] = (
    df['Installs']
    .str.strip('+')
    .str.replace(',', '', regex=False)
    .astype('int64')
)
8. Clean and convert the Size column to numeric (representing bytes)

def size(val):
    """Convert a size label such as '19M' or '201k' to a number of bytes.

    Thousands separators (',') and '+' markers are removed before parsing.
    A value containing 'M' is read as mebibytes (x 1024**2) and a value
    containing 'k' as kibibytes (x 1024).  Any other value (for example
    'Varies with device') yields 0.

    Raises:
        ValueError: if the text left after removing the unit letter is not
            a valid number.
    """
    # Clean once and parse the cleaned text.  Cleaning only inside the
    # condition made values such as '1,024k' fail in float().
    cleaned = val.replace(',', '').replace('+', '')
    if 'M' in cleaned:
        return float(cleaned.strip('M')) * (1024 ** 2)
    elif 'k' in cleaned:
        return float(cleaned.strip('k')) * 1024
    else:
        return 0
# Replace each size label with the value returned by size().
df['Size'] = df['Size'].map(size)
​
9. Clean and convert the Price column to numeric

# Strip the currency sign ('$' is matched literally, not as a regex anchor),
# convert to numeric, and treat anything unparseable as 0.0.  The result is
# assigned back to the column: calling fillna(inplace=True) on `df.Price`
# acts on an intermediate object and does not reliably update `df`.
df['Price'] = pd.to_numeric(
    df['Price'].str.replace('$', '', regex=False),
    errors='coerce',
).fillna(0.0)
10. Paid or free?

# A price of exactly 0.0 means 'Free'; every other value is labelled 'Paid'.
df['Distribution'] = np.where(df['Price'] == 0.0, 'Free', 'Paid')
Analysis
11. Which app has the most reviews?

# App name(s) whose review count equals the column maximum.
df.loc[df['Reviews'].eq(df['Reviews'].max()), 'App']
2544    Facebook
Name: App, dtype: object
12. What category has the highest number of apps uploaded to the store?

# Category with the largest number of apps.
df.groupby('Category')['App'].count().idxmax()
'Family'
13. Which category does the most expensive app belong to?

# Category whose highest-priced app is the most expensive overall.
df.groupby('Category')['Price'].max().idxmax()
'Lifestyle'
14. What's the name of the most expensive game?

# Most expensive game: sort games by Price, highest first, and take the top row.
(
    df.loc[df['Category'].eq('Game'), ['App', 'Price']]
    .sort_values(by='Price', ascending=False)
    .iloc[0]
)
App      The World Ends With You
Price                      17.99
Name: 4203, dtype: object
15. Which is the most popular Finance App?

# Most-installed Finance app: sort ascending by Installs and take the final row.
(
    df.loc[df['Category'].eq('Finance'), ['App', 'Installs']]
    .sort_values(by='Installs')
    .iloc[-1]
)
App         Google Pay
Installs     100000000
Name: 5601, dtype: object
16. What Teen Game has the most reviews?

# Teen-rated game with the most reviews: sort ascending by Reviews and take
# the final row.
(
    df.loc[
        df['Category'].eq('Game') & df['Content Rating'].eq('Teen'),
        ['App', 'Reviews'],
    ]
    .sort_values(by='Reviews')
    .iloc[-1]
)
App        Asphalt 8: Airborne
Reviews              8389714.0
Name: 3912, dtype: object
17. Which is the free game with the most reviews?

# Free game with the most reviews: sort ascending by Reviews and take the
# final row.
(
    df.loc[
        df['Distribution'].eq('Free') & df['Category'].eq('Game'),
        ['App', 'Reviews'],
    ]
    .sort_values(by='Reviews')
    .iloc[-1]
)
App        Clash of Clans
Reviews        44893888.0
Name: 1879, dtype: object
18. How many TB (terabytes) were transferred (overall) for the most popular Lifestyle app?

# Most-installed Lifestyle app: sort ascending by Installs and take the final row.
(
    df.loc[df['Category'].eq('Lifestyle'), ['App', 'Installs']]
    .sort_values(by='Installs')
    .iloc[-1]
)
App            Tinder
Installs    100000000
Name: 4587, dtype: object

# Look up the most-installed Lifestyle app and its install count from the data
# instead of copying the name and the number by hand from a printed output.
top_lifestyle = (
    df.loc[df['Category'] == 'Lifestyle', ['App', 'Installs']]
    .sort_values(by='Installs')
    .iloc[-1]
)
# Total transferred = app size * number of installs, divided by 1024**4.
# NOTE(review): assumes Size has already been converted to bytes — confirm.
(df.loc[df['App'] == top_lifestyle['App'], 'Size'] * top_lifestyle['Installs']) / (1024**4)
4587    6484.985352
Name: Size, dtype: float64