# Google Play Store Dataset
Exploratory Data Analysis

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px 

## Data Loading and Exploration

In [2]:
# Read the Google Play Store listing (path is relative to the notebook) into `df`.
df = pd.read_csv(filepath_or_buffer="../data/googleplaystore.csv")

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  int64  
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10841 non-null  object 
 9   Genres          10840 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10839 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 1.1+ MB


In [5]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(10841, 13)

## Statistical Analysis 

In [61]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
# NOTE(review): the saved output lists Size, Installs and Price, but df.info() above
# reports those columns as object dtype right after loading. This cell appears to have
# been executed after the conversion cells further down, so a fresh top-to-bottom run
# would summarise only Rating and Reviews at this point.
df.describe()

Unnamed: 0,Rating,Reviews,Size,Installs,Price
count,9367.0,10841.0,9146.0,10841.0,10841.0
mean,4.191513,444111.9,21.514141,15462910.0,1.027273
std,0.515735,2927629.0,22.588679,85025570.0,15.948971
min,1.0,0.0,0.008301,0.0,0.0
25%,4.0,38.0,4.9,1000.0,0.0
50%,4.3,2094.0,13.0,100000.0,0.0
75%,4.5,54768.0,30.0,5000000.0,0.0
max,5.0,78158310.0,100.0,1000000000.0,400.0


## Total Columns

In [7]:
# Column labels of the frame.
df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

## Data Types of Columns

In [14]:
# dtype of every column.
# NOTE(review): the saved output shows Size as float64 although df.info() above reports
# it as object at load time; this cell appears to have been executed after the Size
# conversion cell below, so a fresh top-to-bottom run would show object here.
df.dtypes

App                object
Category           object
Rating            float64
Reviews             int64
Size              float64
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

## 1. Encode Size Column

In [11]:
# Convert the textual Size labels (e.g. "19M", "201k", "Varies with device") into a
# float number of megabytes, using vectorised string operations instead of a row-wise
# apply with a nested conditional lambda.
#
# Going through the string form keeps the cell re-runnable: once Size is already
# float64 the values round-trip through their text representation unchanged.
size_text = df['Size'].astype(str).str.strip()

# Remember which rows were expressed in kilobytes before the unit suffix is dropped.
is_kilobytes = size_text.str.endswith('k', na=False)

size_mb = (
    size_text
    .str.rstrip('kM')                          # drop the trailing unit suffix
    .replace('Varies with device', np.nan)     # no fixed size -> missing value
    .astype(float)                             # still raises on any unexpected label
)

# Kilobyte values are scaled to megabytes; megabyte values are kept as they are.
df['Size'] = size_mb.where(~is_kilobytes, size_mb / 1024)
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [64]:
# dtype of the Size column after the conversion step.
df['Size'].dtype

dtype('float64')

## 2. Encode Installs Column

In [42]:
# Turn install labels such as "10,000+" into plain numbers.
#
# Stripping the thousands separators and the trailing "+" alone leaves the column as
# text, so the values are also cast to float here; the dtype check that follows and the
# plots that use Installs as a numeric axis rely on that.
# Going through the string form keeps the cell re-runnable once the column is numeric.
df['Installs'] = (
    df['Installs']
    .astype(str)
    .str.replace(r'[,+]', '', regex=True)   # remove "," and "+" in one pass
    .astype(float)                          # raises on any label that is not a number
)

In [63]:
# dtype of the Installs column after the clean-up step.
df['Installs'].dtype

dtype('float64')

## 3. Encode Price Column

In [44]:
# Remove the currency symbol from the price labels (e.g. "$4.99" -> "4.99").
# regex=False makes "$" a literal character: as a regular expression it is the
# end-of-string anchor, and the default value of `regex` differs between pandas versions.
# astype(str) keeps the cell re-runnable after Price has already been converted to float.
df['Price'] = df['Price'].astype(str).str.replace('$', '', regex=False)

In [60]:
# Cast every price value to a Python float, element by element.
price_as_float = df['Price'].map(float)
df['Price'] = price_as_float

In [62]:
# dtype of the Price column after the conversion step.
df['Price'].dtype

dtype('float64')

---
# Plots 

## 1. Bar Plot

In [111]:
# Bar chart: Price on the y-axis per Category, with bars coloured by Genres.
px.bar(
    data_frame=df,
    x="Category",
    y="Price",
    color="Genres",
)

## 2. Area Plot

In [114]:
# Area chart: Reviews on the y-axis against Installs on the x-axis.
px.area(
    data_frame=df,
    x="Installs",
    y="Reviews",
)

## 3. Scatter Plot

In [92]:
# Scatter plot: Installs (y) against Size (x), one colour per Category.
px.scatter(
    data_frame=df,
    x="Size",
    y="Installs",
    color="Category",
)