# Exploratory Data Analysis on Playstore Database

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

> ### **Step 2: Data loading, exploring, and cleaning**

- ##### **Data Loading**

In [2]:
# Read the Play Store export (path is relative to the working directory) into the working frame
playstore_csv = 'googleplaystore.csv'
df = pd.read_csv(playstore_csv)

- ##### **Note**

In [3]:
# Lift pandas' display limits so frames are shown without truncating columns or rows
for option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option, None)
# Suppress every warning for the remainder of the session
import warnings
warnings.filterwarnings(action='ignore')

In [4]:
# List the column labels of the loaded frame
df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [5]:
# (number of rows, number of columns) of the loaded frame
df.shape

(10841, 13)

In [6]:
# Per-column non-null counts and dtypes; shows which columns were read as text (object)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [7]:
# Summary statistics; describe() only covers numeric columns by default.
# NOTE(review): the saved output reports max Rating = 19.0, which looks out of
# range for a rating column -- verify the offending row(s) before analysis.
df.describe()

Unnamed: 0,Rating
count,9367.0
mean,4.193338
std,0.537431
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,19.0


In [8]:
# Preview the first two rows to see the raw value formats
df.head(2)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up


- ##### **If we look at the dataset, we can see that the Rating, Reviews, Size, Installs, and Price columns are meant to be numeric, so we have to convert them to numeric types first.**

----
- ##### **Let's first target Size column**
> Observations
- "Varies with device"
- M
- k
- '+'
- ---
> This means we have to replace the above values with numbers

In [9]:
# Count missing Size entries (none are expected)
df['Size'].isna().sum()

0

In [10]:
# Count the Size entries containing an 'M' (matched case-insensitively)
total_m = df['Size'].str.contains('M', case=False).sum()

In [11]:
# Count the Size entries containing a 'k' (matched case-insensitively)
total_k = df['Size'].str.contains('k', case=False).sum()

In [12]:
# Count the Size entries containing 'Varies with device' (matched case-insensitively)
total_varies = df['Size'].str.contains('Varies with device', case=False).sum()

In [13]:
# Count the Size entries that contain a literal '+'.
# regex=False makes pandas treat '+' as plain text, so no backslash escape is
# needed; the previous '\+' in a normal (non-raw) string literal is an invalid
# escape sequence that newer Python versions warn about.
total_add = df['Size'].str.contains('+', regex=False).sum()

In [14]:
# Sanity check: the four pattern counts together should equal the number of rows in the frame
sum((total_k, total_m, total_varies, total_add))

10841

In [15]:
# Row count of the frame, to compare against the combined pattern counts
len(df)

10841

- **Converting string into bytes**

In [16]:
def convertbytes(size):
    """Convert a Play Store size label into a number of bytes.

    Parameters
    ----------
    size : str or any
        Raw value from the Size column, e.g. ``'19M'``, ``'201k'``,
        ``'1,000+'`` or ``'Varies with device'``.

    Returns
    -------
    float
        * ``'<n>k'``  -> ``n * 1024``
        * ``'<n>M'``  -> ``n * 1024 * 1024``
        * ``'<n>+'``  -> ``n`` taken as-is (thousands separators removed)
        * any other string -> ``np.nan``

        Non-string inputs (already-converted numbers, NaN) are returned
        unchanged, so applying the function twice is harmless.

    Raises
    ------
    ValueError
        If the text left after removing the unit marker is not a valid number.
    """
    if not isinstance(size, str):
        return size
    # Suffix checks are case-sensitive and tried in this order: k, M, +.
    if 'k' in size:
        return float(size.replace('k', '')) * 1024
    if 'M' in size:
        return float(size.replace('M', '')) * 1024 * 1024
    if '+' in size:
        # Test for a plain '+': the old '\+' looked for a backslash followed by
        # '+', which never occurs, so labels such as '1,000+' silently became
        # NaN. The comma must go too, otherwise float() rejects '1,000'.
        return float(size.replace('+', '').replace(',', ''))
    # Labels without a recognised unit (e.g. 'Varies with device') carry no number.
    return np.nan
# Replace every Size label with its numeric value, then show the resulting dtype
df['Size'] = df['Size'].map(convertbytes)
df['Size'].dtype

dtype('float64')

- **Now rename the size column**

In [17]:
# Put the unit in the column name now that Size holds byte counts (frame is modified in place)
size_column_rename = {'Size': 'bytes_Size'}
df.rename(columns=size_column_rename, inplace=True)

In [18]:
# Preview the first row; the size column should now appear as bytes_Size
df.head(1)

Unnamed: 0,App,Category,Rating,Reviews,bytes_Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19922944.0,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up


In [34]:
# Express the byte counts in megabytes, using 1 MB = 1024 * 1024 bytes
bytes_per_mb = 1024 * 1024
df['Size In MB'] = df['bytes_Size'] / bytes_per_mb

In [36]:
# Preview the first row to check the new 'Size In MB' column.
# NOTE(review): the saved output of this cell also shows a 'Size_MB' column
# holding the text 'Size in MB'; no visible cell creates it, so it appears to
# come from a deleted or out-of-order cell -- confirm with Restart & Run All.
df.head(1)

Unnamed: 0,App,Category,Rating,Reviews,Size_MB,bytes_Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Size In MB
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,Size in MB,19922944.0,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,19.0


In [37]:
# List the current column labels
df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size_MB', 'bytes_Size',
       'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated',
       'Current Ver', 'Android Ver', 'Size In MB'],
      dtype='object')

In [40]:
# 'Size_MB' is a leftover placeholder column that no visible cell creates, so
# on a fresh kernel (Restart & Run All) it may not exist. errors='ignore' lets
# this cell succeed either way instead of raising KeyError, and also makes it
# safe to re-run.
df.drop(columns='Size_MB', errors='ignore', inplace=True)

In [41]:
# Preview the first row to confirm the remaining columns
df.head(1)

Unnamed: 0,App,Category,Rating,Reviews,bytes_Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Size In MB
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19922944.0,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,19.0
