# Cleaning google dataset

__Purpose__

This notebook is to clean and then test google dataset.

In [1]:
from os.path import exists, isfile

import re

import pandas as pd
import numpy as np
import math
import great_expectations as ge

In [2]:
save_path = '../../datasets/2100_clean_google.csv'

In [3]:
if not exists("../../datasets/1300_kaggle_dataset_google.csv"):
    print ("Missing dataset file")
    
df_google=ge.read_csv("../../datasets/1300_kaggle_dataset_google.csv")
df_google.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Price,Content Rating
0,Photo Editor & Candy Camera & Grid & ScrapBook,Utilities,4.1,159,19M,0,Everyone
1,Coloring book moana,Utilities,3.9,967,14M,0,Everyone
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",Utilities,4.7,87510,8.7M,0,Everyone
3,Sketch - Draw & Paint,Utilities,4.5,215644,25M,0,Teen
4,Pixel Draw - Number Art Coloring Book,Utilities,4.3,967,2.8M,0,Everyone


# Genre Testing

We have previously normalized the genres. We expect that new genres will only be those we want.

In [4]:
valid_genre = set(['Utilities', 'Auto & Vehicles', 'Books & Reference', 'Business',
       'Entertainment', 'Social Networking', 'Education', 'News',
       'Food & Drink', 'Health & Fitness', 'Others', 'Lifestyle', 'Games'])

df_google.expect_column_values_to_be_in_set('Category', valid_genre)

{'success': True,
 'result': {'element_count': 9366,
  'missing_count': 0,
  'missing_percent': 0.0,
  'unexpected_count': 0,
  'unexpected_percent': 0.0,
  'unexpected_percent_nonmissing': 0.0,
  'partial_unexpected_list': []}}

In [5]:
df_google.info()

<class 'great_expectations.dataset.pandas_dataset.PandasDataset'>
RangeIndex: 9366 entries, 0 to 9365
Data columns (total 7 columns):
App               9366 non-null object
Category          9366 non-null object
Rating            9366 non-null float64
Reviews           9366 non-null int64
Size              9366 non-null object
Price             9366 non-null object
Content Rating    9366 non-null object
dtypes: float64(1), int64(1), object(5)
memory usage: 512.3+ KB


# Size

We want size to be a numerical feature. We want "size" to be a float, which is, "Megabytes" of each app.

In [6]:
def clean_size(x):
    x = str(x)
    if x == 'Varies with device':
        return np.nan
    else:
        return float(x[:-1])

df_google['Size'] = df_google['Size'].map(clean_size)

In [7]:
df_google['Size'].describe()

count    7729.000000
mean       37.284513
std        93.509493
min         1.000000
25%         6.100000
50%        16.000000
75%        37.000000
max       994.000000
Name: Size, dtype: float64

# Price

The currency of apps' prices is always USD. We want price to be a numerical feature as well.

In [8]:
def clean_price(x):
    x = str(x)
    if x=='0':
        return 0
    else:
        return float(x[1:])
    
df_google['Price'] = df_google['Price'].map(clean_price)

In [9]:
df_google['Price'].describe()

count    9366.000000
mean        0.960928
std        15.816585
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max       400.000000
Name: Price, dtype: float64

# Content Rating

Since we will compare the content rating of apple apps and google apps later, we should have the same standard for both. Here we normalize those apps' content rating.

In [10]:
df_google['Content Rating'] = df_google['Content Rating'].replace({'Adults only 18+': 'Mature 17+'})
df_google['Content Rating'].unique()

array(['Everyone', 'Teen', 'Everyone 10+', 'Mature 17+', 'Unrated'],
      dtype=object)

# Title testing

Every app must have a valid title. We also have to check if the titles are unique, because we are not sure if those apps with the same titles are actually different. If not, we only keep one of them.

In [11]:
df_google.expect_column_values_to_not_be_null('App')

{'success': True,
 'result': {'element_count': 9366,
  'missing_count': 0,
  'missing_percent': 0.0,
  'unexpected_count': 0,
  'unexpected_percent': 0.0,
  'partial_unexpected_list': []}}

In [12]:
df_google.expect_column_values_to_be_unique('App')

{'success': False,
 'result': {'element_count': 9366,
  'missing_count': 0,
  'missing_percent': 0.0,
  'unexpected_count': 1959,
  'unexpected_percent': 0.20916079436258808,
  'unexpected_percent_nonmissing': 0.20916079436258808,
  'partial_unexpected_list': ['Coloring book moana',
   'UNICORN - Color By Number & Pixel Art Coloring',
   'Textgram - write on photos',
   'Wattpad 📖 Free Books',
   'Amazon Kindle',
   'Dictionary - Merriam-Webster',
   'NOOK: Read eBooks & Magazines',
   'Oxford Dictionary of English : Free',
   'Spanish English Translator',
   'NOOK App for NOOK Devices',
   'Ebook Reader',
   'English Dictionary - Offline',
   'Docs To Go™ Free Office Suite',
   'Google My Business',
   'OfficeSuite : Free Office + PDF Editor',
   'Curriculum vitae App CV Builder Free Resume Maker',
   'Facebook Pages Manager',
   'Box',
   'Call Blocker',
   'ZOOM Cloud Meetings']}}

In [13]:
df_google['Rating'].describe()

count    9366.000000
mean        4.191757
std         0.515219
min         1.000000
25%         4.000000
50%         4.300000
75%         4.500000
max         5.000000
Name: Rating, dtype: float64

We have some apps with the same titles. Here we find out why.

In [14]:
df_google[df_google['App']=='UNICORN - Color By Number & Pixel Art Coloring']

Unnamed: 0,App,Category,Rating,Reviews,Size,Price,Content Rating
35,UNICORN - Color By Number & Pixel Art Coloring,Utilities,4.7,8145,24.0,0.0,Everyone
6385,UNICORN - Color By Number & Pixel Art Coloring,Others,4.7,8264,24.0,0.0,Everyone


In [15]:
df_google[df_google['App']=='Textgram - write on photos']

Unnamed: 0,App,Category,Rating,Reviews,Size,Price,Content Rating
41,Textgram - write on photos,Utilities,4.4,295221,,0.0,Everyone
4080,Textgram - write on photos,Utilities,4.4,295237,,0.0,Everyone


In [16]:
df_google[df_google['App']=='Wattpad 📖 Free Books'][['Reviews', 'Rating']]

Unnamed: 0,Reviews,Rating
132,2914724,4.6
4518,2915189,4.6


It seems that apps with the same titles are the same apps of different version. We only keep one of them.

In [17]:
df_google[~df_google.duplicated(subset=['App'])].shape

(8196, 7)

In [18]:
df_google = df_google[~df_google.duplicated(subset=['App'])]
df_google.shape

(8196, 7)

In [19]:
df_google.expect_column_values_to_be_unique('App')

{'success': True,
 'result': {'element_count': 8196,
  'missing_count': 0,
  'missing_percent': 0.0,
  'unexpected_count': 0,
  'unexpected_percent': 0.0,
  'unexpected_percent_nonmissing': 0.0,
  'partial_unexpected_list': []}}

# Ratings & Reviews

Ratings given by less than 3 users are not reliable. We only keep those apps with more than 3 users giving ratings to them.

In [20]:
min_reviews = 3
df_google = df_google[df_google['Reviews']>min_reviews]
df_google.shape

(7971, 7)

In [21]:
df_google.loc[df_google['Reviews']>0,'log_google_reviews'] = df_google[df_google['Reviews']>0]['Reviews'].apply(lambda x: math.log(x, 10))

In [22]:
df_google.expect_column_min_to_be_between('Reviews', min_reviews, 100000000)

{'success': True,
 'result': {'observed_value': 4,
  'element_count': 7971,
  'missing_count': 0,
  'missing_percent': 0.0}}

While we are dealing with ratings let's normalise google and apple ratings so that they are on the same scale. 

In [23]:
df_google['normed_rating'] = df_google['Rating']/df_google['Rating'].max()
df_google.expect_column_values_to_be_between('normed_rating', 0, 1)

{'success': True,
 'result': {'element_count': 7971,
  'missing_count': 0,
  'missing_percent': 0.0,
  'unexpected_count': 0,
  'unexpected_percent': 0.0,
  'unexpected_percent_nonmissing': 0.0,
  'partial_unexpected_list': []}}

Also, we will calculate z scores.

In [24]:
def z_score(column, df):
    return (df[column] - df[column].mean())/df[column].std()

df_google['z_score'] = z_score('Rating', df_google)

In [25]:
df_google[df_google['Reviews'].isnull()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Price,Content Rating,log_google_reviews,normed_rating,z_score


Rename the columns.

In [26]:
df_google.columns=['google_title', 'google_genre', 'google_rating', 'google_reviews', 'google_size', 'google_price', 'google_pegi', 'log_google_reviews', 'normed_google_rating', 'z_score_google']

# Save Cleaned & Validated Dataset

In [27]:
df_google.to_csv(save_path, index=False)
df_google.shape

(7971, 10)