# Downloading the Apple dataset from Kaggle

__Purpose__ 

This notebook downloads the Apple dataset from Kaggle. Genres will be "normalized".

In [1]:
import subprocess
from os.path import isfile, exists
import pandas as pd
import numpy as np

In [2]:
# File name under which the cleaned Apple dataset is saved at the end of the notebook.
apple_kaggle_dataset = "1400_kaggle_dataset_apple.csv"

# Download & Unzip the Raw Apple App Store Dataset

To avoid logging in to Kaggle, the original dataset has been put on GitHub, from where it is downloaded. The dataset is first downloaded and then unzipped. We don't download the dataset every time: once it is stored in the datasets folder, we only need to open it.

In [3]:
# Folder, relative to this notebook, where the raw and cleaned datasets are stored.
datasets_dir = "../../datasets/"
# Set to True to download the raw file again even when a local copy already exists.
force_download = False

In [4]:
# Public mirror of the Kaggle file; the local copy keeps the remote file name.
apple_dataset = 'https://raw.githubusercontent.com/EloiseXu/Data-Science-in-Practice/master/datasets/AppleStore.csv'
local_filename = apple_dataset.split('/')[-1]
apple_filename = datasets_dir + local_filename

# Download only when the file is missing locally, or when a refresh is forced.
if force_download or not isfile(apple_filename):
    # The arguments are passed as a list instead of splitting a formatted
    # string, so a path containing whitespace stays a single argument.
    # --fail makes curl exit non-zero on an HTTP error rather than saving the
    # server's error page as the CSV, and check=True turns a non-zero exit
    # into a CalledProcessError so a failed download stops the notebook here.
    subprocess.run(
        ['curl', '--fail', '-L', apple_dataset, '--output', apple_filename],
        check=True,
    )
    # No gunzip step: the file is downloaded as a plain '.csv' and is read
    # from this exact path afterwards. gunzip only handles files with a
    # compressed suffix, so the former "gunzip <file>.csv -qq" call skipped
    # the file and merely returned a (silenced) warning status.

In [5]:
# Read the downloaded CSV into a DataFrame and display its (rows, columns).
apple_apps = pd.read_csv(filepath_or_buffer=apple_filename)
apple_apps.shape

(11100, 17)

In [6]:
# Preview ten random rows. The fixed random_state makes the preview identical
# on every "Restart & Run All" instead of changing with each execution.
apple_apps.sample(n=10, random_state=0)

Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic,game_enab
5702,1047566421,BFB Champions 2.0 ~Football Club Manager~,197970944,USD,0.0,33,0,4.0,0.0,2.1.0,4+,Games,38,5,3,1,0
5589,1042770650,ROCKY™,345278464,USD,0.0,989,132,4.0,4.5,1.1,9+,Games,38,5,1,1,0
4686,996898879,,0,,0.0,0,0,0.0,0.0,,,,1,1,1,0,0
3293,891700302,BCM Surfing Game - World Surf Tour,188578816,USD,2.99,81,7,4.0,5.0,3.4,4+,Games,38,5,1,1,0
9449,1131998468,,0,,0.0,0,0,0.0,0.0,,,,1,1,1,0,0
7035,1078214852,金钱谷软件-活期手机金融投资理财平台!,19781632,USD,0.0,0,0,0.0,0.0,1.1.4,4+,Finance,38,0,2,1,0
1469,517271093,Virtua Tennis Challenge,401727488,USD,4.99,1339,459,4.0,4.0,1.2,4+,Games,39,4,9,1,0
672,400213892,Reiner Knizia's Ra,34992128,USD,3.99,1114,110,4.5,4.0,1.5.1,4+,Games,45,5,1,1,0
7811,1094553959,,0,,0.0,0,0,0.0,0.0,,,,1,1,1,0,0
493,376197239,Météo-France,124090368,USD,0.0,24,2,3.5,5.0,5.7.1150,4+,Weather,37,4,3,1,0


In [7]:
# Count the missing values in each column.
apple_apps.isna().sum()

id                     0
track_name          3903
size_bytes             0
currency            3903
price                  0
rating_count_tot       0
rating_count_ver       0
user_rating            0
user_rating_ver        0
ver                 3903
cont_rating         3903
prime_genre         3903
sup_devices.num        0
ipadSc_urls.num        0
lang.num               0
vpp_lic                0
game_enab              0
dtype: int64

# Finding apps with valid titles & genres

We need to pick out apps that are released on both platforms based on their titles, so we drop the apps that do not have a valid title.

In [8]:
# Keep only the rows whose title is present, then show the remaining (rows, columns).
has_title = apple_apps['track_name'].notna()
apple_apps = apple_apps[has_title]
apple_apps.shape

(7197, 17)

Versions and number of languages of those apps are not helpful in this project. We only keep apps' title, genre, rating, size, etc.

In [9]:
# Columns kept for the rest of the notebook.
col_n = ['id', 'track_name', 'size_bytes', 'price', 'rating_count_tot', 'user_rating', 'cont_rating', 'prime_genre']
# Select by label rather than re-wrapping the frame in pd.DataFrame(...):
# a misspelled or missing column now raises KeyError instead of silently
# producing an all-NaN column. .copy() gives an independent frame, so the
# later column assignment does not write into a view of the filtered data.
apple_apps = apple_apps[col_n].copy()

In [10]:
# Number of non-null entries per column for every original genre.
apple_apps.groupby(by='prime_genre').count()

Unnamed: 0_level_0,id,track_name,size_bytes,price,rating_count_tot,user_rating,cont_rating
prime_genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Book,112,112,112,112,112,112,112
Business,57,57,57,57,57,57,57
Catalogs,10,10,10,10,10,10,10
Education,453,453,453,453,453,453,453
Entertainment,535,535,535,535,535,535,535
Finance,104,104,104,104,104,104,104
Food & Drink,63,63,63,63,63,63,63
Games,3862,3862,3862,3862,3862,3862,3862
Health & Fitness,180,180,180,180,180,180,180
Lifestyle,144,144,144,144,144,144,144


# Merging Genres

We merge some genres so there will be enough apps in each genre. 

In [11]:
# Normalised category -> the original `prime_genre` values merged into it.
# Built once at definition time instead of on every call.
_GENRE_GROUPS = {
    'Utilities': ['Utilities'],
    'Auto & Vehicles': ['Navigation', 'Travel'],
    'Food & Drink': ['Food & Drink'],
    'Health & Fitness': ['Health & Fitness', 'Sports', 'Medical'],
    'Lifestyle': ['Lifestyle', 'Shopping'],
    'Games': ['Games'],
    'Books & Reference': ['Book', 'Reference'],
    'Business': ['Finance', 'Business', 'Productivity'],
    'Entertainment': ['Entertainment', 'Music'],
    'Social Networking': ['Social Networking', 'Photo & Video'],
    'Education': ['Education'],
    'News': ['News'],
    'Others': ['Catalogs', 'Weather'],
}

# Inverted lookup: original genre -> normalised category.
_RAW_TO_NORMALISED = {
    raw: category
    for category, raws in _GENRE_GROUPS.items()
    for raw in raws
}


def normalise_genre(genre):
    """Map an original ``prime_genre`` value to its merged, normalised category.

    The value is converted to ``str`` and stripped of surrounding whitespace
    before the lookup. A value that is already a normalised category name is
    returned unchanged, so applying the function to an already-normalised
    column (for example when the cell is run a second time) is a no-op
    rather than an error.

    Parameters
    ----------
    genre : object
        Original genre label; anything accepted by ``str()``.

    Returns
    -------
    str
        The normalised category name.

    Raises
    ------
    KeyError
        If the label is neither a known original genre nor a normalised
        category name.
    """
    label = str(genre).strip()
    if label in _RAW_TO_NORMALISED:
        return _RAW_TO_NORMALISED[label]
    if label in _GENRE_GROUPS:
        # Already a normalised name such as 'Books & Reference' or 'Others'.
        return label
    raise KeyError('unknown genre: {!r}'.format(label))

In [12]:
# Replace every original genre with its merged, normalised category.
normalised_genres = apple_apps['prime_genre'].map(normalise_genre)
apple_apps['prime_genre'] = normalised_genres

# Save Final Dataset

In [13]:
# Write the cleaned table into the datasets folder without the row index,
# then show the final (rows, columns).
apple_apps.to_csv(
    path_or_buf='{}{}'.format(datasets_dir, apple_kaggle_dataset),
    index=False,
)
apple_apps.shape

(7197, 8)