# Combining the google and apple datasets

__Purpose__

This notebook is going to bring together the google and apple datasets.

In [1]:
from os.path import exists, isfile
import random
import time

import re
import math

import pandas as pd
import numpy as np
import math

In [2]:
save_path = '../../datasets/2300_combine_kaggle_datasets.csv'

In [3]:
if not exists("../../datasets/2200_clean_apple.csv"):
    print ("Missing dataset file")
    
df_apple=pd.read_csv("../../datasets/2200_clean_apple.csv")
df_apple.head()

Unnamed: 0,apple_id,apple_title,apple_size,apple_price,apple_reviews,apple_rating,apple_pegi,apple_genre,normed_apple_rating,z_score_apple,log_apple_reviews
0,281656475,PAC-MAN Premium,96.119141,3.99,21292,4.0,Everyone,Games,0.8,-0.083987,4.328216
1,281796108,Evernote - stay organized,151.232422,0.0,161065,4.0,Everyone,Business,0.8,-0.083987,5.207001
2,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",95.867188,0.0,188583,3.5,Everyone,Others,0.7,-0.806018,5.275503
3,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",122.558594,0.0,262241,4.0,Teen,Lifestyle,0.8,-0.083987,5.418701
4,282935706,Bible,88.476562,0.0,985920,4.5,Everyone,Books & Reference,0.9,0.638043,5.993842


In [4]:
if not exists("../../datasets/2100_clean_google.csv"):
    print ("Missing dataset file")
    
df_google=pd.read_csv("../../datasets/2100_clean_google.csv")
df_google.head()

Unnamed: 0,google_title,google_genre,google_rating,google_reviews,google_size,google_price,google_pegi,log_google_reviews,normed_google_rating,z_score_google
0,Photo Editor & Candy Camera & Grid & ScrapBook,Utilities,4.1,159,19.0,0.0,Everyone,2.201397,0.82,-0.143135
1,Coloring book moana,Utilities,3.9,967,14.0,0.0,Everyone,2.985426,0.78,-0.537987
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",Utilities,4.7,87510,8.7,0.0,Everyone,4.942058,0.94,1.041422
3,Sketch - Draw & Paint,Utilities,4.5,215644,25.0,0.0,Teen,5.333737,0.9,0.64657
4,Pixel Draw - Number Art Coloring Book,Utilities,4.3,967,2.8,0.0,Everyone,2.985426,0.86,0.251718


# Title to title mappings

The same apps can have different titles when they are released in different platforms. This happens when the title of each app is in the format like "A - B" or "A : B". However, some apps have titles like "A - B" or "A : B" but they are different. For example, different games in a series can have titles like "A - B", "A - C", and "A - D" but they cannot be considered as the same app.

A possible solution to this problem is, first take "A" out of "A - B" or "A : B" and save it as the key to combine two datasets. If such a key is unique in the new dataset, we simply consider it as a perfect match. However, when such a key is not unique, we only consider those matches with the same google title and apple title as the correct match.

In [5]:
def clean_title(x):
    x = str(x)
    pos1 = x.find('-')
    pos2 = x.find('–')
    pos3 = x.find(':')
    pos4 = x.find('(')
    if pos1 != -1:
        x = x[:pos1].strip()
    if pos2 != -1:
        x = x[:pos2].strip()
    if pos3 != -1:
        x = x[:pos3].strip()
    if pos4 != -1:
        x = x[:pos4].strip()
    r1 = '[’!"#$%&\'()*+,-./:;<=>?@，。?★、…【】《》？“”‘’！[\\]^_`{|}~]+'
    x = re.sub(r1, '', x)
    x = x.strip()
    return x

In [6]:
df_google['trim_title'] = df_google['google_title'].apply(clean_title)
df_apple['trim_title'] = df_apple['apple_title'].apply(clean_title)

In [7]:
combine_apps = df_apple.set_index('trim_title').join(
    df_google.set_index('trim_title'), how='inner').reset_index()
combine_apps.shape

(781, 22)

In [8]:
same_title = combine_apps[(combine_apps.duplicated('trim_title') == True) & (combine_apps['google_title'] == combine_apps['apple_title'])]
same_trim = combine_apps[combine_apps.duplicated('trim_title') == False]
combine_apps = same_title.append(same_trim)
combine_apps.sample(10)

Unnamed: 0,trim_title,apple_id,apple_title,apple_size,apple_price,apple_reviews,apple_rating,apple_pegi,apple_genre,normed_apple_rating,...,google_title,google_genre,google_rating,google_reviews,google_size,google_price,google_pegi,log_google_reviews,normed_google_rating,z_score_google
117,Carnivores,376412660,Carnivores: Dinosaur Hunter Pro,290.541016,2.99,18306,4.0,Teen,Games,0.8,...,Carnivores: Dinosaur Hunter,Games,4.2,62636,17.0,0.0,Teen,4.796824,0.84,0.054292
664,Toca Life,988318940,Toca Life: City,316.737305,2.99,3571,4.5,Everyone,Education,0.9,...,Toca Life: City,Education,4.7,31085,24.0,3.99,Everyone,4.492551,0.94,1.041422
470,PBS KIDS Video,435138734,PBS KIDS Video,55.021484,0.0,8651,3.5,Everyone,Education,0.7,...,PBS KIDS Video,Education,4.2,36212,,0.0,Everyone,4.558853,0.84,0.054292
44,Angry Birds Rio,420635506,Angry Birds Rio,131.912109,0.0,170843,4.5,Everyone,Games,0.9,...,Angry Birds Rio,Games,4.4,2610526,46.0,0.0,Everyone,6.416728,0.88,0.449144
180,DraftKings,710535379,"DraftKings - Daily Fantasy Golf, Baseball, & More",138.111328,0.0,20251,4.5,Mature 17+,Health & Fitness,0.9,...,DraftKings - Daily Fantasy Sports,Health & Fitness,4.5,50017,41.0,0.0,Mature 17+,4.699118,0.9,0.64657
17,Add,1187279979,Add-Ons Studio for Minecraft,21.933594,2.99,97,3.0,Everyone,Games,0.6,...,Add-On: Alcatel (h),Books & Reference,4.5,107,1.2,0.0,Everyone,2.029384,0.9,0.64657
757,YouCam Perfect,768469908,YouCam Perfect - Photo & Selfie Editor,125.537109,0.0,4293,4.5,Everyone,Social Networking,0.9,...,YouCam Perfect - Selfie Photo Editor,Books & Reference,4.5,1579287,,0.0,Everyone,6.198461,0.9,0.64657
416,My Talking Angela,909351158,My Talking Angela,380.30957,0.0,54549,4.5,Everyone,Games,0.9,...,My Talking Angela,Games,4.5,9881829,99.0,0.0,Everyone,6.994837,0.9,0.64657
108,Camera360,443354861,"Camera360 - Selfie Filter Camera, Photo Editor",153.693359,0.0,16729,4.5,Teen,Social Networking,0.9,...,Camera360: Selfie Photo Editor with Funny Sticker,Books & Reference,4.3,4865093,51.0,0.0,Everyone,6.687091,0.86,0.251718
154,DRAGON BALL Z DOKKAN BATTLE,951627425,DRAGON BALL Z DOKKAN BATTLE,111.095703,0.0,5362,4.0,Everyone 10+,Games,0.8,...,DRAGON BALL Z DOKKAN BATTLE,Games,4.1,650114,80.0,0.0,Teen,5.81299,0.82,-0.143135


In [9]:
combine_apps.shape

(604, 22)

# Difference between ratings

The difference of each app's two ratings is calculated here.

In [10]:
combine_apps['z_score_google_sub_apple'] = combine_apps['z_score_google'] - combine_apps['z_score_apple']
combine_apps.sample(5)

Unnamed: 0,trim_title,apple_id,apple_title,apple_size,apple_price,apple_reviews,apple_rating,apple_pegi,apple_genre,normed_apple_rating,...,google_genre,google_rating,google_reviews,google_size,google_price,google_pegi,log_google_reviews,normed_google_rating,z_score_google,z_score_google_sub_apple
650,Threema,578665578,Threema,23.770508,2.99,196,4.5,Everyone,Social Networking,0.9,...,Social Networking,4.5,51110,,2.99,Everyone,4.708506,0.9,0.64657,0.008527
685,Trivia Crack,651510680,Trivia Crack,253.185547,0.0,393469,4.5,Everyone,Games,0.9,...,Games,4.5,6427773,95.0,0.0,Everyone,6.808061,0.9,0.64657,0.008527
484,Peppa Pig,1179475725,Peppa Pig: Party Time,240.304688,0.99,9,2.0,Everyone,Education,0.4,...,Others,3.8,52,55.0,2.99,Everyone,1.716003,0.76,-0.735413,2.236697
123,Castle Clash,692669501,Castle Clash: Brave Squads,298.195312,0.0,47963,4.0,Everyone 10+,Games,0.8,...,Games,4.6,4578476,24.0,0.0,Everyone 10+,6.660721,0.92,0.843996,0.927983
307,GroupMe,392796698,GroupMe,63.766602,0.0,28260,4.5,Everyone,Social Networking,0.9,...,Social Networking,4.5,330761,,0.0,Everyone,5.519514,0.9,0.64657,0.008527


In [11]:
combine_apps['norm_google_sub_apple'] = combine_apps['normed_google_rating'] - combine_apps['normed_apple_rating']
combine_apps.sample(5)

Unnamed: 0,trim_title,apple_id,apple_title,apple_size,apple_price,apple_reviews,apple_rating,apple_pegi,apple_genre,normed_apple_rating,...,google_rating,google_reviews,google_size,google_price,google_pegi,log_google_reviews,normed_google_rating,z_score_google,z_score_google_sub_apple,norm_google_sub_apple
500,Plants vs Zombies™ 2,597986893,Plants vs. Zombies™ 2,93.944336,0.0,267394,4.5,Everyone 10+,Games,0.9,...,4.4,567632,15.0,0.0,Everyone 10+,5.754067,0.88,0.449144,-0.188899,-0.02
292,Google Photos,962194608,Google Photos - unlimited photo and video storage,152.792969,0.0,88742,5.0,Everyone,Social Networking,1.0,...,4.5,10858556,,0.0,Everyone,7.035772,0.9,0.64657,-0.713504,-0.1
290,Google Duo,1096918571,Google Duo - simple video calling,42.452148,0.0,1033,4.0,Everyone,Social Networking,0.8,...,4.6,2083237,,0.0,Everyone,6.318739,0.92,0.843996,0.927983,0.12
171,Doodle Jump,307727765,Doodle Jump,46.483398,0.99,395261,4.5,Everyone,Games,0.9,...,4.3,1083571,,0.0,Everyone,6.034857,0.86,0.251718,-0.386326,-0.04
326,HotelTonight,407690035,HotelTonight - Great Deals on Last Minute Hotels,75.657227,0.0,32341,4.5,Teen,Auto & Vehicles,0.9,...,4.4,57573,,0.0,Everyone,4.760219,0.88,0.449144,-0.188899,-0.02


# New columns

We only keep the common features for the new dataset.

In [12]:
use_cols = [
    'apple_id', 'trim_title', 'apple_title', 'apple_genre', 'apple_rating',
       'apple_reviews', 'apple_size', 'apple_pegi',
       'normed_apple_rating', 'google_title', 'google_rating',
       'google_reviews', 'google_size', 'google_price', 
       'normed_google_rating', 'log_google_reviews', 'log_apple_reviews',
    'z_score_google', 'z_score_apple', 'z_score_google_sub_apple', 'norm_google_sub_apple'
]

df = combine_apps[use_cols].copy()
df.head()

Unnamed: 0,apple_id,trim_title,apple_title,apple_genre,apple_rating,apple_reviews,apple_size,apple_pegi,normed_apple_rating,google_title,...,google_reviews,google_size,google_price,normed_google_rating,log_google_reviews,log_apple_reviews,z_score_google,z_score_apple,z_score_google_sub_apple,norm_google_sub_apple
107,898968647,Call of Duty®,Call of Duty®: Heroes,Games,4.5,179416,201.075195,Teen,0.9,Call of Duty®: Heroes,...,1604146,57.0,0.0,0.88,6.205244,5.253861,0.449144,0.638043,-0.188899,-0.02
170,1147297267,Dont Starve,Don't Starve: Shipwrecked,Games,3.5,495,604.341797,Everyone 10+,0.7,Don't Starve: Shipwrecked,...,1468,4.9,4.99,0.82,3.166726,2.694605,-0.143135,-0.806018,0.662884,0.12
223,352670055,F,F-Sim Space Shuttle,Games,4.5,6403,72.855469,Everyone,0.9,F-Sim Space Shuttle,...,5427,,4.99,0.88,3.73456,3.806384,0.449144,0.638043,-0.188899,-0.02
301,763692274,Grand Theft Auto,Grand Theft Auto: San Andreas,Games,4.0,32533,1964.96582,Mature 17+,0.8,Grand Theft Auto: San Andreas,...,348962,26.0,6.99,0.88,5.542778,4.512324,0.449144,-0.083987,0.533131,0.08
355,771989093,LEGO® Friends,LEGO® Friends,Games,4.0,400,730.941406,Everyone,0.8,LEGO® Friends,...,854,6.9,4.99,0.88,2.931458,2.60206,0.449144,-0.083987,0.533131,0.08


Rename the columns

In [13]:
df.columns = ['apple_id','trim_title', 'apple_title', 'genre', 'apple_rating',
       'apple_reviews', 'apple_size', 'pegi',
       'normed_apple_rating', 'google_title', 'google_rating',
       'google_reviews', 'google_size', 'price', 
       'normed_google_rating', 'log_google_reviews', 'log_apple_reviews',
    'z_score_google_rating', 'z_score_apple_rating', 'z_score_google_sub_apple','norm_google_sub_apple']
df.shape

(604, 21)

# Revenues

Each app's price is multiplied by the number of its reviews to estimate its revenue. To evaluate how well an app is received, we will use the new feature "revenue".

In [14]:
df['google_revenue'] = df['google_reviews'] * df['price']
df['apple_revenue'] = df['apple_reviews'] * df['price']

df.loc[df['google_revenue']>0,'log_google_revenue'] = df[df['google_revenue']>0]['google_revenue'].apply(lambda x: math.log(x, 10))
df.loc[df['apple_revenue']>0,'log_apple_revenue'] = df[df['apple_revenue']>0]['apple_revenue'].apply(lambda x: math.log(x, 10))

# Save final dataset

In [15]:
df.to_csv(save_path, index=False)
df.shape

(604, 25)