In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import re
from zipfile import ZipFile
import gzip
from multiprocessing import Process, Queue

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
class DatasetLoader:
    def __init__(self, file, csv, index_col=False, file_type='zip', lines=-1, max_blocks=-1):
        self.file = file
        self.csv = csv
        self.index_col = index_col
        self.file_type = file_type
        self.lines = lines
        self.max_blocks = max_blocks

    def decompress_data(self):
        prefix = 'drive/MyDrive/data/'
        block = 0

        if self.lines == -1:
            if type == 'zip':
                with ZipFile(f'{prefix}{file}.zip') as archive:
                    file = archive.open(f'{self.csv}.csv')
                    df = pd.read_csv(file, index_col=self.index_col)
                    file.close()
                return df
            elif type == 'gz':
                with gzip.open(f'{prefix}{file}.csv.gz') as archive:
                    df = pd.read_csv(archive, index_col=self.index_col)
                return df
            else:
                return None
        else:
            if type == 'gz':
                self.archive = gzip.GzipFile(f'{prefix}{file}.csv.gz', mode='rb')

                line_str = self.archive.readline().decode("utf-8").replace('\n', '')
                self.headers = np.array(line_str.split(','), dtype=np.str_)

                self.df_data = np.empty(shape=(self.lines, self.headers.size), dtype='O')
                eof = False
                block = 0
                while eof == False:
                    q = Queue()
                    p = Process(target=self.process_fragment)
                    p.start()
                    p.join()
                    fragment = q.get()
                    if fragment.empty:
                        eof = True
                    else:
                        yield fragment
                    print('closed')
                    p.close()
                    block += 1
                    if block == self.max_blocks:
                        break

            elif type == 'test_gz':
                archive = gzip.GzipFile(f'{prefix}{file}.csv.gz', mode='rb')
                line_str = archive.readline().decode("utf-8").replace('\n', '')
                headers = np.array(line_str.split(','))
                print(headers)

                line_str = archive.readline().decode("utf-8").replace('\n', '')
                first_line = np.array(line_str.split(','))
                print(first_line)

                return None
            else:
                return None


    def process_fragment(self, q):
        for i in range(0, self.lines):
            line_str = self.archive.readline().decode("utf-8").replace('\n', '')

            if line_str == '':
                if i == 0:
                    q.put(pd.DataFrame())
                break
            else:
                input_data = np.array(line_str.split(','), dtype=np.str_)

                if input_data.size != self.headers.size:
                    input_data = np.pad(input_data, (self.headers.size - input_data.size, 0), 'constant', constant_values='0')

                self.df_data[i] = input_data

        q.put(pd.DataFrame(self.df_data, columns=self.headers))


prefix = 'drive/MyDrive/data/'

zips = ['google_play', 'google_play_2016', 'steam']
csvs = [
    [
        'Google-Playstore'
    ],
    [
        'latest-with-added-date'
    ],
    [
        'steam',
        'steam_description_data',
        'steam_media_data',
        'steam_requirements_data',
        'steam_support_info',
        'steamspy_tag_data'
    ]
]

In [None]:
dl1 = DatasetLoader(zips[0],csvs[0][0])
gp21_df = dl1.decompress_data()

In [11]:
#a = np.array(['2016-04-16 10:55:45.256251', '2019-06-26 21:26:54.942325', '2021-03-08 12:40:49.948616', '2017-12-18 03:27:30.786389', '2019-05-14 00:31:01.853697'])
np.datetime64('2016-04-16 10:55:45.256251')

numpy.datetime64('2016-04-16T10:55:45.256251')

In [19]:
dl2 = DatasetLoader(zips[1],csvs[1][0],file_type='gz',lines=1000,max_blocks=2)
columns = ['dex_date','pkg_name','vt_detection','vt_scan_date','added']
gp16_df = pd.DataFrame()

for block in dl2.decompress_data():
    block = block.query('markets == "play.google.com"')
    block['added'] = block['added'].astype('M')
    block = block.query('added < 2017-01-01')

    if gp16_df.empty:
        gp16_df = block
    else:
        gp16_df = pd.concat(gp16_df, block)


gp16_df.head()

In [None]:
gp16_df[gp16_df['markets'] == 'play.google.com']

Unnamed: 0,sha256,sha1,md5,dex_date,apk_size,pkg_name,vercode,vt_detection,vt_scan_date,dex_size,added,markets
2,000001A94F46A0C3DDA514E1F24E675648835BBA5EF3C3...,C0444D784685EFE5F6D9F28683B24B5873E509CB,EC82771AE018B93AD784A1FD2B625216,1980-01-01 00:00:00,52469861,"""com.firstchoice.myfirstchoice""",1206145,0,2021-03-17 08:02:21,9201656,2021-03-08 12:40:49.948616,play.google.com
3,000002B63FAD4B030787F6DE4081DC1E12325026EB7DDA...,DD723B32EDD9F70AADBD66846621967157DF9BD4,985E601C17F0A9346590AE92A5AD664E,1980-01-01 00:00:00,4300370,"""com.deperu.sitiosarequipa""",10000,0,2017-12-03 06:50:28,4211104,2017-12-18 03:27:30.786389,play.google.com
4,000003D3981DC548A772A30D688F424CFB88561A63A2DD...,6AE9F138F7E0C63E5D58CC7E82FB05A50F041637,F4789023733E41EE883208ACBC956020,1980-01-01 00:00:00,12958838,"""com.safetravels.safetravelsmain""",400125,0,2019-06-25 08:55:42,7813520,2019-05-14 00:31:01.853697,play.google.com
6,0000049D8911607971A3336DE5CF36F4799D679D6BB9EF...,433307FF815B8BA4A88BC3EE3EC2C912FFBA063B,B03BEEAFB1975881F11395C4F5F6E2ED,1980-01-01 00:00:00,3161615,"""com.bmi.calculatorplus""",11,0,2020-07-13 07:37:27,4000432,2020-06-06 22:01:37.765867,play.google.com
8,00000989F3E215BA9FC3BDD5B56AF751343B540C1026BF...,81EA59FE2C95EDEF3662CAF165FF36AB410FA158,71FFF1BA55D7F6BF0AB00C15F6B5BC99,2015-09-02 10:34:48,1375862,"""kr.ac.snjc.library""",3,0,2016-03-17 04:48:32,2277844,2016-03-01 17:48:50.244774,play.google.com
...,...,...,...,...,...,...,...,...,...,...,...,...
95,0000413BB21963A5D258B970B6407CB8423BB8E9EE37C5...,AC9EF4FB0EC8AB3A83A0A56B5065FD3DE7ADA476,526F41A894032D5BC098A5E2F01696FF,1981-01-01 01:01:02,15169419,"""com.ums.ticketing.iso""",50,0,2021-10-25 06:10:54,6369704,2021-09-25 01:58:07.489149,play.google.com
96,0000417A3100D52547A8B423D6540763BFDA97A7140ACA...,31CD7A8034C690EC341359CE5862FC7BD1623CA8,D9A416290F167D97E8089B9C8CD720D2,1980-01-01 00:00:00,18122860,"""com.teammt.gmanrainy.themestore""",175,0,2020-04-08 14:11:46,8322544,2020-04-16 03:47:37.793468,play.google.com
97,00004295316D5993B4607D230954AB2A5E3CCA611F7B42...,59EDBD3827C183337E05B1C6F80B99DC83A52C1A,1393B436F070825C97DD1517B6F8972F,1980-01-01 00:00:00,3020719,"""com.smallappdeveloper.uklocalnews""",7,0,2019-06-27 19:52:15,5178000,2019-05-27 12:38:47.675446,play.google.com
98,000042D8EC830FD5F2DAF1C4D6B35E302395A85F3F1A63...,69E8233C89F6F0E5AC9F0DAE7585C648F54F52E0,CF2B903206826BBBFA4D3595D8DA6BED,2018-08-10 11:10:52,56955211,"""com.kbcard.kat.liivmate""",14401,0,2019-11-11 04:06:49,12230364,2018-08-14 02:06:52.638856,play.google.com


In [None]:
gp21_df.head()

Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,Price,...,Developer Website,Developer Email,Released,Last Updated,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice,Scraped Time
0,Gakondo,com.ishakwe.gakondo,Adventure,0.0,0.0,10+,10.0,15,True,0.0,...,https://beniyizibyose.tk/#/,jean21101999@gmail.com,"Feb 26, 2020","Feb 26, 2020",Everyone,https://beniyizibyose.tk/projects/,False,False,False,2021-06-15 20:19:35
1,Ampere Battery Info,com.webserveis.batteryinfo,Tools,4.4,64.0,"5,000+",5000.0,7662,True,0.0,...,https://webserveis.netlify.app/,webserveis@gmail.com,"May 21, 2020","May 06, 2021",Everyone,https://dev4phones.wordpress.com/licencia-de-uso/,True,False,False,2021-06-15 20:19:35
2,Vibook,com.doantiepvien.crm,Productivity,0.0,0.0,50+,50.0,58,True,0.0,...,,vnacrewit@gmail.com,"Aug 9, 2019","Aug 19, 2019",Everyone,https://www.vietnamairlines.com/vn/en/terms-an...,False,False,False,2021-06-15 20:19:35
3,Smart City Trichy Public Service Vehicles 17UC...,cst.stJoseph.ug17ucs548,Communication,5.0,5.0,10+,10.0,19,True,0.0,...,http://www.climatesmarttech.com/,climatesmarttech2@gmail.com,"Sep 10, 2018","Oct 13, 2018",Everyone,,True,False,False,2021-06-15 20:19:35
4,GROW.me,com.horodyski.grower,Tools,0.0,0.0,100+,100.0,478,True,0.0,...,http://www.horodyski.com.pl,rmilekhorodyski@gmail.com,"Feb 21, 2020","Nov 12, 2018",Everyone,http://www.horodyski.com.pl,False,False,False,2021-06-15 20:19:35


In [None]:
gp21_df['Category'].value_counts()

In [None]:
gaming_categories = [
    'Arcade',
    'Puzzle',
    'Casual',
    'Action',
    'Simulation',
    'Adventure',
    'Board',
    'Racing',
    'Role Playing',
    'Strategy',
    'Card'
]

In [None]:
# Include only games
gp21_df = gp21_df[gp21_df['Category'].isin(gaming_categories)]

In [None]:
gp18_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [None]:
gp18_df['Category'].value_counts()

In [None]:
gp18_df['Installs'].value_counts()

1,000,000+        1579
10,000,000+       1252
100,000+          1169
10,000+           1054
1,000+             907
5,000,000+         752
100+               719
500,000+           539
50,000+            479
5,000+             477
100,000,000+       409
10+                386
500+               330
50,000,000+        289
50+                205
5+                  82
500,000,000+        72
1+                  67
1,000,000,000+      58
0+                  14
0                    1
Free                 1
Name: Installs, dtype: int64

In [None]:
steam_df = decompress_data(zips[2],csvs[2][0])
steam_df.set_index('appid', inplace=True)

In [None]:
steam_lost_df = pd.read_csv(f'{prefix}steam_lost.csv', encoding='unicode_escape')
steam_lost_df.set_index('AppID', inplace=True)

In [None]:
steam_df.head()

Unnamed: 0_level_0,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


In [None]:
steam_lost_df.head()

Unnamed: 0_level_0,Owners,Name,Type,Changed,Kinguin,Achievements
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
43110,77.48%,Metro 2033,Purchase disabled,03/2021,15.89,48
35420,71.18%,Killing Floor Mod: Defence Alliance 2,Purchase disabled,09/2022,,30
310560,68.04%,DiRT Rally,Purchase disabled,11/2022,,170
43160,67.58%,Metro: Last Light Complete Edition,Delisted,05/2023,9.07,70
863550,67.35%,HITMAN? 2,Purchase disabled,01/2023,,146


In [None]:
steam_df.join(steam_lost_df, how='right')

Unnamed: 0_level_0,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,...,average_playtime,median_playtime,owners,price,Owners,Name,Type,Changed,Kinguin,Achievements
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
43110,,,,,,,,,,,...,,,,,77.48%,Metro 2033,Purchase disabled,03/2021,15.89,48
35420,,,,,,,,,,,...,,,,,71.18%,Killing Floor Mod: Defence Alliance 2,Purchase disabled,09/2022,,30
310560,DiRT Rally,2015-12-07,1.0,Codemasters Racing Studio;Feral Interactive (L...,Codemasters;Feral Interactive (Linux);Feral In...,windows;mac;linux,0.0,Single-player;Multi-player;Steam Achievements;...,Racing;Simulation;Sports,Racing;Simulation;Driving,...,488.0,251.0,1000000-2000000,24.99,68.04%,DiRT Rally,Purchase disabled,11/2022,,170
43160,,,,,,,,,,,...,,,,,67.58%,Metro: Last Light Complete Edition,Delisted,05/2023,9.07,70
863550,HITMAN™ 2,2018-11-13,1.0,IO Interactive A/S,Warner Bros Interactive Entertainment,windows,0.0,Single-player;Online Multi-Player;Online Co-op...,Action,Stealth;Action;Assassin,...,636.0,514.0,5000000-10000000,44.99,67.35%,HITMAN? 2,Purchase disabled,01/2023,,146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2297550,,,,,,,,,,,...,,,,,0.01%,F1 23 Beta,Test app,01/2023,,?
2299850,,,,,,,,,,,...,,,,,0.01%,ERA-????,Unreleased,02/2023,,?
2351270,,,,,,,,,,,...,,,,,0.01%,Team Disorder Hodon UD,Purchase disabled,06/2023,,?
2362110,,,,,,,,,,,...,,,,,0.01%,Abnormal puzzles,Unreleased,04/2023,,?


In [None]:
no_of_steam_games = steam_df.shape[0]
no_of_lost_steam_games = steam_lost_df.shape[0]

funnel_data = dict(
    number=[no_of_steam_games, no_of_lost_steam_games],
    stage=['Current Steam Games', 'Lost Steam Games']
)
fig = px.funnel(funnel_data, x='number', y='stage')
fig.show()