In [17]:
import sys
sys.path.append('/Users/bjg/r_html/utils/')
from appstore_dom_selectors import *
from bs4 import BeautifulSoup
import os
import pyreadr
import csv
from pathlib import Path

## Script Configuration
Where is your directory with .rds files? That input goes to the variable `rds_directory_full_path`

Where would you like the .tsv output of this program to be? `output_filename`

How many files would you like to parse before writing to disk? `dump_threshold`

When testing, enter a `stop_number` to limit number of batch iterations. A positive stop number will limit iterations, while a negative stop number will parse a folder's entire contents infinitely. 

In [18]:
# The directory where your .rds files are stored
rds_directory_full_path = Path('/Users/bjg/r_html/big_rds_input/appdescription')
###WINDOWS PATH UNTESTED
#rds_directory_full_path = WindowsPath('/Users/bjg/r_html/big_rds_input/appdescription')

# The desired output directory. Relative path.
output_filename = Path('../output/final_output.tsv')
####WINDOWS PATH UNTESTED
#output_filename = WindowsPath('output/_1st_big_run_appstore_data.tsv')

#Alter this variable to change the batch size
dump_threshold = 25

# Alter this variable to increase or decrease the sample size for testing.
# If you do not want to test, and want to run on a full data-set,
# Comment-out lines 102-106
stop_number = 4

In [19]:
############################################################################################
### File-Configurations
output_filenum = 0
tsv_outfile = open(output_filename, 'w')

## Define a function to convert .rds file to HTML

In [20]:
## Open RDS File Convert to HTML
def rds_to_html(file):

    f_name = os.path.join(rds_directory_full_path, file)
    app_rds = pyreadr.read_r(f_name)
    app_df = app_rds[None]
    app_html = app_df.iloc[0, 0]

    return app_html

## This is for performance: Only need to grab the header once, so we can exclude a conditional in the main loop. 

This loop is the overall structure of the script, but it iterates once. Instead of returning a dataset, it returns the header. 

This code opens a directory -> Goes through all the files -> Converts them to HTML -> Creates a BeautifulSoup object for data extraction -> runs custom `get_app_data` method that returns an entry for one app. 

In [21]:
# Obtain the Field Names for TSV file
# Just gets the first file
fieldnames = []
for root, dirs, files in os.walk(rds_directory_full_path):
    for file in files:
        f_name = os.path.join(rds_directory_full_path, file)
        app_rds = pyreadr.read_r(f_name)
        app_df = app_rds[None]
        app_html = app_df.iloc[0, 0]
        app_soup = BeautifulSoup(app_html, 'lxml')

        #Get the field names
        fieldnames = list(get_app_data(app_soup, f_name).keys())

        ## Add the app identifier as a field
        fieldnames.insert(0, 'app_id')

        break
    break

# Create the TSV file
tsv_chunk_writer = csv.DictWriter(tsv_outfile, fieldnames=fieldnames, delimiter='\t')
tsv_chunk_writer.writeheader()

## Main Loop

Note that this is the same code above. But instead of outputting to a `.tsv`, dictionaries of app-data are stored in an array: `apps = []`. Once `apps` stores the variable batch size (`dump_threshold`), all parsed apps are written to disk. Note the `tsv_chunk_writer.write_row(app)` method.  

The exception is legacy, and may not even occur with the updated code, where I essentially created a schema in the other DOM file. 

In [22]:
## Allocate list to hold all apps' data
##
apps = []

#Parse the data
iterations = 1

for root, dirs, files in os.walk(rds_directory_full_path):

    for file in files:

        if '.rds' not in file:
            print(file)
            continue

        try:
            app_identifier = file[1:]

            app_html = rds_to_html(file)

            app_soup = BeautifulSoup(app_html, 'lxml')

            # Assign the App Identifier
            app_data_dict = get_app_data(app_soup, f_name)
            app_data_dict['app_id'] = app_identifier
            apps.append(app_data_dict)
        except:
            continue


        ## File Writing Portion
        ##
        ## When we have accumulated a lot of files, Append them to the TSV.
        if len(apps) >= dump_threshold:

            print(f'\nProcessed Batch {iterations}, writing to file ...')

            #Do Dump to TSV
            for app in apps:



                # Need to create new file with updated headers
                try:
                    tsv_chunk_writer.writerow(app)


                # Need to create new file with updated headers
                except:
                    print(f'HEADER MISMATCH')
                    tsv_outfile.close()
                    output_filenum += 1
                    tsv_outfile = open(output_filename + '_' + str(output_filenum) + '.tsv', 'w')

                    fieldnames = list(app.keys())

                    ## Add app identifier to fields
                    fieldnames.insert(0, 'app_id')

                    tsv_chunk_writer = csv.DictWriter(tsv_outfile, fieldnames=fieldnames, delimiter='\t')
                    tsv_chunk_writer.writeheader()
                    tsv_chunk_writer.writerow(app)


            print(f'\n{dump_threshold*iterations} apps written in total.')
            iterations += 1



            #reset apps_read
            del apps[:]

            #print(f'Decrementing stop number')
            stop_number -= 1

            if stop_number == 0:
                sys.exit('Read in enough files')


Processed Batch 1, writing to file ...

25 apps written in total.

Processed Batch 2, writing to file ...

50 apps written in total.

Processed Batch 3, writing to file ...

75 apps written in total.

Processed Batch 4, writing to file ...

100 apps written in total.


SystemExit: Read in enough files

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


After the loop is done reading files, there may still be apps in the array. Write them to disk:

In [23]:
## Dump the remaining files if the threshold was not met exactly on the last iteration
for app in apps:
    tsv_chunk_writer.writerow(app)

tsv_outfile.close()