In [1]:
from pyspark.sql import SparkSession

# Session shared by every cell below: local master with four worker threads.
# NOTE(review): with a local master the executors presumably run inside the
# driver JVM, so "spark.executor.memory" may have no effect here and
# "spark.driver.memory" may be the setting that matters -- confirm.
spark = (
    SparkSession.builder
    .appName('CorpusLoader')
    .master('local[4]')
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)
import os
from pyspark.sql.functions import split, element_at, explode, map_values, array_min, broadcast, map_from_entries, arrays_zip, array_contains, monotonically_increasing_id, array_distinct, transform, arrays_zip, size, slice, collect_list, first, map_from_arrays
from pyspark.sql.types import LongType, ArrayType, IntegerType, MapType

class CorpusLoader:
    """Builds (on first use) and loads the Parquet tables derived from an n-gram corpus.

    Expected layout below ``root_path``:

    * one directory per n-gram order whose name consists only of digits
      (``1``, ``2``, ...), holding the raw text files;
    * a ``parquets`` directory that is used as a cache for the derived tables.

    Each raw line is parsed as ``tokens TAB entry TAB entry ...`` where the
    tokens are separated by single spaces and every entry is
    ``year,frequency,book_frequency``.

    After :meth:`load` four DataFrames are available (as properties, and still
    under their name-mangled private attributes):

    * ``array_df``    -- Tokens, Years, Frequency, BookFrequency (arrays), NgramId
    * ``token_df``    -- Token, TokenId
    * ``contains_df`` -- NgramId, Position, TokenId
    * ``data_df``     -- NgramId, Frequency (year -> count), BookFrequency (year -> count)
    """

    def __init__(self, root_path, spark):
        """
        :param root_path: corpus root directory (see class docstring for the layout).
        :param spark: the SparkSession used for every read performed by this loader.
        """
        self.__root_path = root_path
        self.__spark = spark

        # Filled in by load(); kept as None until then so that reading a table
        # too early yields None instead of an AttributeError.
        self.__array_df = None
        self.__token_df = None
        self.__contains_df = None
        self.__data_df = None

    @property
    def array_df(self):
        """Per-n-gram arrays table, or None before load()."""
        return self.__array_df

    @property
    def token_df(self):
        """Token -> TokenId table, or None before load()."""
        return self.__token_df

    @property
    def contains_df(self):
        """(NgramId, Position, TokenId) table, or None before load()."""
        return self.__contains_df

    @property
    def data_df(self):
        """Per-n-gram year -> count maps, or None before load()."""
        return self.__data_df

    def load(self):
        """Load all four tables, creating any Parquet file that does not exist yet.

        The order matters: the contains and data tables are derived from the
        already loaded array table, and the contains table also needs the
        token table.
        """
        self.__array_df = self.__load_or_create_parquet('array.parquet', self.__create_array_df)
        self.__token_df = self.__load_or_create_parquet('token.parquet', self.__create_token_df)
        self.__contains_df = self.__load_or_create_parquet('contains.parquet', self.__create_contains_df)
        self.__data_df = self.__load_or_create_parquet('data.parquet', self.__create_data_df)

    def __load_or_create_parquet(self, name, create_function):
        """Return ``<root>/parquets/<name>`` as a DataFrame, building it first if absent.

        :param name: file name of the Parquet table inside the ``parquets`` directory.
        :param create_function: zero-argument callable returning the DataFrame to
            persist; only called when the Parquet path does not exist.
        :return: the DataFrame read back from the Parquet path.
        """
        parquet_path = os.path.join(self.__root_path, 'parquets', name)

        if not os.path.exists(parquet_path):
            print(f'File "{name}" not found. \n\t -- Creating "{name}" ...')

            create_function().write.parquet(parquet_path)

            print('\t -- Done.')

        print(f'Loading "{name}" ...')
        return self.__spark.read.parquet(parquet_path)

    def __read_lines(self, path):
        """Read every non-empty line below ``path`` into a single string column ``Input``.

        The plain text reader is used on purpose: it returns each line
        verbatim, whereas the CSV reader applies quote/escape processing and
        can therefore alter lines whose tokens contain ``"`` or ``\\``.
        """
        return self.__spark.read.text(path) \
            .withColumnRenamed('value', 'Input') \
            .where("Input != ''")  # the CSV reader used previously skipped blank lines as well

    def __create_token_df(self):
        """Build the token table (Token, TokenId) from the 1-gram directory ``<root>/1``.

        TokenId comes from ``monotonically_increasing_id`` applied after sorting
        by Token: ids are unique and increase with the sort order, but they are
        not consecutive.
        """
        one_gram_path = os.path.join(self.__root_path, '1')

        token_df = self.__read_lines(one_gram_path) \
            .select(split('Input', '\t').alias('SplitInput')) \
            .select(element_at('SplitInput', 1).alias('Tokens')) \
            .select(explode(split('Tokens', ' ')).alias('Token')) \
            .distinct() \
            .orderBy('Token') \
            .withColumn('TokenId', monotonically_increasing_id())
        # distinct(): a token must map to exactly one TokenId, otherwise the
        # join in __create_contains_df would multiply rows.

        return token_df

    def __create_array_df(self):
        """Build the per-n-gram table from every digit-named directory below the root.

        :return: DataFrame with Tokens (array of strings), Years, Frequency and
            BookFrequency (parallel arrays, one element per entry of the line)
            and a generated NgramId.
        :raises FileNotFoundError: if the root contains no digit-named entry.
        """
        # Sorted so that the union order does not depend on the listing order
        # returned by the file system.
        n_gram_directories = [
            os.path.join(self.__root_path, entry)
            for entry in sorted(os.listdir(self.__root_path))
            if entry.isdigit()
        ]

        if not n_gram_directories:
            raise FileNotFoundError(
                f'No n-gram directories (digit-named) found in "{self.__root_path}".')

        input_df = None
        for path in n_gram_directories:
            lines_df = self.__read_lines(path)
            input_df = lines_df if input_df is None else input_df.union(lines_df)

        # First tab-separated field: the space-separated tokens.
        # Remaining fields: one "year,frequency,book_frequency" entry each.
        split_df = input_df \
            .select(split('Input', '\t').alias('SplitInput')) \
            .select(element_at('SplitInput', 1).alias('Tokens'),
                    slice('SplitInput', 2, size('SplitInput')).alias('Data')) \
            .select(split('Tokens', ' ').alias('Tokens'), 'Data')

        array_df = split_df \
            .select('Tokens', transform('Data', lambda entry: split(entry, ',')).alias('Data')) \
            .select('Tokens',
                    transform('Data', lambda parts: parts[0].cast(IntegerType())).alias('Years'),
                    transform('Data', lambda parts: parts[1].cast(LongType())).alias('Frequency'),
                    transform('Data', lambda parts: parts[2].cast(LongType())).alias('BookFrequency')) \
            .withColumn('NgramId', monotonically_increasing_id())

        return array_df

    def __create_contains_df(self):
        """Build the (NgramId, Position, TokenId) table linking n-grams to their tokens.

        Position is the zero-based index of the token inside the n-gram. It is
        taken with ``posexplode`` *before* the join with the token table,
        because the element order produced by a ``collect_list`` after a join
        and group-by is not guaranteed and must not be used to derive positions.
        Tokens that are missing from the token table are dropped by the inner
        join.
        """
        # Local import: keeps this method independent of the notebook's import cell.
        from pyspark.sql.functions import posexplode

        contains_df = self.__array_df \
            .select('NgramId', posexplode('Tokens').alias('Position', 'Token')) \
            .join(self.__token_df, on='Token') \
            .select('NgramId', 'Position', 'TokenId') \
            .orderBy('NgramId', 'Position')

        return contains_df

    def __create_data_df(self):
        """Build the per-n-gram maps year -> Frequency and year -> BookFrequency.

        The arrays are zipped into (year, value) structs and converted with
        ``map_from_entries``. This detour is kept deliberately: according to
        the original author's note, ``map_from_arrays`` zeroed out the values.
        """
        return self.__array_df.select(
            'NgramId',
            map_from_entries(arrays_zip('Years', 'Frequency')).alias('Frequency'),
            map_from_entries(arrays_zip('Years', 'BookFrequency')).alias('BookFrequency'),
        )

In [2]:
# Corpus location. Set the CORPUS_ROOT environment variable to run this
# notebook on another machine; the previous hard-coded path is the fallback.
CORPUS_ROOT = os.environ.get('CORPUS_ROOT', 'C:/Users/bincl/BA-Thesis/Dataset/parquets_corpus/')

cl = CorpusLoader(CORPUS_ROOT, spark)

cl.load()

# Print the schema of every loaded table (token, contains, array, data).
for table_name in ('token', 'contains', 'array', 'data'):
    getattr(cl, f'_CorpusLoader__{table_name}_df').printSchema()

Loading "array.parquet" ...
Loading "token.parquet" ...
Loading "contains.parquet" ...
Loading "data.parquet" ...
root
 |-- Token: string (nullable = true)
 |-- TokenId: long (nullable = true)

root
 |-- NgramId: long (nullable = true)
 |-- Position: integer (nullable = true)
 |-- TokenId: long (nullable = true)

root
 |-- Tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Years: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- Frequency: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- BookFrequency: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- NgramId: long (nullable = true)

root
 |-- NgramId: long (nullable = true)
 |-- Frequency: map (nullable = true)
 |    |-- key: integer
 |    |-- value: long (valueContainsNull = true)
 |-- BookFrequency: map (nullable = true)
 |    |-- key: integer
 |    |-- value: long (valueContainsNull = true)



In [6]:
# Inspect the yearly frequency map of a single n-gram.
# The id is generated when array.parquet is built, so it may not exist in a
# rebuilt corpus; guard against an empty result instead of failing on None.
tokens_last = cl._CorpusLoader__data_df.where('NgramId == "1391569588591"')

ngram_row = tokens_last.first()
if ngram_row is None:
    print('No n-gram with NgramId 1391569588591 found.')
else:
    print(ngram_row["Frequency"])

tokens_last.show()


{2016: 7, 2017: 3, 2018: 7, 2019: 2, 1990: 1, 1964: 1, 1997: 1, 1999: 1, 2002: 1, 1845: 1, 2005: 2, 2009: 4, 1978: 1, 2010: 1, 2011: 4, 2012: 3, 2013: 7, 2014: 11, 2015: 7}
+-------------+--------------------+--------------------+
|      NgramId|           Frequency|       BookFrequency|
+-------------+--------------------+--------------------+
|1391569588591|{1845 -> 1, 1964 ...|{1845 -> 1, 1964 ...|
+-------------+--------------------+--------------------+



In [4]:
# Every (NgramId, Position) row of the contains table that references TokenId 11890.
tokens_last = cl._CorpusLoader__contains_df.filter('TokenId == "11890"')
tokens_last.show(n=20)


+-----------+--------+-------+
|    NgramId|Position|TokenId|
+-----------+--------+-------+
| 8589935889|       0|  11890|
|34359740934|       1|  11890|
|34359743488|       1|  11890|
|34359743876|       1|  11890|
|34359745277|       1|  11890|
|34359749969|       1|  11890|
|34359750873|       1|  11890|
|34359760437|       1|  11890|
|34359776562|       1|  11890|
|34359779021|       1|  11890|
|34359785558|       1|  11890|
|34359785747|       1|  11890|
|34359790758|       1|  11890|
|34359791432|       1|  11890|
|34359793582|       1|  11890|
|34359797169|       1|  11890|
|34359797826|       1|  11890|
|34359801879|       1|  11890|
|34359806613|       1|  11890|
|34359812129|       1|  11890|
+-----------+--------+-------+
only showing top 20 rows

