In [1]:
from pyspark.sql import SparkSession

# Local Spark session: 4 worker threads, 4 GB of driver memory.
spark = (
    SparkSession.builder
    .appName('CorpusLoader')
    .master('local[4]')
    .config('spark.driver.memory', '4g')
    .getOrCreate()
)

In [2]:
import os
from pyspark.sql.functions import split, element_at, explode, map_values, array_min, broadcast, map_from_entries, arrays_zip, array_contains, monotonically_increasing_id, array_distinct, transform, arrays_zip, size, slice, collect_list, first, map_from_arrays
from pyspark.sql.types import LongType, ArrayType, IntegerType, MapType

class CorpusLoader:
    """Builds (on first use) and loads the Parquet tables derived from an n-gram corpus.

    Expected layout below ``root_path``:

    * digit-named entries (``1``, ``2``, ...) holding the raw n-gram text; the
      entry named ``1`` is used to build the token dictionary;
    * ``parquets/``, where the derived tables are cached.

    Each raw line is split on tabs: the first field is the space-separated
    n-gram, every further field is a comma-separated triple that is cast to
    (int, long, long) and stored as Years / Frequency / BookFrequency.

    Tables produced by :meth:`load`:

    * array:    Tokens, Years, Frequency, BookFrequency (arrays) and NgramId
    * token:    Token, TokenId
    * contains: NgramId, Position, TokenId (one row per token occurrence)
    * data:     NgramId, Frequency and BookFrequency as year-keyed maps
    """

    def __init__(self, root_path, spark):
        """Store the corpus root directory and the Spark session used for all I/O."""
        self.__root_path = root_path
        self.__spark = spark

    def load(self):
        """Load all four tables, creating any Parquet file that does not exist yet.

        The order matters: the contains table is built from the array and
        token tables, and the data table from the array table.
        """
        self.__array_df = self.__load_or_create_parquet('array.parquet', self.__create_array_df)
        self.__token_df = self.__load_or_create_parquet('token.parquet', self.__create_token_df)
        self.__contains_df = self.__load_or_create_parquet('contains.parquet', self.__create_contains_df)
        self.__data_df = self.__load_or_create_parquet('data.parquet', self.__create_data_df)

    def __load_or_create_parquet(self, name, create_function):
        """Return ``<root>/parquets/<name>`` as a DataFrame, writing it first if missing.

        ``create_function`` is only called when the path does not exist. The
        result is always read back from disk, so callers get the persisted
        table rather than the lazily computed one.
        """
        parquet_path = os.path.join(self.__root_path, 'parquets', name)

        if not os.path.exists(parquet_path):
            print(f'File "{name}" not found. \n\t -- Creating "{name}" ...')

            create_function().write.parquet(parquet_path)

            print('\t -- Done.')

        print(f'Loading "{name}" ...')
        return self.__spark.read.parquet(parquet_path)

    def __read_lines(self, path):
        """Read the raw text under ``path`` into a single string column named ``Input``.

        Uses the session injected into the constructor, not a notebook global.
        """
        # NOTE(review): the CSV reader applies its default quote handling to each
        # line; confirm that tokens containing '"' survive unchanged.
        return self.__spark.read.csv(path, sep='\n').withColumnRenamed('_c0', 'Input')

    def __create_token_df(self):
        """Build the token dictionary (Token, TokenId) from the ``1`` directory."""
        one_gram_df = self.__read_lines(os.path.join(self.__root_path, '1'))

        token_df = one_gram_df \
                .select(split('Input', '\t').alias('SplitInput')) \
                .select(element_at('SplitInput', 1).alias('Tokens')) \
                .select(explode(split('Tokens', ' ')).alias('Token')) \
                .distinct() \
                .orderBy('Token') \
                .withColumn('TokenId', monotonically_increasing_id())
        # distinct(): a token must map to exactly one TokenId, otherwise the
        # join in __create_contains_df would multiply rows.

        return token_df

    def __create_array_df(self):
        """Parse every digit-named entry of the root into one table of n-grams.

        Raises:
            FileNotFoundError: if the root contains no digit-named entry.
        """
        # Sorted so the union is built in the same order on every run.
        n_gram_names = sorted(x for x in os.listdir(self.__root_path) if x.isdigit())

        if not n_gram_names:
            raise FileNotFoundError(
                f'No n-gram directories (digit-only names) found in "{self.__root_path}".')

        input_df = None
        for n_gram_name in n_gram_names:
            lines_df = self.__read_lines(os.path.join(self.__root_path, n_gram_name))
            input_df = lines_df if input_df is None else input_df.union(lines_df)

        # Field 1 -> token array; fields 2..n -> raw per-year records.
        split_df = input_df \
                    .select(split('Input', '\t').alias('SplitInput')) \
                    .select(element_at('SplitInput', 1).alias('Tokens'),
                            slice('SplitInput', 2, size('SplitInput')).alias('Data')) \
                    .select(split('Tokens', ' ').alias('Tokens'), 'Data')

        # Each record is a comma-separated triple; spread it over three parallel arrays.
        array_df = split_df.select('Tokens', transform('Data', lambda d: split(d, ',')).alias('Data')) \
                    .select('Tokens', transform('Data', lambda x: x[0].cast(IntegerType())).alias('Years'),
                            transform('Data', lambda x: x[1].cast(LongType())).alias('Frequency'),
                            transform('Data', lambda x: x[2].cast(LongType())).alias('BookFrequency')) \
                    .withColumn('NgramId', monotonically_increasing_id())

        return array_df

    def __create_contains_df(self):
        """Build the (NgramId, Position, TokenId) table of token occurrences.

        Position is the zero-based index of the token inside the n-gram's
        ``Tokens`` array. It is captured with ``posexplode`` *before* the join,
        because the element order of ``collect_list`` after a join/groupBy is
        not deterministic and therefore cannot be used to recover positions.
        """
        from pyspark.sql.functions import posexplode

        occurrences_df = self.__array_df \
                .select('NgramId', posexplode('Tokens').alias('Position', 'Token'))

        contains_df = occurrences_df \
                .join(self.__token_df, on='Token') \
                .select('NgramId', 'Position', 'TokenId') \
                .orderBy('NgramId', 'Position')

        return contains_df

    ## This horrific arrays to list of structs to map construct is required, because map_from_arrays zeroes everything out.
    def __create_data_df(self):
        """Build the per-n-gram maps year -> Frequency and year -> BookFrequency."""
        data_df = self.__array_df.select('NgramId', 'Years', 'Frequency', 'BookFrequency')
        # Zip (year, value) pairs into structs, then turn the struct arrays into maps.
        data_df = data_df.withColumn('FrequencyStructs', arrays_zip('Years', 'Frequency'))
        data_df = data_df.withColumn('BookFrequencyStructs', arrays_zip('Years', 'BookFrequency'))
        data_df = data_df.withColumn('FrequencyMap', map_from_entries('FrequencyStructs'))
        data_df = data_df.withColumn('BookFrequencyMap', map_from_entries('BookFrequencyStructs'))
        data_df = data_df.select('NgramId', 'FrequencyMap', 'BookFrequencyMap')

        data_df.printSchema()

        return data_df.withColumnsRenamed({'FrequencyMap': 'Frequency', 'BookFrequencyMap': 'BookFrequency'})

In [3]:
# Corpus root directory. Set the CORPUS_ROOT environment variable to run the
# notebook on another machine; the default is the original local path.
CORPUS_ROOT = os.environ.get('CORPUS_ROOT', 'C:/Users/bincl/BA-Thesis/Dataset/parquets_corpus/')

cl = CorpusLoader(CORPUS_ROOT, spark)

cl.load()

Loading "array.parquet" ...
Loading "token.parquet" ...
Loading "contains.parquet" ...
Loading "data.parquet" ...


In [4]:
# Print the schema of each loaded table: token, contains, array, data.
for loaded_df in (
    cl._CorpusLoader__token_df,
    cl._CorpusLoader__contains_df,
    cl._CorpusLoader__array_df,
    cl._CorpusLoader__data_df,
):
    loaded_df.printSchema()

root
 |-- Token: string (nullable = true)
 |-- TokenId: long (nullable = true)

root
 |-- NgramId: long (nullable = true)
 |-- Position: integer (nullable = true)
 |-- TokenId: long (nullable = true)

root
 |-- Tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Years: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- Frequency: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- BookFrequency: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- NgramId: long (nullable = true)

root
 |-- NgramId: long (nullable = true)
 |-- Frequency: map (nullable = true)
 |    |-- key: integer
 |    |-- value: long (valueContainsNull = true)
 |-- BookFrequency: map (nullable = true)
 |    |-- key: integer
 |    |-- value: long (valueContainsNull = true)



In [5]:
from pyspark.sql.functions import col, sort_array, struct

# Rebuild the ordered TokenId list of every n-gram.
# collect_list gives no ordering guarantee after a shuffle, even when the input
# was sorted beforehand, so (Position, TokenId) pairs are collected and sorted
# inside the aggregate; sort_array orders the structs by Position first.
result = (
    cl._CorpusLoader__contains_df
    .groupBy("NgramId")
    .agg(sort_array(collect_list(struct("Position", "TokenId"))).alias("Ordered"))
    .select(
        "NgramId",
        col("Ordered.TokenId").alias("TokenIds"),
        size("Ordered").alias("TokenIds_Count"),
    )
)

In [6]:
# Flatten the TokenId lists back to one row per (NgramId, TokenId) and
# inspect the last ten rows.
token_id_column = explode("TokenIds").alias("TokenId")
exploded_result = result.select("NgramId", token_id_column)
last_rows = exploded_result.tail(10)
print(last_rows)

[Row(NgramId=2070174348382, TokenId=4423), Row(NgramId=2070174348382, TokenId=13238), Row(NgramId=2070174348391, TokenId=4495), Row(NgramId=2070174348391, TokenId=11068), Row(NgramId=2070174348395, TokenId=4405), Row(NgramId=2070174348395, TokenId=14313), Row(NgramId=2070174348398, TokenId=4407), Row(NgramId=2070174348398, TokenId=17204), Row(NgramId=2070174348413, TokenId=4475), Row(NgramId=2070174348413, TokenId=19030)]


In [7]:
# Take the first aggregated n-gram as a sample for the lookups below.
first_row = result.first()
TokenIds, NgramId = first_row["TokenIds"], first_row["NgramId"]

In [8]:
# Year -> frequency map of the sampled n-gram.
ngram_filter = f'NgramId == {NgramId}'
data = cl._CorpusLoader__data_df.where(ngram_filter).select('Frequency')
frequency_row = data.first()
print(frequency_row['Frequency'])

{1569: 1, 1587: 2, 1594: 2, 1618: 1, 1659: 17, 1681: 1, 1700: 4, 1705: 1, 1730: 1, 1735: 2, 1741: 2, 1745: 6, 1748: 5, 1749: 2, 1750: 3, 1751: 1, 1753: 1, 1754: 2, 1755: 3, 1756: 9, 1757: 2, 1759: 17, 1760: 12, 1761: 14, 1762: 4, 1763: 10, 1764: 17, 1765: 25, 1766: 16, 1767: 17, 1768: 18, 1769: 14, 1770: 23, 1771: 22, 1772: 46, 1773: 24, 1774: 20, 1775: 25, 1776: 32, 1777: 40, 1778: 40, 1779: 30, 1780: 40, 1781: 65, 1782: 67, 1783: 87, 1784: 74, 1785: 72, 1786: 92, 1787: 59, 1788: 79, 1789: 72, 1790: 81, 1791: 128, 1792: 89, 1793: 135, 1794: 100, 1795: 103, 1796: 100, 1797: 113, 1798: 116, 1799: 122, 1800: 72, 1801: 130, 1802: 110, 1803: 134, 1804: 131, 1805: 135, 1806: 110, 1807: 127, 1808: 112, 1809: 147, 1810: 116, 1811: 94, 1812: 145, 1813: 129, 1814: 99, 1815: 140, 1816: 125, 1817: 197, 1818: 201, 1819: 278, 1820: 231, 1821: 220, 1822: 244, 1823: 242, 1824: 293, 1825: 276, 1826: 326, 1827: 331, 1828: 395, 1829: 438, 1830: 612, 1831: 475, 1832: 436, 1833: 438, 1834: 422, 1835: 485,

In [9]:
# Ids of all n-grams that contain the first token of the sampled n-gram.
first_token_filter = f'TokenId == {TokenIds[0]}'
tokens1 = cl._CorpusLoader__contains_df.where(first_token_filter).select('NgramId')

In [10]:
from pyspark.sql.functions import col

# Every occurrence (n-gram id and position) of the token "Deutsch".
deutsch_token_df = cl._CorpusLoader__token_df.where('Token == "Deutsch"')
result = (
    deutsch_token_df
    .join(cl._CorpusLoader__contains_df, on='TokenId')
    .select('Token', 'NgramId', 'Position')
)
# Disabled exploration steps, kept for reference:
#result = cl._CorpusLoader__contains_df.join(broadcast(result), on='NgramId')
#result = cl._CorpusLoader__token_df.join(broadcast(result), on='TokenId').orderBy('NgramId', 'Position').groupBy('NgramId').agg(collect_list('Token').alias('Tokens'))
#result = cl._CorpusLoader__data_df.join(broadcast(result), on='NgramId')
#result = result.orderBy(result.Frequency[2019], ascending=False).select('Tokens', result.Frequency[2019])
result.show(10)

# Look up the frequency map of the first matching n-gram.
first_match = result.first()
NgramId = first_match["NgramId"]
print(NgramId)

data = cl._CorpusLoader__data_df.where(f'NgramId == {NgramId}').select('Frequency')
print(data.first()['Frequency'])

#cl._CorpusLoader__token_df.where('`Token` == "Test"').join(cl._CorpusLoader__is_contained_df, on='TokenId').show()



+-------+-----------+--------+
|  Token|    NgramId|Position|
+-------+-----------+--------+
|Deutsch|       1091|       0|
|Deutsch|34359740375|       1|
|Deutsch|34359768393|       1|
|Deutsch|34359793252|       1|
|Deutsch|34359794132|       1|
|Deutsch|34359795993|       1|
|Deutsch|34359798258|       1|
|Deutsch|34359805846|       1|
|Deutsch|34359817782|       1|
|Deutsch|34359826480|       1|
+-------+-----------+--------+
only showing top 10 rows

1091
{1492: 1, 1521: 1, 1522: 2, 1524: 2, 1527: 5, 1531: 11, 1533: 2, 1535: 3, 1536: 6, 1537: 1, 1538: 2, 1539: 1, 1541: 8, 1543: 22, 1544: 1, 1545: 1, 1546: 62, 1547: 1, 1550: 7, 1551: 5, 1552: 10, 1553: 6, 1554: 3, 1555: 8, 1556: 2, 1557: 2, 1558: 18, 1559: 1, 1560: 12, 1561: 13, 1562: 11, 1563: 6, 1564: 4, 1565: 17, 1566: 13, 1567: 40, 1568: 1, 1569: 132, 1570: 32, 1571: 12, 1572: 97, 1573: 4, 1574: 2, 1575: 9, 1576: 4, 1577: 9, 1578: 9, 1579: 16, 1580: 58, 1581: 48, 1582: 287, 1583: 25, 1584: 5, 1585: 51, 1586: 9, 1587: 28, 1588: 

In [11]:
tokens = ["derben", "Deutsch"]


def _occurrences_at(token, position):
    """Return (Token, NgramId, Position) rows where `token` sits at `position`."""
    return (
        cl._CorpusLoader__token_df
        .where(f'Token == "{token}"')
        .join(cl._CorpusLoader__contains_df, on='TokenId')
        .where(f'Position == {position}')
        .select('Token', 'NgramId', 'Position')
    )


# n-grams with "Deutsch" as the second token ...
tokens_last = _occurrences_at(tokens[1], 1)
tokens_last.show()

# ... and n-grams with "derben" as the first token.
tokens_first = _occurrences_at(tokens[0], 0)
tokens_first.show()



+-------+-----------+--------+
|  Token|    NgramId|Position|
+-------+-----------+--------+
|Deutsch|34359740375|       1|
|Deutsch|34359768393|       1|
|Deutsch|34359793252|       1|
|Deutsch|34359794132|       1|
|Deutsch|34359795993|       1|
|Deutsch|34359798258|       1|
|Deutsch|34359805846|       1|
|Deutsch|34359817782|       1|
|Deutsch|34359826480|       1|
|Deutsch|34359843844|       1|
|Deutsch|34359870357|       1|
|Deutsch|42949675537|       1|
|Deutsch|42949692148|       1|
|Deutsch|42949699956|       1|
|Deutsch|42949703471|       1|
|Deutsch|42949732191|       1|
|Deutsch|42949753572|       1|
|Deutsch|42949772644|       1|
|Deutsch|42949780459|       1|
|Deutsch|42949791684|       1|
+-------+-----------+--------+
only showing top 20 rows

+------+-----------+--------+
| Token|    NgramId|Position|
+------+-----------+--------+
|derben|25769805499|       0|
|derben|34359738574|       0|
|derben|34359738691|       0|
|derben|34359739055|       0|
|derben|34359739101|

In [12]:
# Reduce both sides to their n-gram ids, then keep the ids present in both:
# those n-grams have "derben" at position 0 and "Deutsch" at position 1.
tokens_first, tokens_last = (
    occurrences.select("NgramId") for occurrences in (tokens_first, tokens_last)
)

NgramId = tokens_first.join(tokens_last, on="NgramId")

NgramId.show()

+-----------+
|    NgramId|
+-----------+
|34359740375|
+-----------+



In [13]:
# Replace the one-row DataFrame with the plain id it holds.
matching_row = NgramId.first()
NgramId = matching_row["NgramId"]


In [14]:
# Year -> frequency map of the n-gram found by the positional lookup.
data = (
    cl._CorpusLoader__data_df
    .where(f'NgramId == {NgramId}')
    .select('Frequency')
)
print(data.first()['Frequency'])

{1985: 2, 1922: 2, 1986: 1, 1997: 1, 1872: 1, 1938: 2, 2005: 2, 1944: 1, 2011: 2, 2012: 1, 2013: 1, 1953: 2, 1954: 4, 2019: 1, 1828: 1, 1893: 2, 1957: 1, 1895: 1, 1897: 2, 1963: 2, 1836: 1, 1841: 1, 1906: 3, 1908: 2, 1981: 1, 1854: 1, 1983: 3}


In [15]:
# Same "Deutsch" lookup as before, without the Position column.
deutsch_rows = cl._CorpusLoader__token_df.where('Token == "Deutsch"')
result = deutsch_rows.join(cl._CorpusLoader__contains_df, on='TokenId').select('Token', 'NgramId')

result.show(10)
first_hit = result.first()
NgramId = first_hit["NgramId"]
print(NgramId)

# Frequency map of the first n-gram returned.
data = cl._CorpusLoader__data_df.where(f'NgramId == {NgramId}').select('Frequency')
print(data.first()['Frequency'])

+-------+-----------+
|  Token|    NgramId|
+-------+-----------+
|Deutsch|       1091|
|Deutsch|34359740375|
|Deutsch|34359768393|
|Deutsch|34359793252|
|Deutsch|34359794132|
|Deutsch|34359795993|
|Deutsch|34359798258|
|Deutsch|34359805846|
|Deutsch|34359817782|
|Deutsch|34359826480|
+-------+-----------+
only showing top 10 rows

1091
{1492: 1, 1521: 1, 1522: 2, 1524: 2, 1527: 5, 1531: 11, 1533: 2, 1535: 3, 1536: 6, 1537: 1, 1538: 2, 1539: 1, 1541: 8, 1543: 22, 1544: 1, 1545: 1, 1546: 62, 1547: 1, 1550: 7, 1551: 5, 1552: 10, 1553: 6, 1554: 3, 1555: 8, 1556: 2, 1557: 2, 1558: 18, 1559: 1, 1560: 12, 1561: 13, 1562: 11, 1563: 6, 1564: 4, 1565: 17, 1566: 13, 1567: 40, 1568: 1, 1569: 132, 1570: 32, 1571: 12, 1572: 97, 1573: 4, 1574: 2, 1575: 9, 1576: 4, 1577: 9, 1578: 9, 1579: 16, 1580: 58, 1581: 48, 1582: 287, 1583: 25, 1584: 5, 1585: 51, 1586: 9, 1587: 28, 1588: 1, 1589: 11, 1590: 6, 1591: 2, 1592: 5, 1593: 6, 1594: 5, 1595: 15, 1596: 11, 1597: 9, 1598: 10, 1599: 32, 1600: 7, 1601: 36, 

In [16]:
# Print the frequency map of the current NgramId again.
current_ngram_filter = 'NgramId == {}'.format(NgramId)
data = cl._CorpusLoader__data_df.where(current_ngram_filter).select('Frequency')
print(data.first()['Frequency'])

{1492: 1, 1521: 1, 1522: 2, 1524: 2, 1527: 5, 1531: 11, 1533: 2, 1535: 3, 1536: 6, 1537: 1, 1538: 2, 1539: 1, 1541: 8, 1543: 22, 1544: 1, 1545: 1, 1546: 62, 1547: 1, 1550: 7, 1551: 5, 1552: 10, 1553: 6, 1554: 3, 1555: 8, 1556: 2, 1557: 2, 1558: 18, 1559: 1, 1560: 12, 1561: 13, 1562: 11, 1563: 6, 1564: 4, 1565: 17, 1566: 13, 1567: 40, 1568: 1, 1569: 132, 1570: 32, 1571: 12, 1572: 97, 1573: 4, 1574: 2, 1575: 9, 1576: 4, 1577: 9, 1578: 9, 1579: 16, 1580: 58, 1581: 48, 1582: 287, 1583: 25, 1584: 5, 1585: 51, 1586: 9, 1587: 28, 1588: 1, 1589: 11, 1590: 6, 1591: 2, 1592: 5, 1593: 6, 1594: 5, 1595: 15, 1596: 11, 1597: 9, 1598: 10, 1599: 32, 1600: 7, 1601: 36, 1602: 6, 1603: 23, 1604: 33, 1605: 4, 1606: 50, 1607: 85, 1608: 14, 1609: 5, 1610: 22, 1611: 70, 1612: 52, 1613: 16, 1614: 19, 1615: 64, 1616: 7, 1617: 4, 1618: 7, 1619: 13, 1620: 29, 1621: 12, 1622: 3, 1623: 12, 1624: 5, 1625: 8, 1626: 1, 1627: 1, 1628: 6, 1630: 5, 1631: 8, 1632: 3, 1633: 4, 1634: 6, 1635: 2, 1636: 2, 1639: 1, 1640: 4, 