In [1]:
from itertools import combinations
import random
import os
from datetime import datetime
import time
from multiprocessing import Pool

In [None]:
dataPath = '/content/'
fileDataPath = []
fileDataNamesOnly = []
for a in os.listdir(dataPath):
    if a.endswith('.dat'):
        fileDataPath.append(dataPath + a)
        fileDataNamesOnly.append(a.split('.')[0])

fileDataPath = sorted(fileDataPath)
fileDataNamesOnly = sorted(fileDataNamesOnly)
fileDataPath

In [None]:
dataLineCounts = dict()
for path in fileDataPath:
    dataLineCounts[path] = sum(1 for line in open(path))
dataLineCounts

{'/content/chess.dat': 3196}

In [None]:
def randomSelect(m,p):
    result = []
    for i in range(m):
        if random.random().__ge__(1-p):
            result.append(i)
    return result

In [None]:
list_m = [100,500,1000,10000]
list_p = [0.5,0.8,0.3]
for m in list_m:
    for p in list_p:
        a = randomSelect(m, p)
        expect = m*p
        selected = len(a)
        pc = (expect- selected)/expect
        print(f'm={m}, p={p}: expected: {expect}, result: {selected}, delta = {pc*100}%')
    print()

m=100, p=0.5: expected: 50.0, result: 57, delta = -14.000000000000002%
m=100, p=0.8: expected: 80.0, result: 71, delta = 11.25%
m=100, p=0.3: expected: 30.0, result: 34, delta = -13.333333333333334%

m=500, p=0.5: expected: 250.0, result: 257, delta = -2.8000000000000003%
m=500, p=0.8: expected: 400.0, result: 403, delta = -0.75%
m=500, p=0.3: expected: 150.0, result: 154, delta = -2.666666666666667%

m=1000, p=0.5: expected: 500.0, result: 494, delta = 1.2%
m=1000, p=0.8: expected: 800.0, result: 805, delta = -0.625%
m=1000, p=0.3: expected: 300.0, result: 280, delta = 6.666666666666667%

m=10000, p=0.5: expected: 5000.0, result: 4954, delta = 0.9199999999999999%
m=10000, p=0.8: expected: 8000.0, result: 8005, delta = -0.0625%
m=10000, p=0.3: expected: 3000.0, result: 3032, delta = -1.0666666666666667%



In [None]:
def generateItemset(basketItem:[int]):
    result = set()
    for i in range(1, 4):
        tmpset = combinations(basketItem,i)
        for j in tmpset: result.add(j)

    i = 0
    while i < len(basketItem):
        result.add((basketItem[i]))
        j = i+1
        while j < len(basketItem):
            result.add((basketItem[i],basketItem[j]))
            j+=1
        i+=1
    return result

In [None]:
def updateItemsetCount(fullItemset: dict, basketItemset):
    for itemset in basketItemset:
        if itemset in fullItemset:
            # id = fullItemset.index(itemset)
            fullItemset[itemset] +=1
        else:
            fullItemset[itemset] = 1

In [None]:
def convertLine(line:str):
    id_str = line.split()
    return list(map(int, id_str))

In [None]:
def processFileLine(line):
    ids = convertLine(line)
    return generateItemset(ids)

In [None]:
def filterResult(itemset:dict, leastSupportCount)-> dict:
    result = dict()
    for key in itemset.keys():
        if itemset[key] < leastSupportCount: break
        result[key] = itemset[key]
    return result

In [None]:

def SimpleSR(filePath, m, p, s, resultLogPath, random, rangeId):

    print("res"+resultLogPath)
    print("file"+filePath)
    print("inside process algo")
    
    numberBasket = m

    leastSupportCount = 0
    intro = ''

    processLineId = []
    if random: 
        processLineId = randomSelect(numberBasket, p)
        leastSupportCount = int(0.9*p*s*numberBasket)
        intro = f'processing {filePath}, numberBasket={numberBasket}, numberSelecting={len(processLineId)}'
        intro += f', leastSupportCount={leastSupportCount}'
    else: 
        processLineId = rangeId
        leastSupportCount = int(s*numberBasket)
        intro = f'processing {filePath}[{processLineId[0]}:{processLineId[-1]}]'
    fullItemset = dict()

    if random: print(intro)

    timeStart = datetime.now()
    currentLine = 0
    with open(filePath, 'r') as file:
        for line in file:
            line = line.strip()
            if (currentLine in processLineId) and (len(line)>0):
                updateItemsetCount(fullItemset, processFileLine(line))

            currentLine += 1
    frequentItemset = dict(sorted(fullItemset.items(),key=lambda item: item[1], reverse=True))
    if random:
        frequentItemset = filterResult(frequentItemset,leastSupportCount)
    timeStop = datetime.now()

    end = f'finish in {str(timeStop - timeStart)}'
    if random: print(end)
    print(resultLogPath)
    with open(resultLogPath,'w+') as file:
        file.write(f'{intro}\n{end}\n')
        for key in frequentItemset.keys():
            #print(f'{key}->{frequentItemset[key]}')
            file.write(f'{key}->{frequentItemset[key]}\n')
    return frequentItemset
    

In [None]:
def createDirectory(path):
    path = path.split(os.sep)
    currentDir = '.'
    for i in path:
        currentDir += os.sep + i
        try:
            os.mkdir(currentDir)
        except:
            # this folder is existed
            continue
dataDir ='/content/'
resultDir = f'{dataDir}-SR/'
createDirectory(resultDir)
fileDataPath = []
for a in os.listdir(dataDir):
    if a.endswith('.dat'): fileDataPath.append(dataDir + a)
fileDataPath = sorted(fileDataPath)

In [None]:
for path in fileDataPath:
  print (path)

/content/chess.dat


In [None]:
dataDir = './data/'
resultDir = f'{dataDir}SR/'
createDirectory(resultDir)
    
for path in fileDataPath:
    m = sum(1 for line in open(path))
    outputPath = path.replace('.dat','.resultv2')\
                            .replace(dataDir,resultDir)
    SimpleSR(path, m, p, s, outputPath, True, [])
print()

res/content/chess.resultv2
file/content/chess.dat
inside process algo
processing /content/chess.dat, numberBasket=3196, numberSelecting=172, leastSupportCount=71
finish in 0:00:00.972927
/content/chess.resultv2



In [None]:
def buildDataForPool(path, m, p, s, outputPath):
    data =[]
    mOVerC = int(m/c)
    maxCounter = c if m%c==0 else c+1

    for counter in range(maxCounter):
        # each item must strictly follow the argument order from
        # processLimitedPassAlgo(filePath, m, p, s, resultLogPath, random, rangeId):
        data.append([path, m, p, s, outputPath+str(counter), False
                        , [x for x in range(mOVerC*counter, min(m, mOVerC*(counter+1) ))]
                     ])
    return data

In [None]:
def parallelProcess(processInput):
    SimpleSR(processInput[0],processInput[1],processInput[2],processInput[3]
                           ,processInput[4],processInput[5],processInput[6])

In [None]:
pool = Pool(5)
c = 100
resultDir = f'{dataDir}SONc{c}s{s}/'
createDirectory(resultDir)

In [None]:
def collectItemsetFromChunks(chunkResultList):
    result = dict()
    for filePath in chunkResultList:
        print("-------")
        print(f'executing collectItemsetFromChunks: {filePath}')
        print("-------")
        with open(filePath,'r') as file:
            # skip 2 info lines
            next(file)
            next(file)

            for line in file:
                key, value = line.split('->')
                value = int(value)
                if key in result: result[key] += int(value)
                else: result[key] = int(value)
    return result

In [None]:
def collectItemsetFromDataset(datasetName, resultDir):
    timeStart = datetime.now()
    print("inside")
    m = dataLineCounts[f'/content/{datasetName}.dat']
    chunkResult = datasetName + '.result.son'
    
    # scan resultDir for related chunk result
    chunkResultList = [resultDir + x for x in os.listdir(resultDir) if chunkResult in x]
    fullItemset = collectItemsetFromChunks(chunkResultList)
    fullItemset = dict(sorted(fullItemset.items(),key=lambda item: item[1], reverse=True))
    timeStop = datetime.now()    
    print(f'finish collect from {chunkResult} in {str(timeStop - timeStart)}')
    print("before wriet")
    with open(f'/content/{datasetName}.final','w+') as file:
        for key in fullItemset.keys():
            if fullItemset[key] < s*m: break
            print(f'{key}->{fullItemset[key]}\n')
            file.write(f'{key}->{fullItemset[key]}\n')
    return fullItemset

In [None]:
%%time
for path in fileDataPath:
    timeStart = datetime.now()
    m = dataLineCounts[path]
    outputPath = path.replace('.dat','.result.son').replace(dataDir,resultDir)
    parallelData = buildDataForPool(path, m, 1/c, s, outputPath)
    pool.map(parallelProcess, parallelData)
    timeStop = datetime.now()
    print(f'finsih processing {path} in {str(timeStop-timeStart)}')

finsih processing /content/chess.dat in 0:00:22.102093
CPU times: user 802 ms, sys: 129 ms, total: 931 ms
Wall time: 22.1 s


In [None]:
%%time
resultDir='/content/'
for filename in fileDataNamesOnly:
    collectItemsetFromDataset(filename,resultDir)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
11->2078

(11,)->2078

(7, 29, 54)->2078

(27, 29, 48)->2078

(3, 34, 72)->2077

(7, 42, 72)->2077

(17, 42)->2077

(17, 42, 60)->2077

(5, 17, 48)->2077

(25, 62, 74)->2077

(9, 31, 48)->2077

(11, 58)->2077

(54, 56)->2077

(7, 27, 52)->2077

(52, 54, 62)->2077

(36, 44, 64)->2076

(42, 62, 72)->2076

(7, 31, 64)->2076

(17, 42, 58)->2076

(54, 56, 58)->2076

(7, 21, 40)->2075

(7, 38, 60)->2075

(42, 46, 48)->2075

(29, 54, 62)->2075

(25, 56, 74)->2074

(11, 60)->2074

(7, 52, 54)->2074

(42, 64, 72)->2073

(31, 34, 64)->2073

(42, 44, 48)->2073

(11, 58, 60)->2073

(40, 54, 62)->2073

(54, 56, 60)->2073

(7, 29, 38)->2072

(7, 38, 52)->2072

(52, 54, 56)->2072

(27, 48, 52)->2072

(21, 66)->2071

(21, 60, 66)->2071

(17, 42, 52)->2071

(48, 56, 72)->2071

(25, 31, 56)->2071

(9, 25, 44)->2071

(17, 25, 48)->2071

(27, 36, 48)->2071

(21, 58, 66)->2070

(31, 62, 64)->2070

(36, 40, 54)->2070

(5, 48, 72)->2069

(5, 25

In [None]:
p_list = [.01,.02,.1,.2]

for p in p_list:
    s=.5
    resultDir = f'/content/p{p}s{s}/'
    createDirectory(resultDir)
    for path in fileDataPath:
        m = sum(1 for line in open(path))
        outputPath = path.replace('.dat','.resultv2')\
                            .replace(dataDir,resultDir)
        SimpleSR(path, m, p, s, outputPath, True, [])
    print()