In [37]:
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import csr_matrix, bmat, vstack, save_npz, csc_matrix, load_npz
from random import sample, shuffle
import numpy as np

import time
import pickle

# Upper bound on the number of negative examples kept per conjecture.
MAX = 200
# Output file names for the positive and negative feature matrices; the
# negatives file name carries MAX so runs with different caps do not collide.
TRAINX_P = 'trainX_POSITIVES'
TRAINX_N = f'trainX_NEGATIVES{MAX}'

In [38]:
DATA = 'data/'


def _read_lines(name):
    """Return the lines of the file DATA + name, without line terminators."""
    with open(DATA + name, 'r') as handle:
        return handle.read().splitlines()


# Read data
# One statement per line; the name is the text before the first comma with the
# first len('fof(') characters dropped (presumably the 'fof(' prefix).
statements = _read_lines('statements')
stmts_names = [line.split(',')[0][len('fof('):] for line in statements]

# One statement name per line, in chronological order.
chronology = _read_lines('chronology')

# Lines look like "<name>:<feat>, <feat>, ..."; map name -> list of features.
features = {}
for line in _read_lines('features'):
    fields = line.split(':')
    features[fields[0]] = fields[1].split(', ')

# Lines look like "<name>:<premise> <premise> ..."; a name may occur on
# several lines, in which case its premise lists are concatenated.
train_dep_lines = _read_lines('dependencies_train')
train_dep = {}
for line in train_dep_lines:
    fields = line.split(':')
    stmt, prms = fields[0], fields[1].split(' ')
    train_dep.setdefault(stmt, []).extend(prms)

conj_test = _read_lines('conjectures_test')

In [39]:
# Position of every statement name in the chronological ordering.
map_chrono = dict(zip(chronology, range(len(chronology))))

In [40]:
def flatten(t):
    """Concatenate the items of every sub-iterable of t into one flat list."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat


def land(l1, l2):
    """Element-wise `and` of two sequences (stops at the shorter one)."""
    return [left and right for left, right in zip(l1, l2)]


def lor(l1, l2):
    """Element-wise `or` of two sequences (stops at the shorter one)."""
    return [left or right for left, right in zip(l1, l2)]


def lxor(l1, l2):
    """Element-wise inequality of two sequences; the items are booleans."""
    return [left != right for left, right in zip(l1, l2)]

# Prepare features sparse vectors
# The vocabulary is sorted so every feature keeps the same column index on each
# run. Iterating a set of strings directly yields an order that can change
# between interpreter sessions (string hash randomization), which would make
# matrices saved by one session inconsistent with vectors built in another.
fts = sorted(set(flatten(features.values())))
# Feature name -> column index in the vectors built from `fts`.
map_fts = {f : i for i, f in enumerate(fts)}

In [41]:
# Now I will convert statements to vectors
def _indicator_vector(present):
    """Return a 0/1 list aligned with `fts`, with 1 at every name in `present`.

    Columns are looked up through `map_fts`, so the cost is one pass over
    `present` rather than a list scan for each of the len(fts) features.
    """
    vector = [0] * len(fts)
    for name in present:
        vector[map_fts[name]] = 1
    return vector


# Statement name -> dense 0/1 feature vector of length len(fts).
stmts = {st: _indicator_vector(features[st]) for st in stmts_names}

In [42]:
def extract_feature (conj1, conj2):
    """Build the pair feature row for two statement vectors.

    The row is the concatenation of the element-wise `and`, `or` and
    inequality of the two vectors, so it is three times as long as the
    zip of the inputs.
    """
    both = land(conj1, conj2)
    either = lor(conj1, conj2)
    differ = lxor(conj1, conj2)
    return both + either + differ

In [46]:
import random
# negative cases
#
# For every conjecture in train_dep, the candidate negatives are the statements
# that come before it in `chronology` and are not among its listed premises.
# At most MAX candidates per conjecture are kept (random sample); each kept
# (conjecture, candidate) pair becomes one row via extract_feature, and all
# rows are stacked and saved to TRAINX_N.

# shuffle() works in place and returns None, so the list has to be bound to a
# name first; assigning shuffle's return value would hand None to array_split.
subsections = list(train_dep.keys())
shuffle(subsections)
subsections = np.array_split(subsections, 10)
prefix = 'trainX' + str(MAX) + '_'
# The leading 0-row matrix fixes the column count and keeps the stacked result
# well defined even when no negative row is produced at all.
negative_parts = [csc_matrix((0, len(fts) * 3))]
for subi, conjs in enumerate(subsections):
    print("PART ", subi)
    if (len(conjs) == 0):
        continue
    sample_negs = []
    for c, conj in enumerate(conjs):
        if (c % 10 == 0):
            print("conjunction no : ", c ," , named ", conj, " with chrono no ", map_chrono[conj])
        # Set lookup instead of scanning the premise list for every candidate.
        premises = set(train_dep[conj])
        candidates = [pr for pr in chronology[:map_chrono[conj]]
                      if pr not in premises]
        # Sample the candidate names first so feature rows are only built for
        # the candidates that are actually kept.
        if len(candidates) > MAX :
            candidates = sample(candidates, MAX)
        sample_negs += [extract_feature(stmts[conj], stmts[prem])
                        for prem in candidates]
    # csc_matrix([]) has shape (1, 0) and cannot be stacked with the other
    # parts, so chunks that produced no rows are skipped.
    if sample_negs:
        negative_parts.append(csc_matrix(sample_negs))
negatives = vstack(negative_parts)
save_npz(TRAINX_N, csc_matrix(negatives))

PART  0
conjunction no :  0  , named  t3_xboole_0  with chrono no  27
conjunction no :  10  , named  t20_xboole_1  with chrono no  63
conjunction no :  20  , named  t59_xboole_1  with chrono no  107
conjunction no :  30  , named  t117_xboole_1  with chrono no  171
conjunction no :  40  , named  t31_zfmisc_1  with chrono no  325
conjunction no :  50  , named  t114_zfmisc_1  with chrono no  400
conjunction no :  60  , named  t32_subset_1  with chrono no  496
conjunction no :  70  , named  t60_setfam_1  with chrono no  624
conjunction no :  80  , named  t126_relat_1  with chrono no  771
conjunction no :  90  , named  t35_funct_1  with chrono no  922
conjunction no :  100  , named  t32_ordinal1  with chrono no  1089
conjunction no :  110  , named  t19_mcart_1  with chrono no  1309
PART  1
conjunction no :  0  , named  t52_mcart_1  with chrono no  1351
conjunction no :  10  , named  t97_mcart_1  with chrono no  1416
conjunction no :  20  , named  t83_funct_2  with chrono no  1556
conjunctio

In [54]:
# positive cases
# One row per (conjecture, premise) pair listed in train_dep, built in ten
# chunks; the wall-clock time of each chunk is printed.
positives = csc_matrix((0, len(fts) * 3))
for conjs in np.array_split(list(train_dep.keys()), 10):
    start = time.time()
    chunk_rows = []
    for conj in conjs:
        for premise in train_dep[conj]:
            chunk_rows.append(extract_feature(stmts[conj], stmts[premise]))
    positives = vstack([positives, csc_matrix(chunk_rows)])
    end = time.time()
    print("took: ", end - start)
save_npz(TRAINX_P, csc_matrix(positives))

took:  70.63029766082764
took:  77.22966027259827
took:  65.01791667938232
took:  82.07144784927368
took:  188.5096971988678
took:  121.4633572101593
took:  113.22412776947021
took:  69.0822160243988
took:  14.608438730239868
took:  57.89383339881897


In [None]:
# Total number of premise entries over all conjectures in train_dep.
deps = sum(len(premises) for premises in train_dep.values())
deps

In [55]:
# combine whole train set and save it
# Positive rows are stacked first, so the labels are all 1s followed by all 0s.
TRAINX = f'fullTrainX{MAX}'
TRAINY = f"fullTrainY{MAX}"
n_positive, n_negative = positives.shape[0], negatives.shape[0]
trainX = vstack([positives, negatives])
trainY = [1] * n_positive + [0] * n_negative
save_npz(TRAINX, trainX)
with open(TRAINY, "wb") as label_file:   # Pickling
    pickle.dump(trainY, label_file)

In [None]:
# NOTE(review): this cell held the unfinished statement `TrainY +=`, which is a
# SyntaxError and names `TrainY` although the label list in this notebook is
# `trainY`. It is commented out until the intended right-hand side is known.
# TrainY +=

In [None]:
# Display the repr of the positive-example matrix as a quick sanity check.
positives

In [None]:
# Display the repr of the negative-example matrix as a quick sanity check.
negatives