In [14]:
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import csr_matrix, bmat, vstack, save_npz, csc_matrix, load_npz
from random import sample, shuffle
import numpy as np

import time
import pickle

# MAX is the per-conjecture cap used when sampling negative examples below;
# it is also embedded in the names of the saved positive/negative matrices.
MAX = 500
TRAINX_P = 'trainX_POSITIVES{}'.format(MAX)
TRAINX_N = 'trainX_NEGATIVES{}'.format(MAX)

In [15]:
DATA = 'data/'


def _read_lines(name):
    """Return the lines of the file ``DATA + name`` without line terminators."""
    with open(DATA + name, 'r') as handle:
        return handle.read().splitlines()


# Read data
statements = _read_lines('statements')
# A statement's name is the text before the first comma, minus the leading 'fof('.
stmts_names = [line.split(',')[0][len('fof('):] for line in statements]

chronology = _read_lines('chronology')

# features: name -> list of feature strings, parsed from "name:feat1, feat2, ..."
features = {}
for line in _read_lines('features'):
    fields = line.split(':')
    features[fields[0]] = fields[1].split(', ')

# train_dep: statement -> list of distinct premises, parsed from "stmt:prm1 prm2 ...".
# A statement may appear on several lines; its premises are merged.
train_dep_lines = _read_lines('dependencies_train')
train_dep = {}
for line in train_dep_lines:
    fields = line.split(':')
    train_dep.setdefault(fields[0], []).extend(fields[1].split(' '))
for stmt, prms in train_dep.items():
    # De-duplicate while keeping first-seen order (dicts keep insertion order on
    # Python 3.7+). A plain set() would give an order that depends on string hash
    # randomization and therefore changes between interpreter sessions.
    train_dep[stmt] = list(dict.fromkeys(prms))

conj_test = _read_lines('conjectures_test')

In [16]:
# Map each chronology entry to its position in the chronology list
# (if an entry repeats, its last position is kept).
map_chrono = dict(zip(chronology, range(len(chronology))))

In [17]:
def flatten(t):
    """Concatenate the items of every sub-iterable of ``t`` into a single list."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat


def land(l1, l2):
    """Return the pairwise ``a and b`` of the zipped inputs as a list."""
    result = []
    for left, right in zip(l1, l2):
        result.append(left and right)
    return result


def lor(l1, l2):
    """Return the pairwise ``a or b`` of the zipped inputs as a list."""
    result = []
    for left, right in zip(l1, l2):
        result.append(left or right)
    return result


def lxor(l1, l2):
    """Return the pairwise ``a != b`` (booleans) of the zipped inputs as a list."""
    result = []
    for left, right in zip(l1, l2):
        result.append(left != right)
    return result

# Prepare features sparse vectors
# fts fixes the column order of every feature vector built later. It is sorted so
# that the order is identical in every interpreter session: iterating a set of
# strings depends on hash randomization, which would make matrices saved from
# different sessions use different column layouts.
fts = sorted(set(flatten(list(features.values()))))
# Feature string -> its column index in fts.
map_fts = {f : i for i, f in enumerate(fts)}

In [None]:
# Now I will convert statements to vectors
# stmts: statement name -> 0/1 list with one entry per feature in fts
# (1 when the statement has that feature).
stmts = {}
for st in stmts_names:
    # Set membership avoids rescanning the statement's feature list for every
    # entry of fts; the resulting vector is the same.
    present = set(features[st])
    stmts[st] = [int(ft in present) for ft in fts]

In [None]:
def extract_feature(conj1, conj2):
    """Build the joint feature list for a pair of statement vectors.

    The result is the concatenation of three pairwise combinations of the two
    inputs, in this order: ``land`` (and), ``lor`` (or), ``lxor`` (not equal).
    """
    both = land(conj1, conj2)
    either = lor(conj1, conj2)
    differ = lxor(conj1, conj2)
    return both + either + differ

In [13]:
# negative cases
# For each conjecture in train_dep, the candidate negatives are the statements
# that come before it in `chronology` and are not listed among its dependencies.
# At most MAX of them are kept per conjecture (chosen at random).
all_deps = list(train_dep.keys())
shuffle(all_deps)
subsections = np.array_split(all_deps, 10)
prefix = 'trainX' + str(MAX) + '_'
n_cols = len(fts) * 3  # extract_feature concatenates three vectors of len(fts)
negative_parts = [csc_matrix((0, n_cols))]
for subi, conjs in enumerate(subsections):
    print("PART ", subi)
    if len(conjs) == 0:
        break
    sample_negs = []
    for c, conj in enumerate(conjs):
        if c % 10 == 0:
            print("conjunction no : ", c ," , named ", conj, " with chrono no ", map_chrono[conj])
        known_deps = set(train_dep[conj])
        candidates = [pr for pr in chronology[:map_chrono[conj]]
                      if pr not in known_deps]
        # Sample the premise names first, so feature vectors are only built for
        # the rows that are actually kept.
        if len(candidates) > MAX:
            candidates = sample(candidates, MAX)
        sample_negs.extend(extract_feature(stmts[conj], stmts[prem])
                           for prem in candidates)
    # csc_matrix([]) would be a 1x0 matrix that cannot be stacked with the
    # others, so a chunk that produced no rows is skipped.
    if sample_negs:
        negative_parts.append(csc_matrix(sample_negs))
negatives = vstack(negative_parts)
save_npz(TRAINX_N, csc_matrix(negatives))

TypeError: object of type 'NoneType' has no len()

In [None]:
# positive cases
# One row per (conjecture, premise) pair recorded in train_dep.

n_cols = len(fts) * 3  # extract_feature concatenates three vectors of len(fts)
positive_parts = [csc_matrix((0, n_cols))]
for conjs in np.array_split(list(train_dep.keys()), 10):
    start = time.time()
    rows = [extract_feature(stmts[conj], stmts[premise])
            for conj in conjs
            for premise in train_dep[conj]]
    # csc_matrix([]) would be a 1x0 matrix that cannot be stacked with the
    # others, so a chunk that produced no rows is skipped.
    if rows:
        positive_parts.append(csc_matrix(rows))
    end = time.time()
    print("took: ", end - start)
# Stack once at the end instead of re-stacking the growing matrix per chunk.
positives = vstack(positive_parts)
save_npz(TRAINX_P, csc_matrix(positives))

In [None]:
# combine whole train set and save it
# Rows of `positives` come first (label 1), followed by rows of `negatives` (label 0).
name_suffix = str(MAX) + "_61910"
TRAINX = 'fullTrainX' + name_suffix
TRAINY = "fullTrainY" + name_suffix
n_pos = positives.shape[0]
n_neg = negatives.shape[0]
trainY = [1] * n_pos + [0] * n_neg
trainX = vstack([positives, negatives])
save_npz(TRAINX, trainX)
with open(TRAINY, "wb") as label_file:   # Pickling
    pickle.dump(trainY, label_file)