In [None]:
import csv
import io
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import utils
import xxhash

from imblearn.over_sampling import SMOTE
from random import sample, shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neighbors import NearestNeighbors
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GlobalAveragePooling1D
#from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow_addons.metrics import F1Score

In [None]:
# Name of the dataset used as the test set; it is looked up in DATASET_INDICES,
# so it must be one of the dataset names defined in DATASET_PATHS.
TEST_SET = 'albergate'

'''
Ratio defining how much data is reserved for CV testing. Ex: 0.8 is an 80/20 train/test split
Float on interval (0.0, 1.0)
'''
TRAIN_TEST_SPLIT = 0.9

'''
Percentage of cross training data to hold out for testing. This test set will not change across folds,
but will change across trials.
Ex. ratio=0.1 means a 90/10 split where 90% of a cross training dataset is available for training and 10% is held for testing.
'''
CROSS_TRAIN_HOLDOUT_RATIO = 0.1

'''
Percentage of dataset to use for cross training. NOTE: this can not be larger that 1 - CROSS_TRAIN_HOLDOUT_RATIO
Ex. ratio=0.1 means a training set 10% of the ORIGINAL dataset is used for cross training. The training data will never overlap with the holdout data.

'''
CROSS_TRAIN_RATIO = 0.1

'''
Ratio defining how many synthetic minority samples should be created. 1.0 results in a fully balanced set.
Float on interval (0.0, 1.0]
'''
BALANCE_RATIO = 1.0

# Number of cross validation folds (passed as n_splits to StratifiedShuffleSplit).
# NOTE(review): the stated default is 10, but the notebook currently runs with 1.
N_FOLDS = 1 

# Number of trials of n fold cv
N_TRIALS = 1 

# Training epochs
N_EPOCHS=10

# Training batch size
BATCH_SIZE = 32

# Dimension of the embedding layer. 
EMBEDDING_DIM = 8

# Metrics used to measure training performance; these names key the per-fold
# and per-trial result dictionaries.
METRICS = ['loss', 'binary_accuracy', 'recall', 'precision', 'f1_score']

# TensorBoard callback; its logs are written to the 'logs' folder
TENSORBOARD_CALLBACK = tf.keras.callbacks.TensorBoard(log_dir='logs')

# Verbosity: 0 = silent, 1 = progress bar, 2 = one line per epoch
VERBOSITY = 1

In [None]:
# Every dataset lives in data/<name>/ as a (<name>_data.txt, <name>_labels.txt) pair.
DATASET_PATHS = {
    name: (
        'data/' + name + '/' + name + '_data.txt',
        'data/' + name + '/' + name + '_labels.txt',
    )
    for name in ('albergate', 'eanci', 'etour', 'itrust', 'kepler', 'modis', 'smos')
}

In [None]:
def format_output(trial_averages):
    """Turn per-trial metric lists into two DataFrames.

    Args:
        trial_averages: dict mapping a metric name to its list of per-trial values.

    Returns:
        (trials, averages): `trials` has one column per metric and one row per
        trial. `averages` has one '<metric>_avg' column per metric and is indexed
        by those same '<metric>_avg' names, so every row repeats the same values.
    """
    # Mean of each metric over all trials
    result_averages = {
        metric + '_avg': sum(results) / len(results)
        for metric, results in trial_averages.items()
    }

    trials_frame = pd.DataFrame(trial_averages)
    averages_frame = pd.DataFrame(result_averages, index=result_averages.keys())
    return trials_frame, averages_frame

In [None]:
def plot_f1scores(averages, datasets, phase):
    """Draw and save a heatmap of F1 scores for every primary/cross-train pairing.

    Args:
        averages: dict keyed by '<primary>|<cross_train>' holding the F1 score
            of that pairing.
        datasets: iterable of dataset names; not modified.
        phase: label used in the chart title (upper-cased) and in the output
            file name 'trials/<phase>_heatmap'.
    """
    # Columns run alphabetically, rows in reverse alphabetical order
    x_datasets = sorted(datasets)
    y_datasets = x_datasets[::-1]

    # One row per cross-train set; a dataset paired with itself is shown as 0.0
    f1_scores = np.array([
        [
            float(format(0, '.2f')) if dataset == cross_train_set
            else float(format(averages[dataset+'|'+cross_train_set], '.2f'))
            for dataset in x_datasets
        ]
        for cross_train_set in y_datasets
    ])

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.imshow(f1_scores, cmap='Greys')

    # Axis labels and chart title
    ax.set_xlabel('Primary Set')
    ax.set_ylabel('Cross Train Set')
    ax.set_title(phase.upper()+' '+"F1 Scores")

    # One tick per dataset, labelled with the dataset name
    ax.set_xticks(np.arange(len(x_datasets)))
    ax.set_yticks(np.arange(len(y_datasets)))
    ax.set_xticklabels(x_datasets)
    ax.set_yticklabels(y_datasets)

    # Slant the x tick labels so the names do not overlap
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write each score inside its cell (x is the column, y is the row)
    for (row, col), score in np.ndenumerate(f1_scores):
        ax.text(col, row, score, ha="center", va="center", color="r")

    fig.tight_layout()
    plt.savefig('trials/'+phase+'_heatmap')

In [None]:
def _mean_of_results(results, nested):
    """Collapse one metric's results into a single number.

    `results` is either a sequence of values (averaged) or a single value
    (returned as is). With `nested=True` each element is itself indexable and
    its first item is the value, which is how F1Score reports its results.
    """
    try:
        if nested:
            return sum(result[0] for result in results) / len(results)
        return sum(results) / len(results)
    except (TypeError, IndexError):
        # Not a sequence of values: `results` is already the single value
        # (or, for nested results, a one-element container holding it).
        return results[0] if nested else results


def save_results(fold_results, history):
    """Append the results of the current fold to the running metric history.

    Args:
        fold_results: dict mapping a reported metric name to its results,
            either a list of values or a single value.
        history: dict mapping a base metric name to a list; it is updated in
            place with one number per matching metric.

    Returns:
        The same `history` dict that was passed in.

    Raises:
        ZeroDivisionError: if a metric reports an empty list of values.

    Only TypeError and IndexError are treated as "this is a single value";
    any other exception (including KeyboardInterrupt) now propagates instead
    of being silently swallowed.
    """
    for current_metric, results in fold_results.items():
        for metric in history.keys():
            # Reported names may carry a _(run number) suffix, hence the
            # substring match against the base metric names.
            if metric not in current_metric:
                continue
            history[metric].append(_mean_of_results(results, metric == 'f1_score'))
            break
    return history

In [None]:
'''
Read every dataset listed in DATASET_PATHS into DATASETS as a (metadocs, labels) pair.
'''

DATASETS = {}

for set_name, paths in DATASET_PATHS.items():
    # Metadocs: the first field of every row in the data file
    with open(paths[0], newline='') as datafile:
        data_reader = csv.reader(datafile, delimiter='\n')
        data = [row[0] for row in data_reader]

    # Labels: the first field of every row in the label file, as an int
    with open(paths[1], newline='') as labelfile:
        label_reader = csv.reader(labelfile, delimiter='\n')
        labels = [int(row[0]) for row in label_reader]

    DATASETS[set_name] = (data, labels)

In [None]:
'''
The datasets are later concatenated into one corpus.
Record the (first, last) corpus position of each dataset, both inclusive,
so the test and train sets can be sliced out of the corpus.
'''

DATASET_INDICES = {}
index = -1  # corpus position of the last metadoc placed so far

for set_name, data in DATASETS.items():
    n_docs = len(data[0])
    first, last = index + 1, index + n_docs
    DATASET_INDICES[set_name] = (first, last)
    print(set_name, DATASET_INDICES[set_name])
    index = last

In [None]:
'''
Concatenate the metadocs of every dataset into one corpus and collect the combined vocabulary.
'''

num_docs = 0
corpus = []
vocabulary = []

for dataset in DATASETS.keys():
    documents = DATASETS[dataset][0]
    corpus += documents
    vocabulary += utils.get_vocabulary(documents)
    num_docs += len(documents)

# Drop duplicate terms
vocabulary = list(set(vocabulary))

print('Docs processed:', num_docs)
print('Corpus size:', len(corpus))
print('Vocabulary size:', len(vocabulary))

In [None]:
# Spot check: print the first and last metadoc of each dataset as found in the corpus
for dataset, (first, last) in DATASET_INDICES.items():
    print(dataset, corpus[first], corpus[last])

In [None]:
'''
Create tf-idf vectorizer and encode corpus.
'''

vectorizer = TfidfVectorizer()
# fit_transform returns a SciPy sparse matrix with one row per metadoc.
# Convert it to a dense numpy array so the rows can be iterated and indexed.
# .toarray() is the explicit form of the `.A` shortcut, which recent SciPy
# releases deprecate.
ENCODED_DATASETS = vectorizer.fit_transform(corpus).toarray()

print('Encoded corpus size:', len(ENCODED_DATASETS))

In [None]:
'''
Sanity check: count the encoded metadocs whose entries are all 0.
'''

total = sum(1 for metadoc in ENCODED_DATASETS if np.count_nonzero(metadoc) == 0)

print('Meta docs with all 0s:', total)

In [None]:
'''
Check for words lost in tf-idf encoding.
Reference sklearn documentation for tf-idf specs.
'''

# get_feature_names() is not available in newer scikit-learn releases, which
# provide get_feature_names_out() instead; use whichever this version offers.
# The result is kept as a list in both cases.
if hasattr(vectorizer, 'get_feature_names_out'):
    encoded_vocabulary = list(vectorizer.get_feature_names_out())
else:
    encoded_vocabulary = vectorizer.get_feature_names()

print('Encoded vocabulary size:', len(encoded_vocabulary))

# Disabled check, kept as comments so it is no longer rendered as cell output:
# print('Terms missing from encoded vocabulary:')
#
# for term in vocabulary:
#     if term not in encoded_vocabulary:
#         print(term)

In [None]:
'''
Find encoded metadocs that occur more than once.
Their row indices are collected so duplicates can be kept out of the test and train sets.
'''

unq, count = np.unique(ENCODED_DATASETS, axis=0, return_counts=True)
repeated_groups = unq[count > 1]
repeated_indices = []

for repeated_group in repeated_groups:
    # Rows of the encoded corpus equal to this repeated vector
    matches_group = np.all(ENCODED_DATASETS == repeated_group, axis=1)
    repeated_idx = np.argwhere(matches_group)
    repeated_indices.extend(repeated_idx.ravel())

print('Number of repeated items:', len(repeated_indices))

In [None]:
'''
Collision info: group every encoded metadoc by the hash of its bytes,
recording the (dataset name, row within dataset) of each member.
'''

ENCODED_COLLISIONS = {}
row_num,k = 0,0
set_names = list(DATASETS.keys())

for i in range(len(ENCODED_DATASETS)):

    # Past the last row of the current set: continue with the next set
    if row_num >= len(DATASETS[set_names[k]][0]):
        row_num = 0
        k += 1

    collision_key = str(hash(bytes(ENCODED_DATASETS[i].data)))
    ENCODED_COLLISIONS.setdefault(collision_key, []).append((set_names[k], row_num))

    row_num += 1

In [None]:
# Per dataset, count the metadocs that share their hash with at least one other metadoc
totals = {name: 0 for name in set_names}

for coll_list in ENCODED_COLLISIONS.values():
    if len(coll_list) <= 1:
        continue
    for entry in coll_list:
        totals[entry[0]] += 1

print(totals)

In [None]:
'''
We need to retrieve the label of an encoded metadoc after clustering.
Storing labels in dict using hashed encoded metadocs as keys.
'''

ENCODED_LABELS = {}
row_num,k = 0,0
set_names = list(DATASETS.keys())
# Set copy of the repeated row indices for constant-time membership tests
repeated_lookup = set(repeated_indices)

for i in range(len(ENCODED_DATASETS)):
    # Move to the dataset that owns row i BEFORE deciding whether to skip it.
    # Doing this after the skip let row_num run past the end of a set when the
    # first rows of the next set were repeated items, so the rows after them
    # were given the labels of the wrong rows.
    while row_num >= len(DATASETS[set_names[k]][0]):
        row_num = 0
        k += 1

    # If the encoded metadoc is a repeated item, skip it
    if i in repeated_lookup:
        row_num += 1
        continue

    # Spot check: show the label of the first and last row of each set
    if i == DATASET_INDICES[set_names[k]][0] or i == DATASET_INDICES[set_names[k]][1]:
        print(set_names[k], DATASETS[set_names[k]][1][row_num])

    # bytes() copies the row, so the digest is computed once and reused
    digest = xxhash.xxh32(bytes(ENCODED_DATASETS[i].data)).digest()

    # A hash collision is reported but does not stop the loop; the later row's
    # label replaces the earlier one.
    if digest in ENCODED_LABELS:
        print('Hash collision encountered on key:', digest)

    # Store label of encoded metadoc
    ENCODED_LABELS[digest] = DATASETS[set_names[k]][1][row_num]

    row_num += 1

print('Encoded labels stored:', len(ENCODED_LABELS.keys()))

In [None]:
'''
Collect the corpus indices of the test set (TEST_SET) and of the training pool
(every other dataset), leaving out repeated items.
'''

test_first, test_last = DATASET_INDICES[TEST_SET]
test_indices = [
    i for i in range(test_first, test_last+1)
    if i not in repeated_indices
]

train_indices = [
    i
    for set_name in DATASET_INDICES.keys()
    if set_name != TEST_SET
    for i in range(DATASET_INDICES[set_name][0], DATASET_INDICES[set_name][1]+1)
    if i not in repeated_indices
]

print('Training data available:', len(train_indices))
print('Test set size:', len(test_indices))
print('Total set size:', len(train_indices) + len(test_indices))

# numpy arrays allow the index lists to be used for slicing
train_indices = np.array(train_indices)
test_indices = np.array(test_indices)

In [None]:
# Sanity check: number of training indices that also appear in the test indices
i = int(np.count_nonzero(np.isin(train_indices, test_indices)))
print(i)

In [None]:
'''
Compute the centroid (column-wise mean) of the encoded test set; it is the query point for the kNN step.
'''

# Shaped (1, n_features) so it can be passed as a single query row
test_set_centroid = ENCODED_DATASETS[test_indices].mean(axis=0).reshape(1, -1)

print('Centroid dimensions:', test_set_centroid.shape)
print('Number of nonzero entries(sanity check):', np.count_nonzero(test_set_centroid))

In [None]:
'''
Select the training rows nearest to the test set centroid with kNN.
'''

# kNN returns positions relative to the array it was fitted on, so keep a
# standalone copy of the training rows to index with those positions.
training_data = ENCODED_DATASETS[train_indices].copy()

# Ask for 30% of the available training rows
n_neighbors = int(len(training_data) * 0.3)
knn_extractor = NearestNeighbors(n_neighbors=n_neighbors)
knn_extractor.fit(training_data)

knn_indices = knn_extractor.kneighbors(test_set_centroid, return_distance=False)

In [None]:
# Number of rows in the training pool that was handed to kNN
print(len(training_data))

In [None]:
'''
Create train and test set label arrays.
'''

training_labels = []
test_labels = []

# Labels are looked up by hashing each encoded row, mirroring how ENCODED_LABELS
# was keyed. knn_indices are positions within training_data.
# (A stray `count += 1` was removed here: `count` is never initialised in this
# cell, so it only worked by mutating a variable leaked from another cell.)
for index in np.nditer(knn_indices):
    training_labels.append(ENCODED_LABELS[xxhash.xxh32(bytes(training_data[index].data)).digest()])

# test_indices are positions within the full encoded corpus
for index in np.nditer(test_indices):
    test_labels.append(ENCODED_LABELS[xxhash.xxh32(bytes(ENCODED_DATASETS[index].data)).digest()])

print('Training labels:', len(training_labels))
print('Test labels:', len(test_labels))

training_labels = np.array(training_labels)
test_labels = np.array(test_labels)

In [None]:
# Flatten the kNN result into a one-dimensional index array
temp = list(np.nditer(knn_indices))

knn_indices = np.array(temp)
print(knn_indices.shape)

In [None]:
 # Perform SMOTE on loaded dataset
# SMOTE draws synthetic samples at random; fix its seed so that a
# Restart & Run All reproduces the same balanced training set.
balanced_training_data, balanced_training_labels = SMOTE(
    sampling_strategy=BALANCE_RATIO,
    random_state=123,
).fit_resample(training_data[knn_indices], training_labels)
print('Balanced training set size:', len(balanced_training_data))

In [None]:
# Train and evaluate one model per split of the SMOTE-balanced, kNN-filtered
# training data. Each model is scored on its own training split, on the
# held-out validation split, and on the encoded TEST_SET rows.
skf = StratifiedShuffleSplit(n_splits=N_FOLDS, train_size=TRAIN_TEST_SPLIT, random_state=123)
k=1  # fold counter, only used in the printed banner

#for train, test in skf.split(training_data[knn_indices], training_labels):
for train, test in skf.split(balanced_training_data, balanced_training_labels):

    # This will cause the model to build an index of strings to integers.
    # Per TF: It's important to only use training data when creating vocabulary (using the test set would leak information).
    #vectorize_layer.set_vocabulary(utils.get_vocabulary(training_data[train]))
    #input_dim = len(vectorize_layer.get_vocabulary())

    # Embed vocabulary into embedding_dim dimensions.
    # Embedding tutorial uses size, Text Classification tutorial uses size + 1 for input_dim
    # NOTE(review): the rows fed to this model are tf-idf vectors, not integer
    # token ids, while an Embedding layer looks up rows by integer index —
    # confirm that this input is what the layer is meant to receive.
    embedding_layer = tf.keras.layers.Embedding(len(encoded_vocabulary), EMBEDDING_DIM, name='embedding')

    # Define model structure
    model = Sequential([
        #vectorize_layer,
        embedding_layer,
        #Dropout(0.2),
        GlobalAveragePooling1D(),
        #Dropout(0.2),
        Dense(16, activation='relu'),
        #Dense(128, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    # Create model
    model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), # tutorials use true for training, false for production
              metrics=[
                  tf.metrics.BinaryAccuracy(threshold=0.5),
                  tf.keras.metrics.Recall(),
                  tf.keras.metrics.Precision(),
                  F1Score(1, threshold=0.5)
              ]
    )

    print('\n\n*************** FOLD %d ***************' %k)

    print('\n******* TRAIN *******')
    # Train model
    train_results = model.fit(
        balanced_training_data[train],
        balanced_training_labels[train],
        batch_size=BATCH_SIZE, 
        epochs=N_EPOCHS,
        #callbacks=[TENSORBOARD_CALLBACK],
        verbose=VERBOSITY
    )

    print('\n******* VALIDATION *******')
    # Test model with validation data
    validation_results = model.evaluate(
        balanced_training_data[test],
        balanced_training_labels[test],
        #callbacks=[TENSORBOARD_CALLBACK],
        return_dict=True,
        verbose=VERBOSITY
    )

    print('\n******* FILTER TEST *******')
    # Test model on the held-out TEST_SET rows of the encoded corpus
    ct_results = model.evaluate(
        ENCODED_DATASETS[test_indices],
        test_labels,
        #callbacks=[TENSORBOARD_CALLBACK],
        return_dict=True,
        verbose=VERBOSITY
    )
    k += 1

In [None]:
# One cross-training seed is needed per ordered dataset pair per trial,
# and one cross-validation seed per trial.
n_sets = len(DATASETS.keys())
n_ct_seeds = n_sets * (n_sets-1) * N_TRIALS

RANDOM_CT_SEEDS = set()
RANDOM_CV_SEEDS = set()

# Sets are used while drawing so that every seed is unique
while len(RANDOM_CT_SEEDS) < n_ct_seeds:
    RANDOM_CT_SEEDS.add(np.random.randint(10000))

while len(RANDOM_CV_SEEDS) < N_TRIALS:
    RANDOM_CV_SEEDS.add(np.random.randint(1000))

# The cross-training seeds are consumed one at a time with next();
# the cross-validation seeds are iterated as a list.
RANDOM_CT_SEEDS = iter(list(RANDOM_CT_SEEDS))
RANDOM_CV_SEEDS = list(RANDOM_CV_SEEDS)

In [None]:
# Use the text vectorization layer to normalize, split, and map strings to integers.
# The TextVectorization import at the top of the notebook is commented out, so
# the bare name is undefined on a fresh kernel. Resolve the class through `tf`:
# the current location first, then the experimental path named in that import.
TextVectorization = getattr(tf.keras.layers, 'TextVectorization', None)
if TextVectorization is None:
    TextVectorization = tf.keras.layers.experimental.preprocessing.TextVectorization
vectorize_layer = TextVectorization()

# Average F1 score of every '<primary>|<cross_train>' pairing, per phase
train_f1scores = {}
validation_f1scores = {}
ct_f1scores = {}

# For every ordered (primary, cross-train) pair of datasets, run N_TRIALS
# trials of N_FOLDS shuffled splits, write the per-trial metrics and their
# averages to an Excel workbook, and record the average F1 score per phase.
for primary_set, data_labels in DATASETS.items():
    for cross_train_set, ct_data_labels in DATASETS.items():
        # A dataset is never cross-trained against itself
        if primary_set == cross_train_set:
            continue

        print('\n\n*************************', primary_set.upper(), '+', cross_train_set.upper(), '*************************')

        # Store metric averages for each trial
        train_averages = dict([(metric,[]) for metric in METRICS])
        validation_averages = dict([(metric,[]) for metric in METRICS])
        ct_averages = dict([(metric,[]) for metric in METRICS])

        # Perform N_TRIALS of N_FOLDS CV
        for i,RANDOM_SEED in enumerate(RANDOM_CV_SEEDS):
            print('\n\n******************** TRIAL %d ********************' %(i+1))

            # NOTE(review): mix_data is not defined anywhere in the visible
            # notebook — presumably it lives in `utils` or in a removed cell;
            # confirm, otherwise this line raises NameError on a fresh kernel.
            # RANDOM_CT_SEEDS is an iterator, so re-running this cell without
            # regenerating the seeds will eventually raise StopIteration.
            training_data, labels, ct_test_data, ct_test_labels = mix_data(data_labels, ct_data_labels, CROSS_TRAIN_HOLDOUT_RATIO, next(RANDOM_CT_SEEDS))

            # Convert data and labels to numpy arrays for training and testing
            #training_data = np.array(training_data, dtype=object)
            #labels = np.array(labels)

            k=1 # Fold counter
            # Store metric averages for each fold of a single trial
            train_history = dict([(metric,[]) for metric in METRICS])
            validation_history = dict([(metric,[]) for metric in METRICS])
            ct_history = dict([(metric,[]) for metric in METRICS])

            skf = StratifiedShuffleSplit(n_splits=N_FOLDS, train_size=TRAIN_TEST_SPLIT, random_state=RANDOM_SEED)

            for train, test in skf.split(training_data, labels):

                # This will cause the model to build an index of strings to integers.
                # Per TF: It's important to only use training data when creating vocabulary (using the test set would leak information).
                vectorize_layer.set_vocabulary(utils.get_vocabulary(training_data[train]))
                input_dim = len(vectorize_layer.get_vocabulary())

                # Embed vocabulary into embedding_dim dimensions.
                # Embedding tutorial uses size, Text Classification tutorial uses size + 1 for input_dim
                embedding_layer = tf.keras.layers.Embedding(input_dim, EMBEDDING_DIM, name='embedding')

                # Define model structure
                model = Sequential([
                    vectorize_layer,
                    embedding_layer,
                    #Dropout(0.2),
                    GlobalAveragePooling1D(),
                    #Dropout(0.2),
                    Dense(16, activation='relu'),
                    Dense(1, activation='sigmoid')
                ])

                # Create model
                model.compile(optimizer='adam',
                          loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), # tutorials use true for training, false for production
                          metrics=[
                              tf.metrics.BinaryAccuracy(threshold=0.5),
                              tf.keras.metrics.Recall(),
                              tf.keras.metrics.Precision(),
                              F1Score(1, threshold=0.5)
                          ]
                )

                print('\n\n*************** FOLD %d ***************' %k)

                print('\n******* TRAIN *******')
                # Train model
                train_results = model.fit(
                    training_data[train],
                    labels[train],
                    batch_size=BATCH_SIZE,
                    epochs=N_EPOCHS,
                    callbacks=[TENSORBOARD_CALLBACK],
                    verbose=VERBOSITY
                )

                print('\n******* VALIDATION *******')
                # Test model with validation data
                validation_results = model.evaluate(
                    training_data[test],
                    labels[test],
                    callbacks=[TENSORBOARD_CALLBACK],
                    return_dict=True,
                    verbose=VERBOSITY
                )

                print('\n******* CT TEST *******')
                # Test model with cross train data
                ct_results = model.evaluate(
                    ct_test_data,
                    ct_test_labels,
                    callbacks=[TENSORBOARD_CALLBACK],
                    return_dict=True,
                    verbose=VERBOSITY
                )

                train_history = save_results(train_results.history, train_history)
                validation_history = save_results(validation_results, validation_history)
                ct_history = save_results(ct_results, ct_history)

                # If we are in the last fold of the trial, average the metric results
                # across all n folds and append to trial_averages
                if k == N_FOLDS:
                    for metric, results in train_history.items():
                        train_averages[metric].append(sum(results)/len(results))
                    for metric, results in validation_history.items():
                        validation_averages[metric].append(sum(results)/len(results))
                    for metric, results in ct_history.items():
                        ct_averages[metric].append(sum(results)/len(results))

                k += 1

        RESULTS_FILE = 'trials/'+primary_set+'_'+str(CROSS_TRAIN_RATIO)+'_'+cross_train_set+'_'+str(N_TRIALS)+'_T_'+ str(N_FOLDS)+'_fCV.xlsx'

        # Write all results to one Excel workbook. For each phase, a "Trials"
        # sheet holds the metric averages of every trial and an "Averages"
        # sheet holds the averages across all trials.
        # The workbook is opened once: reopening it in append mode for every
        # sheet is not supported by every pandas Excel engine and re-reads the
        # file each time.
        with pd.ExcelWriter(RESULTS_FILE) as writer:
            for sheet_prefix, phase_averages in (
                ('Training', train_averages),
                ('Validation', validation_averages),
                ('CT', ct_averages),
            ):
                trial_table, averages_table = format_output(phase_averages)
                trial_table.to_excel(writer, sheet_name=sheet_prefix+' Trials')
                # Every row of averages_table repeats the same averages, so the first row is enough
                averages_table.iloc[0].to_excel(writer, sheet_name=sheet_prefix+' Averages', header=False)

        train_f1scores[primary_set+'|'+cross_train_set] = sum(train_averages['f1_score'])/len(train_averages['f1_score'])
        validation_f1scores[primary_set+'|'+cross_train_set] = sum(validation_averages['f1_score'])/len(validation_averages['f1_score'])
        ct_f1scores[primary_set+'|'+cross_train_set] = sum(ct_averages['f1_score'])/len(ct_averages['f1_score'])

In [None]:
# One heatmap per phase, each saved under trials/<phase>_heatmap
for phase_scores, phase_name in (
    (train_f1scores, 'training'),
    (validation_f1scores, 'validation'),
    (ct_f1scores, 'cross training'),
):
    plot_f1scores(phase_scores, list(DATASETS.keys()), phase_name)

In [None]:
# Display model information for `model`, i.e. whichever model was built last
# by the training cells above.
model.summary()

In [None]:
# Pull the trained weights of the layer named 'embedding' from the last model,
# together with the vocabulary currently held by the vectorization layer.
embedding_weights = model.get_layer('embedding').get_weights()
weights = embedding_weights[0]
vocab = vectorize_layer.get_vocabulary()

In [None]:
# Destination files for the exported embedding vectors and their matching words.
# NOTE(review): both paths are hard-coded to the smos folder regardless of which
# datasets the last trained model used — confirm this is intended.
OUTPUT_VECTOR_FILE = 'data/smos/smos_porter_balanced_vectors.tsv'
OUTPUT_METADATA_FILE = 'data/smos/smos_porter_balanced_metadata.tsv'

In [None]:
# Save embeddings to disk: one tab-separated vector per line in the vector
# file and the matching word on the same line of the metadata file.
# `with` closes both files even if writing fails part-way through.
with io.open(OUTPUT_VECTOR_FILE, 'w', encoding='utf-8') as out_vec, \
        io.open(OUTPUT_METADATA_FILE, 'w', encoding='utf-8') as out_meta:
    for index, word in enumerate(vocab):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        out_vec.write('\t'.join([str(x) for x in vec]) + '\n')
        out_meta.write(word + '\n')

In [None]:
''' OLD VERSION '''
# Earlier implementation of the label lookup, written for ENCODED_DATASETS
# being a dict of {dataset name: encoded metadocs}.
# NOTE(review): elsewhere in this notebook ENCODED_DATASETS is a single array,
# which has no .items(), so this cell would presumably raise if it were run.
# It would also overwrite ENCODED_LABELS with differently-keyed entries.
# Consider deleting the cell.
ENCODED_LABELS = {}
num_docs = 0
collision = False

for name, metadocs in ENCODED_DATASETS.items():
    # hash each metadoc we are using, and store the label associated with the hash
    # will use to retrieve labels after clustering
    print(name, metadocs.shape)
    for i,metadoc in enumerate(metadocs):
        num_docs += 1
        metadoc.flags.writeable = False
        # Stop at the first hash collision instead of overwriting a label
        if str(hash(bytes(metadoc.data))) in ENCODED_LABELS.keys():
            print('hash collision')
            collision = True
            break
        ENCODED_LABELS[str(hash(bytes(metadoc.data)))] = DATASETS[name][1][i]
        #print(DATASETS[name][1][i])
        #break
    if collision:
        print('exiting outer loop')
        break
# Documents visited vs. labels stored
print(num_docs, len(ENCODED_LABELS.keys()))