In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import string
import itertools
from collections import Counter
import seaborn as sns
import papermill as pm

from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score
from tqdm import tqdm

# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*" and
# newer releases no longer accept the old names, so use the new name when the
# installed Matplotlib provides it and fall back to the legacy one otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
%matplotlib inline

In [2]:
# Load the train and test tables from CSV files (relative paths).
train = pd.read_csv('../../data/train_groups.csv')
test  = pd.read_csv('../../data/test_groups.csv')


In [24]:
# Read the raw contents of files 1.dat .. 28024.dat from ../../data/new_content/.
# A file that cannot be read is replaced by the "ERROR" placeholder rather than
# skipped, so position k of `text` always corresponds to file number k + 1.
text = []
for i in range(1, 28025):
    filename = '../../data/new_content/' + str(i) + '.dat'
    try:
        with open(filename) as f:
            text.append(f.read())
    except (OSError, UnicodeDecodeError):
        # Only missing, unreadable or undecodable files fall back to the
        # placeholder; KeyboardInterrupt and genuine programming errors are no
        # longer silently swallowed as they were by the bare `except:`.
        text.append("ERROR")

# Текстовые признаки

### text to dict

In [10]:
def tokenize_doc(doc):
    """
        Split a document into lowercase tokens after deleting every character
        listed in string.punctuation
        doc: string

        return list of strings
    """
    # translation table that maps each punctuation character to "delete"
    strip_punct = str.maketrans('', '', string.punctuation)
    cleaned = doc.translate(strip_punct)

    # split() without arguments splits on any run of whitespace and never
    # produces empty tokens
    return cleaned.lower().split()

In [11]:
def build_vocab(docs, min_count=None):
    """
        Build the vocabulary mapping (that is, the correspondence between a token and its numeric id)
        docs: an iterable of tokenized documents (each one an iterable of tokens)
        min_count (optional): int, discard tokens that appear fewer than min_count times
                              over all documents; None keeps every token

        return dictionary str -> int; ids follow the sorted order of the kept tokens
    """
    # Count tokens one document at a time. Concatenating all documents with
    # sum(docs, []) copies an ever-growing list for every document, which is
    # quadratic in the size of the corpus.
    counts = Counter()
    for doc in docs:
        counts.update(doc)

    if min_count is None:
        kept = counts.keys()
    else:
        kept = (token for token, freq in counts.items() if freq >= min_count)

    # Sorting makes the token -> id assignment deterministic.
    return {token: idx for idx, token in enumerate(sorted(kept))}


In [12]:
def doc_to_multihot(doc, vocab):
    """
        Encode a tokenized document as a multi-hot vector over the vocabulary
        doc: iterable of tokens (a tokenized document)
        vocab: dict, vocabulary mapping token -> index

        return np.array, shape=(|V|,); entry vocab[t] is 1 for every token t of
        doc that is in vocab, all other entries are 0
    """
    vector = np.zeros(len(vocab))

    # tokens missing from the vocabulary are ignored
    known_tokens = (token for token in doc if token in vocab)
    for token in known_tokens:
        vector[vocab[token]] = 1

    return vector

In [13]:
def labels_to_onehot(labels):
    """
        Convert class labels to a one-hot representation
        labels: np.array (or sequence) of labels, shape=(N,)

        return np.array of floats, shape=(N, k), where k is the number of
        distinct labels and column j stands for the j-th smallest distinct label
    """
    labels = np.asarray(labels)
    n_samples = len(labels)

    # Map every label to its rank among the distinct labels. Indexing columns
    # by the raw label value only works when the labels are exactly 0..k-1 and
    # raises IndexError otherwise (e.g. for [0, 2]); for 0..k-1 labels the rank
    # equals the label, so the result is unchanged in that case.
    classes, columns = np.unique(labels, return_inverse=True)

    # create a matrix of zeros of shape (n_samples, n_classes)
    one_hot = np.zeros((n_samples, len(classes)))
    # fill one-hot values
    one_hot[np.arange(n_samples), columns] = 1

    return one_hot

In [21]:
# Tokenize every loaded document.
tokenize_x = list(map(tokenize_doc, text))
# Vocabulary restricted to tokens that occur at least 10 times over all documents.
x_vocab = build_vocab(tokenize_x, min_count=10)
# One multi-hot vector per document, in the same order as `text`.
docs_toy_multi_hot = [doc_to_multihot(tokens, x_vocab) for tokens in tokenize_x]

In [22]:
# Display the list of per-document multi-hot vectors.
docs_toy_multi_hot

[array([1., 0., 0., ..., 0., 1., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([1., 1., 0., ..., 1., 1., 1.]),
 array([0., 1., 0., ..., 0., 0., 1.]),
 array([1., 0., 0., ..., 0., 1., 1.]),
 array([1., 1., 0., ..., 0., 0., 0.]),
 array([1., 0., 0., ..., 0., 1., 0.]),
 array([0., 1., 0., ..., 0., 0., 1.]),
 array([1., 0., 0., ..., 0., 0., 0.]),
 array([1., 1., 0., ..., 0., 0., 1.]),
 array([1., 0., 0., ..., 0., 0., 0.]),
 array([1., 1., 0., ..., 0., 0., 0.]),
 array([1., 0., 0., ..., 0., 0., 0.]),
 array([1., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([1., 1., 1., ..., 1., 0., 0.]),
 array([1., 0., 0., ..., 0., 0., 0.]),
 array([1., 1., 0., ..., 0., 0., 1.]),
 array([1., 0., 0., ..., 0., 0., 1.])]

In [27]:
# Write the multi-hot vectors to disk as a text matrix, one row per document.
# NOTE(review): np.savetxt's default delimiter is a single space, so the file is
# space-separated despite its .csv name — confirm what downstream readers expect.
np.savetxt('multi_hot.csv', docs_toy_multi_hot)