In [1]:
import os
import re
import time

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
from gensim.models import word2vec
from sklearn.model_selection import train_test_split
from torchmps import MPS

%matplotlib inline

In [2]:
# Read both AG-NEWS splits as single-column frames: with a tab separator and
# no header, each CSV line lands in column 0 (see the displayed head).
df1, df2 = [
    pd.read_csv(csv_path, sep="\t", header=None)
    for csv_path in ("./DataSet/AG-NEWS/train.csv", "./DataSet/AG-NEWS/test.csv")
]

# Stack train and test; the original row labels of each split are kept.
df = pd.concat([df1, df2])
df.head()

Unnamed: 0,0
0,"3,""Wall St. Bears Claw Back Into the Black (Re..."
1,"3,""Carlyle Looks Toward Commercial Aerospace (..."
2,"3,""Oil and Economy Cloud Stocks' Outlook (Reut..."
3,"3,""Iraq Halts Oil Exports from Main Southern P..."
4,"3,""Oil prices soar to all-time record, posing ..."


In [3]:
# Split every raw line on the `,"` delimiter and keep only columns 0 and 2
# (per the displayed output: the class id and the article text).
# NOTE(review): this cell rebinds `df`, so it cannot be re-run without
# re-running the loading cell first.
df = df[0].str.split(pat=',"', expand=True).drop(columns=[1, 3, 4, 5, 6])
df

Unnamed: 0,0,2
0,3,"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Reuters - Private investment firm Carlyle Grou...
2,3,Reuters - Soaring crude prices plus worries\ab...
3,3,Reuters - Authorities have halted oil export\f...
4,3,"AFP - Tearaway world oil prices, toppling reco..."
...,...,...
7595,1,Ukrainian presidential candidate Viktor Yushch...
7596,2,With the supply of attractive pitching options...
7597,2,Like Roger Clemens did almost exactly eight ye...
7598,3,SINGAPORE : Doctors in the United States have ...


In [4]:
# Number of rows per class id (column 0) in the combined train+test frame.
df[0].value_counts()

1    31900
3    31900
2    31900
4    31900
Name: 0, dtype: int64

In [5]:
# Pick a balanced subset: `sample` rows per class, fixed seed for repeatability.

sample = 5000

# Class ids are compared as strings because they come out of `str.split`.
# Rows whose text column (2) is missing are excluded before sampling.
df1, df2, df3, df4 = [
    df[(df[0] == class_id) & df[2].notnull()].sample(sample, random_state=1234)
    for class_id in ("1", "2", "3", "4")
]

df = pd.concat([df1, df2, df3, df4])

In [6]:
# Check the per-class row counts after subsampling.
df[0].value_counts()

4    5000
2    5000
1    5000
3    5000
Name: 0, dtype: int64

In [7]:
# Column 2 (the text kept after the split) as an array of raw documents.
corpus = df[2].values

In [8]:
# Turn the class-id column into a pandas categorical, in place on `df`.
df[0] = df[0].astype('category')

# Replace every categorical column by its integer codes and flatten the
# result into `label`, the target array used for the train/test split.
cat_columns = df.select_dtypes(['category']).columns
category_codes = df[cat_columns].apply(lambda column: column.cat.codes)
label = category_codes.values.squeeze()

# Preprocesamiento

In [10]:
# Define the tokenizer and the lemmatizer used by the preprocessing function
wpt = nltk.WordPunctTokenizer()
lem = nltk.WordNetLemmatizer()
# Load the English stopword list.
# NOTE(review): no nltk.download(...) call is visible in this notebook, so the
# NLTK corpora are presumably already installed locally -- confirm.
stop_words = nltk.corpus.stopwords.words('english')
# Preprocessing function applied to every raw document
def normalize_document(doc):
    """Clean, tokenize, and lemmatize a single document.

    Steps, in order: delete newlines, delete every character that is not an
    ASCII letter or ASCII whitespace, lowercase, strip, tokenize with `wpt`,
    drop tokens found in `stop_words`, lemmatize each token with `lem`.

    Args:
        doc: Raw document text.

    Returns:
        The remaining lemmatized tokens joined by single spaces. The result
        is an empty string when nothing survives the filtering.
    """
    # Remove special characters
    doc = re.sub(r'\n', '', doc)
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so the previous call never applied the flags and
    # capped the number of replacements at re.I|re.A == 258.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    # Lowercase and trim surrounding whitespace
    doc = doc.lower()
    doc = doc.strip()
    # Tokenize the document
    tokens = wpt.tokenize(doc)
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize
    tokens = [lem.lemmatize(token) for token in tokens]
    # Return the filtered text as one space-separated string
    doc = ' '.join(tokens)
    return doc
# Wrap the function with np.vectorize so it can be applied to a whole array
# of documents in one call.
normalize_corpus = np.vectorize(normalize_document)

In [11]:
# Normalize every document of the corpus.
norm_corpus = normalize_corpus(corpus)

# Representación word2vec

Los siguientes parámetros son utilizados por el modelo Word2vec para construir el modelo:

* feature_size: Determina la dimensión de los vectores de embedding.
* window_context: Es el número de palabras en el vecindario que constituye el contexto.
* min_word_count: Especifica el conteo mínimo de una palabra dentro del corpus para ser incluida dentro del vocabulario.
* sample: Este parámetro controla el submuestreo de palabras frecuentes dentro del algoritmo. Generalmente, los valores entre 0.01 y 0.0001 funcionan bien.

In [12]:
# Dimension of the embedding vectors
feature_size = 100 
# Number of neighbouring words that make up the context
window_context = 10
# Minimum corpus count for a word to enter the vocabulary
min_word_count = 1 
# Sub-sampling parameter passed to Word2Vec.
# NOTE(review): this rebinds `sample`, which the subsampling cell set to 5000.
sample = 1e-3

In [13]:
# Fresh tokenizer instance (rebinds the global `wpt`).
wpt = nltk.WordPunctTokenizer()

# One token list per normalized document.
tokenized_corpus = list(map(wpt.tokenize, norm_corpus))

In [14]:
# Train the Word2Vec model on the tokenized corpus.
w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size,
                              window=window_context, min_count=min_word_count,
                              sample=sample, iter=100)

# A document is represented by the mean of the embeddings of its tokens.
w2v_representations = []
for sentence in tokenized_corpus:
    if not sentence:
        # Some documents contained only stopwords or special characters, so
        # nothing is left to average: use a zero vector instead.
        w2v_representations.append(np.zeros(shape=(feature_size,)))
        continue
    try:
        w2v_representations.append(w2v_model.wv[sentence].mean(axis=0))
    except KeyError:
        # A token is missing from the vocabulary (possible once
        # min_word_count > 1). Fall back to the zero vector, as the previous
        # catch-all handler did, without hiding unrelated errors or
        # keyboard interrupts.
        w2v_representations.append(np.zeros(shape=(feature_size,)))

# Feature matrix with one row per document.
X = np.array(w2v_representations)

In [15]:
# Stratified hold-out split: 30% of the documents go to the test set and the
# class proportions of `label` are preserved in both parts.
holdout_fraction = 0.3
split_seed = 42
split_parts = train_test_split(X, label, test_size=holdout_fraction,
                               random_state=split_seed, stratify=label)
X_train, X_test, Y_train, Y_test = split_parts

(14000, 100) (14000,)
(6000, 100) (6000,)


# Transformar a tensor de torch

In [18]:
# Features become float32 tensors, labels become int64 (long) tensors, which
# is the target type the cross-entropy loss is fed with later on.
x_train = torch.tensor(X_train, dtype=torch.float32)
x_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(Y_train, dtype=torch.long)
y_test = torch.tensor(Y_test, dtype=torch.long)

# Crear conjunto de datos 

In [19]:
# Pair each feature tensor with its label tensor so the DataLoaders can
# yield (inputs, labels) batches.
train_set = torch.utils.data.TensorDataset(x_train, y_train)
test_set = torch.utils.data.TensorDataset(x_test, y_test)

# Aprendizaje

In [30]:
# Initialization: seed the torch RNG once, before any model is created
torch.manual_seed(0)

# MPS hyperparameter grid: every (bond, learn_rate) pair gets its own run
bond_dim = [8]
rate = [1e-6,1e-7]

# Training parameters (identical for every run, so set once)
num_train  = len(train_set.tensors[0])
num_test   = len(test_set.tensors[0])
batch_size = 50
num_epochs = 100
l2_reg     = 0.

# Full batches per split; consistent with drop_last=True in the loaders
num_batches = {name: total_num // batch_size for (name, total_num) in
               [('train', num_train), ('test', num_test)]}

# Create the results directory up front so a missing folder cannot discard
# the histories of a finished run.
os.makedirs('./Data', exist_ok=True)


# Feature map registered on every MPS instance
def feature_map(data):
    """Embed an input value `data` as the 2-component tensor [1.0, data]."""
    return torch.Tensor([1.0, data])


for bond in bond_dim:

    for learn_rate in rate:

        # Initialize the MPS module
        mps = MPS(input_dim=100, output_dim=4, bond_dim=bond, parallel_eval=True)
        mps.register_feature_map(feature_map)

        # Loss function and optimizer
        loss_fun = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(mps.parameters(), lr=learn_rate, weight_decay=l2_reg)

        samplers = {'train': torch.utils.data.SubsetRandomSampler(range(num_train)),
                    'test': torch.utils.data.SubsetRandomSampler(range(num_test))}

        loaders = {name: torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                   sampler=samplers[name], drop_last=True) for (name, dataset) in
                   [('train', train_set), ('test', test_set)]}

        # Per-epoch histories. NOTE: despite their names, err_train and
        # err_test store the mean batch accuracy, not an error rate.
        cfun = []
        cfun_test = []
        err_train = []
        err_test = []

        # Training
        for epoch_num in range(1, num_epochs+1):
            running_loss = 0.
            running_acc = 0.

            for inputs, labels in loaders['train']:  # fetch a batch
                inputs, labels = inputs.view([batch_size, 100]), labels.data

                # Run the MPS to get logit scores and predictions
                scores = mps(inputs)
                _, preds = torch.max(scores, 1)

                # Loss and accuracy of the batch. .item() keeps the running
                # totals as plain floats instead of tensors.
                loss = loss_fun(scores, labels)
                running_loss += loss.item()
                running_acc += torch.sum(preds == labels).item() / batch_size

                # Backpropagate and update the parameters
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            cfun.append(running_loss / num_batches['train'])
            err_train.append(running_acc / num_batches['train'])

            # Evaluate the MPS classifier on the test set
            test_loss = 0.
            test_acc = 0.
            with torch.no_grad():
                for inputs, labels in loaders['test']:
                    inputs, labels = inputs.view([batch_size, 100]), labels.data

                    scores = mps(inputs)
                    _, preds = torch.max(scores, 1)

                    test_loss += loss_fun(scores, labels).item()
                    test_acc += torch.sum(preds == labels).item() / batch_size

            # Average over the number of TEST batches (the previous code
            # divided the test loss by the number of training batches).
            cfun_test.append(test_loss / num_batches['test'])
            err_test.append(test_acc / num_batches['test'])

        # Write one value per line for each history of this run
        histories = {
            f'./Data/TEST-rate{learn_rate}-BOND{bond}AGNEWS.txt': err_test,
            f'./Data/TRAIN-rate{learn_rate}-BOND{bond}AGNEWS.txt': err_train,
            f'./Data/LOSS-rate{learn_rate}-BOND{bond}AGNEWS.txt': cfun,
            f'./Data/LOSS-TEST-rate{learn_rate}-BOND{bond}AGNEWS.txt': cfun_test,
        }
        for history_path, history in histories.items():
            with open(history_path, 'w') as f:
                f.writelines("%s\n" % item for item in history)
                
