# Clasificación de Texto con MPS

In [1]:
import time
from torchmps import MPS
import numpy as np
import pandas as pd
import re
import nltk
from gensim.models import word2vec
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

# Carga de datos

In [2]:
# Read the train and test CSV files (neither has a header row) and stack
# them into a single DataFrame; the last expression previews the first rows.
df1, df2 = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ("./DataSet/dbpedia_csv/train.csv",
                     "./DataSet/dbpedia_csv/test.csv")
)

df = pd.concat([df1, df2])
df.head()

Unnamed: 0,0,1,2
0,1,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a Br...
1,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...
2,1,Q-workshop,Q-workshop is a Polish company located in Poz...
3,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...
4,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...


In [3]:
# Number of rows per value of column 0 (the class label) in the combined data
df[0].value_counts()

14    45000
13    45000
12    45000
11    45000
10    45000
9     45000
8     45000
7     45000
6     45000
5     45000
4     45000
3     45000
2     45000
1     45000
Name: 0, dtype: int64

In [4]:
# Choose a class-balanced subset

sample = 3000  # number of rows drawn from every class

# Keep only the rows where both column 1 and column 2 are present.
complete_rows = df[df[1].notnull() & df[2].notnull()]

# Draw `sample` rows per class label (column 0) with a fixed seed, iterating
# over the labels actually present in ascending order, and stack the
# per-class samples. `DataFrame.sample` raises ValueError if a class has
# fewer than `sample` complete rows.
df = pd.concat([
    complete_rows[complete_rows[0] == class_id].sample(sample, random_state=1234)
    for class_id in sorted(complete_rows[0].unique())
])

In [5]:
# Number of rows per value of column 0 (the class label) after subsampling
df[0].value_counts()

14    3000
13    3000
12    3000
11    3000
10    3000
9     3000
8     3000
7     3000
6     3000
5     3000
4     3000
3     3000
2     3000
1     3000
Name: 0, dtype: int64

In [6]:
# Column 2 of the sampled rows as a NumPy array; this is the text corpus
# that gets normalized and tokenized further down.
corpus = df[2].values

In [7]:
# Convert the label column to a categorical dtype so that every distinct
# label is assigned an integer code.
df[0] = df[0].astype('category')
cat_columns = df.select_dtypes(include=['category']).columns
# Replace each categorical column by its codes, then drop the length-1 axis
# to obtain one integer label per row.
label = np.squeeze(df[cat_columns].apply(lambda column: column.cat.codes).values)

In [8]:
# Define the tokenizer and the lemmatizer
wpt = nltk.WordPunctTokenizer()
lem = nltk.WordNetLemmatizer()
# Load the English stopword list. Nothing is downloaded here: the NLTK
# 'stopwords' corpus has to be available already.
stop_words = nltk.corpus.stopwords.words('english')
# Preprocessing function applied to every document
def normalize_document(doc: str) -> str:
    """Return a cleaned, lemmatized version of *doc*.

    Steps: remove newlines, remove every character that is not an ASCII
    letter or whitespace, lower-case and strip, tokenize with ``wpt``, drop
    tokens listed in ``stop_words``, lemmatize the rest with ``lem`` and
    join the tokens with single spaces.

    Args:
        doc: Raw document text.

    Returns:
        The normalized text; an empty string when no token survives.
    """
    # Remove special characters
    doc = re.sub(r'\n', '', doc)
    # The flags must be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so passing them positionally capped the number of
    # substitutions at 258 and never applied the flags.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    # Lower-case the text and trim surrounding whitespace
    doc = doc.lower().strip()
    # Tokenize, drop stopwords and lemmatize what is left
    tokens = [
        lem.lemmatize(token)
        for token in wpt.tokenize(doc)
        if token not in stop_words
    ]
    # Return the filtered text as a single string
    return ' '.join(tokens)
# Wrap the function with np.vectorize so it can be called on a whole array
# of documents and applied to each element.
normalize_corpus = np.vectorize(normalize_document)

In [9]:
# Normalize every document of the corpus
norm_corpus = normalize_corpus(corpus)

In [10]:
# Hyper-parameters handed to the Word2Vec constructor further down
feature_size = 100  # passed as `size`; also the length of the zero vector used for empty documents
window_context = 10  # passed as `window`
min_word_count = 1  # passed as `min_count`
sample = 1e-3  # passed as `sample`; NOTE: rebinds the name used earlier for the per-class row count

In [11]:
# Re-create the tokenizer and split every normalized document into a list
# of tokens.
wpt = nltk.WordPunctTokenizer()

tokenized_corpus = list(map(wpt.tokenize, norm_corpus))

In [12]:
# Train Word2Vec on the tokenized corpus.
# NOTE(review): `size` and `iter` look like the gensim 3.x keyword names;
# confirm the installed gensim version before upgrading.
w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size,
                              window=window_context, min_count=min_word_count,
                              sample=sample, iter=100)

w2v_representations = []
for sentence in tokenized_corpus:
    try:
        # A document is represented by the mean of the vectors of its terms.
        doc_vector = w2v_model.wv[sentence].mean(axis=0)
    except (KeyError, ValueError):
        # Some documents contained only stopwords or special characters and
        # are empty after normalization (stacking zero vectors raises
        # ValueError); a token missing from the vocabulary raises KeyError.
        # With no vector representation available, use a vector of zeros.
        # Only these two errors are caught so that real failures and
        # interrupts are no longer silently swallowed.
        doc_vector = np.zeros(shape=(feature_size,))
    w2v_representations.append(doc_vector)

# One row per document, one column per embedding dimension
X = np.array(w2v_representations)

In [13]:
# Stratified 70/30 split of the document vectors and their labels, with a
# fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    label,
    test_size=0.3,
    random_state=42,
    stratify=label,
)

# Show the shapes of the features and labels of each split
print(*(part.shape for part in (X_train, Y_train)))
print(*(part.shape for part in (X_test, Y_test)))

(29400, 100) (29400,)
(12600, 100) (12600,)


In [14]:
# Convert the NumPy splits to tensors with explicit dtypes: float32 features
# and int64 class indices (the target type CrossEntropyLoss expects).
# torch.tensor with an explicit dtype replaces the legacy torch.Tensor(...)
# constructor, which sent the integer labels through float32 before the
# cast to long.
x_train = torch.tensor(X_train, dtype=torch.float32)
x_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(Y_train, dtype=torch.long)
y_test = torch.tensor(Y_test, dtype=torch.long)

In [15]:
# Pair each feature tensor with its label tensor in a TensorDataset
train_set, test_set = (
    torch.utils.data.TensorDataset(features, targets)
    for features, targets in ((x_train, y_train), (x_test, y_test))
)

In [21]:
import os

# Initialisation: seed the RNG once for the whole sweep.
# NOTE(review): the seed is not reset between configurations, so each
# learning rate starts from a different RNG state -- confirm this is intended.
torch.manual_seed(0)

# MPS hyper-parameters to sweep over
bond_dim = [8]
rate = [1e-6, 1e-7]

# Create the results directory up front so that a missing folder cannot
# discard a finished training run when the metrics are written.
os.makedirs('./Data', exist_ok=True)


# Feature map registered on the MPS: returns the 2-component vector (1, data).
# Presumably torchmps calls it once per scalar input feature -- TODO confirm.
def feature_map(data):
    return torch.Tensor([1.0, data])


for bond in bond_dim:

    for learn_rate in rate:

        # Training parameters
        num_train = len(train_set.tensors[0])
        num_test = len(test_set.tensors[0])
        batch_size = 50
        num_epochs = 100
        l2_reg = 0.

        # Initialise the MPS module and attach the feature map
        mps = MPS(input_dim=100, output_dim=14, bond_dim=bond, parallel_eval=True)
        mps.register_feature_map(feature_map)

        # Loss function and optimizer
        loss_fun = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(mps.parameters(), lr=learn_rate, weight_decay=l2_reg)

        samplers = {'train': torch.utils.data.SubsetRandomSampler(range(num_train)),
                    'test': torch.utils.data.SubsetRandomSampler(range(num_test))}

        # drop_last=True keeps every batch at exactly batch_size rows, which
        # the fixed-size .view() calls below rely on.
        loaders = {name: torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                   sampler=samplers[name], drop_last=True) for (name, dataset) in
                   [('train', train_set), ('test', test_set)]}

        # Number of full batches per epoch for each split
        num_batches = {name: total_num // batch_size for (name, total_num) in
                       [('train', num_train), ('test', num_test)]}

        # Per-epoch histories. Despite their names, err_train / err_test hold
        # the mean batch accuracy, not an error rate.
        cfun = []
        cfun_test = []
        err_train = []
        err_test = []

        # Training
        for epoch_num in range(1, num_epochs + 1):
            running_loss = 0.
            running_acc = 0.

            for inputs, labels in loaders['train']:  # fetch a batch
                inputs = inputs.view([batch_size, 100])

                # Run the MPS to get the logit scores and the predictions
                scores = mps(inputs)
                _, preds = torch.max(scores, 1)

                # Compute the loss and accuracy and add them to the running
                # totals as plain floats, so no tensors are accumulated.
                loss = loss_fun(scores, labels)
                running_loss += loss.item()
                running_acc += torch.sum(preds == labels).item() / batch_size

                # Backpropagate and update the parameters
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            cfun.append(running_loss / num_batches['train'])
            err_train.append(running_acc / num_batches['train'])

            # Evaluate the MPS classifier on the test set
            with torch.no_grad():
                running_loss = 0.
                running_acc = 0.

                for inputs, labels in loaders['test']:
                    inputs = inputs.view([batch_size, 100])

                    # Run the MPS to get the logit scores and the predictions
                    scores = mps(inputs)
                    _, preds = torch.max(scores, 1)

                    running_loss += loss_fun(scores, labels).item()
                    running_acc += torch.sum(preds == labels).item() / batch_size

            # Average over the number of TEST batches (the train batch count
            # was used here before, which under-reported the test loss).
            cfun_test.append(running_loss / num_batches['test'])
            err_test.append(running_acc / num_batches['test'])

        # Write every history to its own text file, one value per line
        histories = (('TEST', err_test), ('TRAIN', err_train),
                     ('LOSS', cfun), ('LOSS-TEST', cfun_test))
        for prefix, values in histories:
            with open(f'./Data/{prefix}-rate{learn_rate}-BOND{bond}DBPEDIA.txt', 'w') as f:
                for item in values:
                    f.write("%s\n" % item)
                
