In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import time
from numba import cuda # Biblioteca para usar CUDA
from numba import vectorize # Wrapper para compilar funciones python a C++

In [2]:
print(cuda.gpus)

<Managed Device 0>


In [3]:
pdata = pd.read_json('News_Category_Dataset_v2.json', lines=True)

In [4]:
pdata.category.unique()

array(['CRIME', 'ENTERTAINMENT', 'WORLD NEWS', 'IMPACT', 'POLITICS',
       'WEIRD NEWS', 'BLACK VOICES', 'WOMEN', 'COMEDY', 'QUEER VOICES',
       'SPORTS', 'BUSINESS', 'TRAVEL', 'MEDIA', 'TECH', 'RELIGION',
       'SCIENCE', 'LATINO VOICES', 'EDUCATION', 'COLLEGE', 'PARENTS',
       'ARTS & CULTURE', 'STYLE', 'GREEN', 'TASTE', 'HEALTHY LIVING',
       'THE WORLDPOST', 'GOOD NEWS', 'WORLDPOST', 'FIFTY', 'ARTS',
       'WELLNESS', 'PARENTING', 'HOME & LIVING', 'STYLE & BEAUTY',
       'DIVORCE', 'WEDDINGS', 'FOOD & DRINK', 'MONEY', 'ENVIRONMENT',
       'CULTURE & ARTS'], dtype=object)

In [5]:
print(pdata.iloc[0])

authors                                                Melissa Jeltsen
category                                                         CRIME
date                                               2018-05-26 00:00:00
headline             There Were 2 Mass Shootings In Texas Last Week...
link                 https://www.huffingtonpost.com/entry/texas-ama...
short_description    She left her husband. He killed their children...
Name: 0, dtype: object


In [6]:
def onlyText(string):
    size = len(string)
    aux = ""
    for i in range(size):
        if (not(string[i] >= '0' and string[i] <= '9')):
            aux += string[i]
    return aux

In [7]:
[size, items] = pdata.shape
size = 20000
corpus = []
full_corpus = []
category_array = []
for i in np.arange(size):
    value = pdata.iloc[i].category
    if (value == 'POLITICS' or value == 'ENTERTAINMENT' or value == 'WORLD NEWS'):        
        aux = pdata.iloc[i, 3] + " " + pdata.iloc[i, 5]
        full_corpus.append(aux)
        aux = onlyText(aux.lower())
        corpus.append(aux)
        category_array.append(value)
        

In [8]:
len(full_corpus)

11544

In [9]:
corpus[2]

"jim carrey blasts 'castrato' adam schiff and democrats in new artwork the actor gives dems an ass-kicking for not fighting hard enough against donald trump."

In [10]:
corpus[0:3]

["will smith joins diplo and nicky jam for the  world cup's official song of course it has a song.",
 'hugh grant marries for the first time at age  the actor and his longtime girlfriend anna eberstein tied the knot in a civil ceremony.',
 "jim carrey blasts 'castrato' adam schiff and democrats in new artwork the actor gives dems an ass-kicking for not fighting hard enough against donald trump."]

In [11]:
len(corpus)

11544

In [12]:
# Quitar mayusculas y números de los textos.
# Realizar un filtro con 3 categorías y etiquetar los headers
# vectorizar las weas [Utilizar distancias coseno]
# guardar malla en un archivo (quien sabe de  que wea xD)
# El corpus es un diccionario de palabas únicas en todo el conjunto de texto

In [13]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

In [14]:
# MinMaxScaler
mms = MinMaxScaler()
XX = mms.fit_transform(X.toarray())
XX = XX.astype(np.float32)



In [15]:
XX.shape, XX.dtype

((11544, 19872), dtype('float32'))

In [16]:
#Definición de constantes
WIDTH = 20
DEEP = XX.shape[1]
NEWS_NUM = XX.shape[0]

In [17]:
W = np.random.uniform(low=0, high=1, size=WIDTH * WIDTH * DEEP).reshape(WIDTH, WIDTH, DEEP)
W = W.astype(np.float32)

In [18]:
def ganador (W, X):
    Wreshape = W.reshape(W.shape[0] * W.shape[1], W.shape[2])
    vstack = np.vstack([Wreshape, X])
    similarity = cosine_similarity(vstack)
    last_vector = similarity[similarity.shape[0]-1,:]
    sim_vector = last_vector[0:last_vector.shape[0]-2]
    indx = np.argmax(sim_vector)
    max_sim = sim_vector[indx]
    ii = int(indx/W.shape[0])
    jj = indx%W.shape[0]
    return ii, jj, max_sim
            

In [19]:
ganador(W, XX[0])

(0, 4, 0.03234138)

In [33]:
W.shape

(20, 20, 19872)

In [89]:
# Funciones para sumar y restar vectores en CUDA
@cuda.jit
def add_kernel(x, y, out):
    tx = cuda.threadIdx.x # Bloque 1D para el ID único del hilo
    ty = cuda.blockIdx.x # Malla 1D para el ID único del bloque

    block_size = cuda.blockDim.x # Número de hilos por bloque
    grid_size = cuda.gridDim.x # Número de bloques en la malla
    
    start = tx + ty * block_size
    stride = block_size * grid_size

    # Paralelización de operaciones. El wrapper se encarga del cuda.memalloc y el cuda.free
    # Para utilizar memoria compartida se debe especificar el flujo de la memoria entre CPU y GPU
    # Manejo de memoria:
    # x_device = cuda.to_device(x)
    # y_device = cuda.to_device(y)
    for i in range(start, x.shape[0], stride):
        out[i] = x[i] + y[i]

@cuda.jit()
def training_kernel(W, x, width, manhatan, n, out):
    startX = cuda.blockDim.x * cuda.blockIdx.x + cuda.threadIdx.x
    startY = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y
    gridX = cuda.gridDim.x * cuda.blockDim.x;
    gridY = cuda.gridDim.y * cuda.blockDim.y;
    
    for i in range(startX, width, gridX):
        for j in range(startY, width, gridY):
            k = W[i,j] + (n * 1.0/(1+manhatan) * (x - W[i,j]))
            out[i, j] = k
        

In [93]:
def entrenamiento(W, width, ii, jj, x, neighbour=3, n=0.5):
    threads_per_block = 128
    blocks_per_grid = 30
    for i in range(ii-neighbour, ii+neighbour+1):
        for j in range(jj-neighbour, jj+neighbour+1):
            if( i >= 0 and i < width and j >= 0 and j < width):
                manhatan = abs(ii-i) + abs(jj-j)
                W_out = np.empty_like(W)
                training_kernel[blocks_per_grid, threads_per_block](W[i,j].tolist(), x.tolist(), width, manhatan, n, W_out.tolist())
                W = W_out
                
    return W

In [94]:
# Función lista de listas
def listalista(width):
    l = []
    for i in range(width):
        tmp = []
        for j in range(width):
            tmp.append([])
        l.append(tmp)
    return l

In [95]:
#np.vstack(W.reshape(width*width, numDims), X))
#Usar la ultima posicion de cosine
#modulo da la columna, div entera renglon
XX[1].dtype

dtype('float32')

In [96]:
# Entrenamiento
RESHUFFLE_ITERATIONS = 100
n = 0.5
t = time.time()
print("Iteraciones de entrenamiento totales = " + str(RESHUFFLE_ITERATIONS * NEWS_NUM))
print("Inicio de entrenamiento")
for i in range(RESHUFFLE_ITERATIONS):
    indx = np.arange(NEWS_NUM)
    np.random.shuffle(indx) 
    for j in range(NEWS_NUM):
        ii, jj, winner = ganador(W, XX[indx[j]])
        W = entrenamiento(W, WIDTH, ii, jj, XX[indx[j]], 5, n)
    time_elapsed = time.time() - t
    n *= 0.9
    print('Reshuffle iteracion #{0}.\tTiempo de ejecución: {1:d} minutos y {2:.2f} segundos'.format(i + 1, int(time_elapsed/60), time_elapsed%60))
time_elapsed = time.time() - t
print('Tiempo total de entrenamiento: {0:d} minutos y {1:.2f} segundos'.format(int(time_elapsed/60), time_elapsed%60))

Iteraciones de entrenamiento totales = 1154400
Inicio de entrenamiento


TypingError: Failed in nopython mode pipeline (step: nopython frontend)
[1m[1mInvalid use of Function(<built-in function getitem>) with argument(s) of type(s): (reflected list(float64), tuple(int64 x 2))
 * parameterized
[1mIn definition 0:[0m
[1m    All templates rejected with literals.[0m
[1mIn definition 1:[0m
[1m    All templates rejected without literals.[0m
[1mIn definition 2:[0m
[1m    All templates rejected with literals.[0m
[1mIn definition 3:[0m
[1m    All templates rejected without literals.[0m
[1mIn definition 4:[0m
[1m    All templates rejected with literals.[0m
[1mIn definition 5:[0m
[1m    All templates rejected without literals.[0m
[1mThis error is usually caused by passing an argument of a type that is unsupported by the named function.[0m[0m
[0m[1m[1] During: typing of intrinsic-call at <ipython-input-89-7cfb4484fb11> (30)[0m
[1m
File "<ipython-input-89-7cfb4484fb11>", line 30:[0m
[1mdef training_kernel(W, x, width, manhatan, n, out):
    <source elided>
        for j in range(startY, width, gridY):
[1m            out[i, j] = W[i,j] + (n * 1.0/(1+manhatan) * (x - W[i,j]))
[0m            [1m^[0m[0m


In [None]:
# instancia de la lista de listas
ll = listalista(WIDTH)

In [None]:
# Resultados

for i in np.arange(len(corpus)):
    ii, jj, winner = ganador(W, XX[i])
    ll[ii][jj].append(i)

In [None]:
for i in np.arange(len(ll)):
    print(ll[i])

In [None]:
ii = 0
jj = 2
for i in np.arange(len(ll[ii][jj])):
    #print(ll[0][1][i])
    indx = ll[ii][jj][i]
    print(category_array[indx] + " " + full_corpus[indx] + "\n")