In [1]:
import collections
import math
import os
import sys
import argparse
import random
from tempfile import gettempdir
import zipfile
#Regex para expresiones
import glob
#Encodear palabras
import codecs
#Lenguaje Natural
import nltk
#Expresiones regulares
import re
#Remueve Acentos
import unidecode
import csv

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

from tensorflow.contrib.tensorboard.plugins import projector

from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from math import log
#Para hacer uso de Google Spreadsheets
import gspread
from oauth2client.service_account import ServiceAccountCredentials

  from ._conv import register_converters as _register_converters


In [2]:
archivos = sorted(glob.glob("Documents/Escuelas/Tec/Octavo\ Semestre/Ciencia\ Cognitiva/SuicideAi/*.txt"))
archivos

[]

In [3]:
corpus_raw = u""
for archivo in archivos:
    print("Estoy leyendo el archivo {0}".format(archivo))
    with codecs.open(archivo, "r", "utf-8") as archivo:
        corpus_raw += archivo.read()
    print("El corpus tiene {0} characteres".format(len(corpus_raw)))
    print()

In [4]:

tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
raw_sentences = tokenizer.tokenize(corpus_raw)

In [5]:
naming=[]
with open('Names.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        naming.append(row['Name'].lower())
names=set(naming)

In [6]:
# Limpia characteres, mantiene preguntas, acentos y ñ para terminos de contexto
toktok = ToktokTokenizer()
preposiciones = ["a", "ante", "bajo", "cabe", "con", "contra", "de", "desde", "durante", "nancys" ,"en", "entre", "hacia", "hasta", "mediante", "para", "por", "segun", "sin", "so", "sobre", "tras", "versus","via"]
articulos = ["el", "lo" ,"la", "alex" ,"los", "las", "esas","esa","es","que" ,"nos", "tan", "estas","ella","misma" ,"ello", "algo","cosa","pero", "como", "esta", "eres", "esas", "ha", "eh"]
symbols = re.compile(r'(\W+)', re.U)
stop = stopwords.words('spanish')

import re

contractions = re.compile(r"'|-|\"")
# single character removal
singles = re.compile(r'(\s\S\s)', re.I|re.U)
# separators (any whitespace)
seps = re.compile(r'\s+')

# cleaner (order matters)
def clean(text): 
    text = text.lower()
    text = contractions.sub('', text)
    text = singles.sub(' ', text)
    text = seps.sub(' ', text)
    return text

# sentence splitter
alteos = re.compile(r'([!\?])')
def sentences(l):
    l = alteos.sub(r' \1 .', l).rstrip("(\.)*\n")

def limpia_impureza(raw):
    texto = re.sub("[^a-zA-Zñáéíóúü]"," ",raw)
    texto = texto.lower()
    unaccented_string=unidecode.unidecode(clean(texto)).split()
    result = list(set(unaccented_string) - set(preposiciones)- set(articulos))
    important_words=[]
    for word in result:
        if word is 'no' or 'si' or word not in stop:
            if len(word) > 3 and word not in names:
                important_words.append(word)
        else:
            print(word)
    result=' '.join(important_words)
    return toktok.tokenize(result)

In [7]:
vocabulary = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        vocabulary.append(limpia_impureza(raw_sentence))

In [8]:
print('Data size', len(vocabulary))

Data size 0


In [9]:
def build_dataset(words, n_words):
  """Process raw inputs into a dataset."""
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(n_words - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    index = dictionary.get(word, 0)
    if index == 0:  # dictionary['UNK']
      unk_count += 1
    data.append(index)
  count[0][1] = unk_count
  reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  word_dictionary = dict(zip(dictionary.keys(), dictionary.values()))
  return data, count, dictionary, reversed_dictionary, word_dictionary

In [10]:
for vocab in vocabulary:
    for word in vocab:
        with open("build.txt", "a") as myfile:
            myfile.write(word+"\n")

def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words."""
    corpus_raw = u""
    with codecs.open(filename, "r", "utf-8") as archivo:
        corpus_raw += archivo.read()
    return corpus_raw.split()
            
vocabulario = read_data("build.txt")
print('Data size', len(vocabulario))

Data size 1083282


In [11]:
# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000

data, count, dictionary, reverse_dictionary, word_dictionary = build_dataset(
    vocabulario, vocabulary_size)
del vocabulario  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

data_index = 0

Most common words (+UNK) [['UNK', 0], ('cuando', 12025), ('todo', 9716), ('alguien', 9546), ('persona', 7289)]
Sample data [29, 743, 696, 103, 287, 1722, 1134, 2595, 27, 104] ['gente', 'dejes', 'jodido', 'menos', 'necesitas', 'hipocresia', 'sangre', 'alarmo', 'nadie', 'estaba']


In [12]:
# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin
  if data_index + span > len(data):
    data_index = 0
  buffer.extend(data[data_index:data_index + span])
  data_index += span
  for i in range(batch_size // num_skips):
    context_words = [w for w in range(span) if w != skip_window]
    words_to_use = random.sample(context_words, num_skips)
    for j, context_word in enumerate(words_to_use):
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[context_word]
    if data_index == len(data):
      buffer.extend(data[0:span])
      data_index = span
    else:
      buffer.append(data[data_index])
      data_index += 1
  # Backtrack a little bit to avoid skipping words in the end of a batch
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels

In [13]:
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
  print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],reverse_dictionary[labels[i, 0]])

743 dejes -> 696 jodido
743 dejes -> 29 gente
696 jodido -> 103 menos
696 jodido -> 743 dejes
103 menos -> 696 jodido
103 menos -> 287 necesitas
287 necesitas -> 103 menos
287 necesitas -> 1722 hipocresia


In [14]:
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.
num_sampled = 64  # Number of negative examples to sample.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. These 3 variables are used only for
# displaying model accuracy, they don't affect calculation.
valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

graph = tf.Graph()

with graph.as_default():

  # Input data.
  with tf.name_scope('inputs'):
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

  # Ops and variables pinned to the CPU because of missing GPU implementation
  with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    with tf.name_scope('embeddings'):
      embeddings = tf.Variable(
          tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
      embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the NCE loss
    with tf.name_scope('weights'):
      nce_weights = tf.Variable(
          tf.truncated_normal(
              [vocabulary_size, embedding_size],
              stddev=1.0 / math.sqrt(embedding_size)))
    with tf.name_scope('biases'):
      nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Compute the average NCE loss for the batch.
  # tf.nce_loss automatically draws a new sample of the negative labels each
  # time we evaluate the loss.
  # Explanation of the meaning of NCE loss:
  #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
  with tf.name_scope('loss'):
    loss = tf.reduce_mean(
        tf.nn.nce_loss(
            weights=nce_weights,
            biases=nce_biases,
            labels=train_labels,
            inputs=embed,
            num_sampled=num_sampled,
            num_classes=vocabulary_size))

  # Add the loss value as a scalar to summary.
  tf.summary.scalar('loss', loss)

  # Construct the SGD optimizer using a learning rate of 1.0.
  with tf.name_scope('optimizer'):
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

  # Compute the cosine similarity between minibatch examples and all embeddings.
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                            valid_dataset)
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)

  # Merge all summaries.
  merged = tf.summary.merge_all()

  # Add variable initializer.
  init = tf.global_variables_initializer()

  # Create a saver.
  saver = tf.train.Saver()

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [15]:
# Step 5: Begin training.
num_steps = 100001

# Give a folder path as an argument with '--log_dir' to save
# TensorBoard summaries. Default is a log folder in current directory.
current_path = os.path.dirname(os.path.realpath(sys.argv[0]))

parser = argparse.ArgumentParser()
parser.add_argument(
    '--log_dir',
    type=str,
    default=os.path.join(current_path, 'log'),
    help='The log directory for TensorBoard summaries.')
FLAGS, unparsed = parser.parse_known_args()

# Create the directory for TensorBoard variables if there is not.
if not os.path.exists(FLAGS.log_dir):
    os.makedirs(FLAGS.log_dir)

with tf.Session(graph=graph) as session:
    # Open a writer to write summaries.
    writer = tf.summary.FileWriter(FLAGS.log_dir, session.graph)

    # We must initialize all variables before we use them.
    init.run()
    print('Initialized')

    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips,skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # Define metadata variable.
        run_metadata = tf.RunMetadata()

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        # Also, evaluate the merged op to get all summaries from the returned "summary" variable.
        # Feed metadata variable to session for visualizing the graph in TensorBoard.
        _, summary, loss_val = session.run(
            [optimizer, merged, loss],
            feed_dict=feed_dict,
            run_metadata=run_metadata)
        average_loss += loss_val

        # Add returned summaries to writer in each step.
        writer.add_summary(summary, step)
        # Add metadata to visualize the graph for the last run.
        if step == (num_steps - 1):
            writer.add_run_metadata(run_metadata, 'step%d' % step)

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step ', step, ':', average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 7  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in xrange(top_k):
                    try:
                        close_word = reverse_dictionary[nearest[k]]
                        log_str = '%s %s,' % (log_str, close_word)
                    except KeyError:
                        close_word = reverse_dictionary[len(reverse_dictionary)-1]
                        log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
            
    final_embeddings = normalized_embeddings.eval()
        
    # Write corresponding labels for the embeddings.
    with open(FLAGS.log_dir + '/metadata.tsv', 'w') as f:
        for i in xrange(vocabulary_size):
            try:
                f.write(reverse_dictionary[i] + '\n')
            except KeyError:
                break

    # Save the model for checkpoints.
    saver.save(session, os.path.join(FLAGS.log_dir, 'model.ckpt'))

    # Create a configuration for visualizing embeddings with the labels in TensorBoard.
    config = projector.ProjectorConfig()
    embedding_conf = config.embeddings.add()
    embedding_conf.tensor_name = embeddings.name
    embedding_conf.metadata_path = os.path.join(FLAGS.log_dir, 'metadata.tsv')
    projector.visualize_embeddings(writer, config)
    
writer.close()

Initialized
Average loss at step  0 : 304.2824401855469
Nearest to dormir: cintas, pala, hagas, pala, pala, pala, pala,
Nearest to nada: soltar, pala, pala, pala, pala, pala, pala,
Nearest to mismo: pala, pala, pala, pala, pala, pala, pala,
Nearest to cansado: pala, pala, pala, pala, pala, pala, nuevas,
Nearest to estar: termino, fran, pala, pala, rendi, pala, pala,
Nearest to cosas: pala, pala, pala, pala, pala, pala, pala,
Nearest to bien: pala, pala, pala, suceder, habilidad, pala, pala,
Nearest to problemas: pala, pala, pala, pala, pala, cuello, presentas,
Nearest to momento: pala, pala, pala, coleccion, pala, pala, pala,
Nearest to porque: pala, pala, pala, pala, pala, pala, pala,
Nearest to verdad: pala, pala, pala, pala, pala, rajoy, idolos,
Nearest to nunca: plazos, jabon, pala, pala, pala, pala, queman,
Nearest to quiere: pala, pala, pala, arruinas, pala, pala, pala,
Nearest to hace: pala, pala, pala, sien, pala, pala, pala,
Nearest to estoy: pala, estaba, pala, pala, pala, pa

Average loss at step  52000 : 3.3270055504143237
Average loss at step  54000 : 1.8535230632126332
Average loss at step  56000 : 1.822568397641182
Average loss at step  58000 : 1.8002398593276738
Average loss at step  60000 : 1.784830445215106
Nearest to dormir: asesinar, acudir, guardarme, perfil, hagas, pasiva, minima,
Nearest to nada: pesadilla, soltar, inflarlo, zombi, cumplieran, madina, descargarme,
Nearest to mismo: anterior, ventaja, estresare, ojeras, pinte, reflexionar, fallan,
Nearest to cansado: atado, jugamos, eterna, febrero, loaf, curas, esperan,
Nearest to estar: termino, sonria, estupidez, internacional, desee, escenario, tuits,
Nearest to cosas: levantar, tardio, comenzado, coran, incapaz, consuela, simulacro,
Nearest to bien: miradas, verdadero, andas, digan, consumiendo, inversa, esperaron,
Nearest to problemas: insepulto, presentas, insistente, habilidad, desee, cuantos, juan,
Nearest to momento: suma, mantenia, millones, tropezarnos, leche, propongo, escucharme,
Ne

In [16]:

# Step 6: Visualize the embeddings.


# pylint: disable=missing-docstring
# Function to draw visualization of distance between embeddings.
def plot_with_labels(low_dim_embs, labels, filename):
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,xy=(x, y),xytext=(5, 2),textcoords='offset points',ha='right',va='bottom')
    plt.savefig(filename)


try:
    # pylint: disable=g-import-not-at-top
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt
    
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
    plot_only = 500
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
    labels = [reverse_dictionary[i] for i in xrange(plot_only)]
    plot_with_labels(low_dim_embs, labels, os.path.join(gettempdir(), 'tsne.png'))
    
except ImportError as ex:
    print('Please install sklearn, matplotlib, and scipy to show embeddings.')
    print(ex)

In [17]:
plot_only = 3
labels = [reverse_dictionary[i] for i in xrange(plot_only)]
print(labels)
print(final_embeddings[:plot_only, :])

['UNK', 'cuando', 'todo']
[[ 5.63721322e-02  1.37281328e-01  6.68083578e-02  3.10999006e-02
  -1.29610300e-02  1.30070299e-01  4.46993336e-02 -1.52785957e-01
  -2.01562028e-02  9.42291543e-02  1.48578599e-01 -8.40875730e-02
   4.40414511e-02  1.15703858e-01 -8.48140493e-02 -1.35610491e-01
  -1.35987177e-01 -1.25011221e-01 -3.91819291e-02 -1.37329429e-01
   1.43736387e-02 -1.08650148e-01  6.85817301e-02 -3.03956755e-02
  -6.04532892e-03 -8.99951905e-02  7.10171908e-02  7.72865415e-02
  -1.12918012e-01 -9.09323916e-02  9.37023908e-02 -1.33120254e-01
  -5.96815161e-02  2.24603880e-02 -2.61536539e-02 -3.39178480e-02
   1.22634321e-01 -1.76194124e-02  6.00935332e-02 -1.37340546e-01
   6.61149994e-02  6.07540384e-02  1.20361455e-01  7.77095705e-02
  -3.19932005e-04  1.49236068e-01 -7.87371024e-02 -1.13334864e-01
  -4.65366198e-03 -1.32439792e-01 -1.54189438e-01 -7.26279020e-02
   3.71317789e-02 -7.25763515e-02  6.45924360e-02  1.03671871e-01
   1.08564653e-01  5.08054998e-03 -2.54560225e-02 

In [32]:
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']

credentials = ServiceAccountCredentials.from_json_keyfile_name('Credenciales.json', scope)

gc = gspread.authorize(credentials)

sht1 = gc.open_by_url('https://docs.google.com/spreadsheets/d/1j5uZP0eo4a9FdhM9sKHxb3AVObnsOgB0BBz3lLzOlDI/edit#gid=0')

ws = sht1.get_worksheet(0)
ws2 = sht1.get_worksheet(1)

In [33]:
def normalize(arr):
    total = 0
    for j in arr:
        total += float(j)
    return total


import tweepy
consumer_key = "HSzv7R6stxA2MPYRgpaACZ5ef"
consumer_secret = "4EAMUKdFYV6xcugD67buBt75yVftYWZyNerxyWY8CI0Fbp2Y3e"
access_token = "432589386-UfuQTmjwQHGB7TKIZwEK1QB4vqU3zQFRnTvjoOsr"
access_token_secret = "jWgl243HcySFZ0z19zz3154IIUqU9YDzKqCJga9CKPhmn"

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
tt = 100
promSui = 0
results = api.user_timeline(id="MaruManriquez18", count=tt)
contador = 1
for tweet in results:
    contador=contador+1
    print(tweet.text)
    valid_examples = limpia_impureza(tweet.text)

    acum=0
    for j in valid_examples:
        try:
            i = int(dictionary[j])
            valid_word = reverse_dictionary[i]
            top_k = 7  # number of nearest neighbors
            nearest = (final_embeddings[i, :]).argsort()[1:top_k + 1]
            total = []
            for k in xrange(top_k):
                low_dim_embs = final_embeddings[nearest[k], :]
                promedio = 0
                for j in low_dim_embs:
                    promedio+=float(j)
                promedio = promedio/128
                total.append(promedio)
            norm = normalize(total)
            acum+=norm
            print(valid_word + " -> " + str(norm) + " -> "+str(1-norm))
        except KeyError:
            print(j + " -> " + str(0))
    ws.update_cell(contador, 1, tweet.text)
    nue = (acum/len(valid_examples))*100
    if(nue == 0.0):
        print("El porcentaje de suicidarte es de: 0")
        ws.update_cell(contador, 2, 0)
    else:
        val = y = ((1 / (1 + math.exp(-nue)))-0.5)*2
        promSui += val
        print("El porcentaje de suicidarte es de: %.2f" % val)
        ws.update_cell(contador, 2, val)
        
promSui /= tt
ws2.update_cell(2,1,promSui)

Mierda!, es que posta.. Me gustabas mucho ... Pero solo soy un cero a la izquierda...
gustabas -> 0
izquierda -> 0.009934629631231928 -> 0.9900653703687681
mucho -> 0.005601306948278761 -> 0.9943986930517212
posta -> 0.012234785302780438 -> 0.9877652146972196
cero -> 0.022558496170177023 -> 0.977441503829823
mierda -> 0.008338760218180141 -> 0.9916612397818199
El porcentaje de suicidarte es de: 0.45
No puedo seguir con esto.... No cuando, no siento justo querer a alguien, cuando no me quiero a mi misma...
siento -> 0.005646026751378486 -> 0.9943539732486215
puedo -> 0.020667304594098823 -> 0.9793326954059012
alguien -> 0.02365864252078609 -> 0.9763413574792139
cuando -> -0.01999558494981102 -> 1.019995584949811
esto -> -0.015585149779724361 -> 1.0155851497797244
seguir -> -0.0021855823524674634 -> 1.0021855823524675
querer -> 0.028522141997768813 -> 0.9714778580022312
quiero -> 0.0568789449880569 -> 0.9431210550119431
El porcentaje de suicidarte es de: 0.54
No se que pasa.... Solo teng

RT @eldiariodedross: Perturbadross https://t.co/icLei7XrAo
perturbadross -> 0
https -> 0.0004830377929110341 -> 0.999516962207089
xrao -> 0
iclei -> 0
eldiariodedross -> 0
El porcentaje de suicidarte es de: 0.00
💔🌻


ZeroDivisionError: division by zero