In [1]:
import black
import fasttext
import jupyter_black
import json
import keras
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import pandas as pd
import seaborn as sns
import tensorflow as tf
import random
from collections import Counter
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import (
    Bidirectional,
    Dense,
    Dropout,
    Embedding,
    LSTM,
    GRU
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import f1_score
from sklearn.model_selection import (
    cross_val_score,
    train_test_split,
    StratifiedKFold,
)
import xml.etree.ElementTree as ET

# Setting options
pd.options.mode.chained_assignment = None  # default='warn'
nltk.download("stopwords")
nltk.download("punkt")
#black for linting reasons
jupyter_black.load(
    lab=False,
    line_length=80,
    verbosity="DEBUG",
    target_version=black.TargetVersion.PY310,
)
pd.set_option('display.max_colwidth', None)
print("All good!")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Cacu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Cacu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
DEBUG:jupyter_black:config: {'line_length': 80, 'target_versions': {<TargetVersion.PY310: 10>}}


<IPython.core.display.Javascript object>

All good!


In [None]:
csv_file_path = r"C:\Users\Cacu\Desktop\Universidad\Trabajo_Final\DataSets\open-dataset-for-sentiment-analysis-master\betsentiment-ES-tweets-sentiment-worldcup.csv"


# Function to extract sentiment scores from the json str
def extract_sentiment_scores(json_str):
    sentiment_data = json.loads(json_str)
    return (
        sentiment_data["Neutral"],
        sentiment_data["Negative"],
        sentiment_data["Positive"],
        sentiment_data["Mixed"],
    )


# Define data types
dtype_dict = {
    "tweet_date_created": str,
    "tweet_id": int,
    "tweet_text": str,
    "language": str,
    "sentiment": str,
}

try:
    df = pd.read_csv(csv_file_path, encoding="utf-8", dtype=dtype_dict)
except UnicodeDecodeError:
    df = pd.read_csv(csv_file_path, encoding="latin-1", dtype=dtype_dict)

sentiment_scores_list = df["sentiment_score"].map(extract_sentiment_scores)

sentiment_scores_df = pd.DataFrame(
    sentiment_scores_list.tolist(),
    columns=["Neutral", "Negative", "Positive", "Mixed"],
)

df = pd.concat([df, sentiment_scores_df], axis=1)

df.drop("sentiment_score", axis=1, inplace=True)

In [None]:
df

In [None]:
# Read the stopwords from the .txt file and store them in a list
custom_stopwords_file = (
    r"C:\Users\Cacu\Desktop\Universidad\Trabajo_Final\utilities\stopwords.txt"
)
with open(custom_stopwords_file, "r", encoding="utf-8") as file:
    custom_stopwords_list = [line.strip() for line in file]

# Create an empty set to hold the stopwords
custom_stopwords_set = set()

# Add the stopwords from the list to the set
custom_stopwords_set.update(custom_stopwords_list)

In [None]:
# Specify the Snowball stemmer for Spanish
stemmer = SnowballStemmer("spanish")

# 01 - Convert NaN values to an empty string
df["tweet_text"] = df["tweet_text"].fillna("")

# 02 - Lowercasing
df["tweet_text"] = df["tweet_text"].str.lower()

# 03 - Removing punctuation
df["tweet_text"] = df["tweet_text"].str.replace("[^\w\s]", "")

# 04 - Tokenization
df["tokens"] = df["tweet_text"].apply(word_tokenize)

# 05 - Removing stop words
stop_words = set(stopwords.words("spanish"))
df["filtered_tokens"] = df["tokens"].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)

# 06 - Stemming
df["stemmed_tokens"] = df["filtered_tokens"].apply(
    lambda tokens: [stemmer.stem(word) for word in tokens]
)

In [None]:
df

## 1. Embedding

fastText es una biblioteca para el aprendizaje de incrustaciones de palabras y clasificación de texto creada por el laboratorio de Investigación de Inteligencia Artificial de Facebook (FAIR). El modelo es un algoritmo de aprendizaje no supervisado para obtener representaciones vectoriales de palabras. Facebook pone a disposición modelos preentrenados para 294 idiomas. fastText utiliza una red neuronal para la incrustación de palabras [Fuente: Wikipedia].

Documentación sobre Gensim: models.fasttext

FastText es una extensión de Word2Vec propuesta por Facebook en 2016. En lugar de alimentar palabras individuales en la Red Neuronal, FastText divide las palabras en varios n-gramas (subpalabras). Por ejemplo, los trigramas para la palabra "manzana" son "man," "anz," y "nza" (ignorando los límites de inicio y fin de las palabras). El vector de incrustación de la palabra "manzana" será la suma de todos estos n-gramas. Después de entrenar la Red Neuronal, tendremos incrustaciones de palabras para todos los n-gramas dados el conjunto de datos de entrenamiento. Las palabras raras ahora pueden representarse adecuadamente, ya que es muy probable que algunos de sus n-gramas también aparezcan en otras palabras. Te mostraré cómo usar FastText con Gensim en la siguiente sección.

In [None]:
df["tokens_as_string"] = df["tokens"].apply(" ".join)
# Save the stemmed tokens to a text file (one sentence per line)
with open("tokens.txt", "w") as f:
    f.write("\n".join(df["tokens_as_string"]))

In [None]:
# Fasttext model on stemmed tokens
model = fasttext.train_unsupervised("tokens.txt", model="skipgram")

# Create tweet-level embeddings using the trained model
embeddings = []

In [None]:
for tokens in df["tokens_as_string"]:
    vector = model.get_sentence_vector(tokens)
    embeddings.append(vector)

# Convert the embeddings to a DataFrame
embedding_df = pd.DataFrame(embeddings)

In [None]:
# embeddings df with the original dataset
df_worldcup_embeddings = pd.concat([df, embedding_df], axis=1)

# embedding df to csv
df_worldcup_embeddings.to_csv("2018_dataset_with_embeddings.csv", index=False)

model.save_model("model_worldcup_embedding.bin")

In [None]:
df_worldcup_embeddings

In [82]:
nrows_to_load = 350000  # Adjust this to the desired subset size

# start from here - csv already generated
df = pd.read_csv("2018_dataset_with_embeddings.csv", nrows=nrows_to_load)
# shuffle the DataFrame rows
df = df.sample(frac=1)

## Preparing for training

In [83]:
df.count()

tweet_date_created    350000
tweet_id              350000
tweet_text            350000
language              350000
sentiment             350000
                       ...  
95                    350000
96                    350000
97                    350000
98                    350000
99                    350000
Length: 113, dtype: int64

In [85]:
label_mapping = {"POSITIVE": 2, "NEUTRAL": 1, "NEGATIVE": 0}
df["sentiment_encoded"] = df["sentiment"].map(label_mapping)

In [87]:
print(df.isnull().sum())

# Remove rows with missing values
df.dropna(inplace=True)

# Remove duplicates
df.drop_duplicates(subset=["tweet_text"], keep="first", inplace=True)

tweet_date_created       0
tweet_id                 0
tweet_text               0
language                 0
sentiment                0
                      ... 
96                       0
97                       0
98                       0
99                       0
sentiment_encoded     2132
Length: 114, dtype: int64


In [88]:
df

Unnamed: 0,tweet_date_created,tweet_id,tweet_text,language,sentiment,Neutral,Negative,Positive,Mixed,tokens,filtered_tokens,stemmed_tokens,tokens_as_string,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,sentiment_encoded
60151,2018-06-28T16:12:59.384000,1784107008,másunidosquenunca conlafeintacta vamoscolombia conlatricolorpuesta mundialrusia2018 vamosaganar nonosvamosarendir gracias colombia gracias x vencer el socialismo xxi gracias farid mondragon y gracias por avanzar estamosmelospaoctavos\nmlagder pilica7 mhoyos333 httpstcov2uudmircq,es,POSITIVE,0.350060,0.000235,0.644539,0.005166,"['másunidosquenunca', 'conlafeintacta', 'vamoscolombia', 'conlatricolorpuesta', 'mundialrusia2018', 'vamosaganar', 'nonosvamosarendir', 'gracias', 'colombia', 'gracias', 'x', 'vencer', 'el', 'socialismo', 'xxi', 'gracias', 'farid', 'mondragon', 'y', 'gracias', 'por', 'avanzar', 'estamosmelospaoctavos', 'mlagder', 'pilica7', 'mhoyos333', 'httpstcov2uudmircq']","['másunidosquenunca', 'conlafeintacta', 'vamoscolombia', 'conlatricolorpuesta', 'mundialrusia2018', 'vamosaganar', 'nonosvamosarendir', 'gracias', 'colombia', 'gracias', 'x', 'vencer', 'socialismo', 'xxi', 'gracias', 'farid', 'mondragon', 'gracias', 'avanzar', 'estamosmelospaoctavos', 'mlagder', 'pilica7', 'mhoyos333', 'httpstcov2uudmircq']","['masunidosquenunc', 'conlafeintact', 'vamoscolombi', 'conlatricolorpuest', 'mundialrusia2018', 'vamosagan', 'nonosvamosarend', 'graci', 'colombi', 'graci', 'x', 'venc', 'social', 'xxi', 'graci', 'far', 'mondragon', 'graci', 'avanz', 'estamosmelospaoctav', 'mlagd', 'pilica7', 'mhoyos333', 'httpstcov2uudmircq']",másunidosquenunca conlafeintacta vamoscolombia conlatricolorpuesta mundialrusia2018 vamosaganar nonosvamosarendir gracias colombia gracias x vencer el socialismo xxi gracias farid mondragon y gracias por avanzar estamosmelospaoctavos mlagder pilica7 mhoyos333 httpstcov2uudmircq,-0.029059,0.089238,-0.099062,0.060049,0.159505,0.221087,0.059089,-0.063663,0.054963,0.018248,0.054835,-0.024155,-0.028379,-0.075933,0.009605,0.123610,-0.075535,0.003957,0.056147,0.054395,0.069691,0.078577,-0.029189,-0.030220,0.070053,-0.000857,0.061832,-0.003341,0.054546,0.127368,-0.057978,0.000042,-0.023296,-0.004476,-0.012103,0.025943,0.053024,0.035395,-0.064056,0.107801,0.056701,-0.022982,0.092009,-0.009205,0.050853,-0.086181,0.092590,-0.032310,-0.062916,-0.067546,-0.064600,-0.068724,-0.021663,0.103924,-0.016985,0.005648,-0.071782,0.060273,0.007180,0.029772,0.006658,0.047375,-0.005312,-0.008307,0.084346,0.022886,-0.063585,-0.019063,-0.054511,0.139538,-0.012181,0.016962,-0.060934,-0.048208,0.066808,0.038369,-0.007719,0.060613,0.009058,0.046057,-0.009863,0.079630,0.017453,-0.006552,0.037371,-0.099145,0.089840,0.027956,-0.054526,0.010925,-0.047625,0.024872,0.041494,-0.017940,-0.021979,-0.050140,0.054040,-0.049168,-0.055790,0.049390,2.0
70375,2018-06-22T12:18:11,-1017774079,para cuándo la fcfseleccioncol jugándole como costa rica a brasil,es,NEUTRAL,0.978532,0.006588,0.012492,0.002388,"['para', 'cuándo', 'la', 'fcfseleccioncol', 'jugándole', 'como', 'costa', 'rica', 'a', 'brasil']","['cuándo', 'fcfseleccioncol', 'jugándole', 'costa', 'rica', 'brasil']","['cuand', 'fcfseleccioncol', 'jug', 'cost', 'ric', 'brasil']",para cuándo la fcfseleccioncol jugándole como costa rica a brasil,-0.025459,0.043306,-0.114674,0.107244,0.160624,0.195959,0.105956,-0.005502,0.058612,0.012669,0.129113,-0.093427,0.019399,-0.099694,0.053587,0.064284,-0.040992,-0.008989,0.063830,0.034736,0.054009,0.078707,-0.055019,0.002308,0.030089,0.043087,0.013144,-0.019816,0.046936,0.094541,-0.015879,-0.136571,0.051188,0.021494,-0.151703,0.084773,0.096507,-0.050720,-0.065546,0.039195,0.113884,-0.014540,0.049530,0.076624,0.025680,-0.085711,0.062923,0.006464,0.020467,-0.044417,-0.012207,-0.002950,0.023108,0.070480,-0.024376,0.044859,-0.110032,0.029538,-0.048885,0.060548,-0.048756,0.009855,-0.026975,0.031436,0.013617,-0.042241,-0.061825,-0.073113,-0.021133,0.176416,0.036064,0.068783,-0.089930,0.024474,0.020292,0.024166,-0.036761,-0.012150,0.006574,0.085138,0.089706,0.098435,-0.040762,0.068768,-0.022880,-0.020912,0.018412,0.052602,-0.076928,-0.003735,0.020007,0.018688,0.072371,0.000398,-0.049879,-0.071670,-0.006112,-0.068310,0.022114,0.012613,1.0
232645,2018-06-28T01:54:36.933000,815239173,sondeobahialinda fcfseleccioncol avanza a octavos de final de fifaworldcup_es gracias por participar si das retuit otras personas podrán opinar,es,NEUTRAL,0.751145,0.015220,0.207554,0.026081,"['sondeobahialinda', 'fcfseleccioncol', 'avanza', 'a', 'octavos', 'de', 'final', 'de', 'fifaworldcup_es', 'gracias', 'por', 'participar', 'si', 'das', 'retuit', 'otras', 'personas', 'podrán', 'opinar']","['sondeobahialinda', 'fcfseleccioncol', 'avanza', 'octavos', 'final', 'fifaworldcup_es', 'gracias', 'participar', 'si', 'das', 'retuit', 'personas', 'podrán', 'opinar']","['sondeobahialind', 'fcfseleccioncol', 'avanz', 'octav', 'final', 'fifaworldcup_', 'graci', 'particip', 'si', 'das', 'retuit', 'person', 'podran', 'opin']",sondeobahialinda fcfseleccioncol avanza a octavos de final de fifaworldcup_es gracias por participar si das retuit otras personas podrán opinar,-0.038498,0.092658,-0.085193,0.074903,0.172392,0.139334,0.070888,-0.030142,0.070441,-0.007076,0.123035,-0.087316,-0.011625,-0.089243,0.010598,0.139434,-0.112833,-0.029454,0.050642,0.014475,0.066005,0.074343,-0.020879,0.026893,0.072591,0.039455,0.064366,-0.005532,0.030542,0.076821,-0.008554,-0.079624,-0.000779,-0.010920,-0.075749,0.021891,0.087513,0.009929,-0.095223,0.062236,0.068670,0.030537,0.100545,0.000777,0.029045,-0.034543,0.024093,0.008222,-0.010788,-0.022825,-0.058130,-0.018610,0.020422,0.075978,0.000183,0.039558,-0.041861,0.016465,0.040309,0.075201,-0.011622,0.026906,-0.018503,0.041545,0.053020,0.008578,-0.040196,-0.020956,-0.058936,0.129149,0.041503,0.061106,-0.102242,-0.063055,0.041232,0.042524,0.004540,-0.033103,-0.002749,0.065801,0.086728,0.057673,0.038678,-0.001943,0.012085,-0.008174,0.107566,0.039698,-0.063723,0.004585,-0.051668,0.010095,0.108617,0.015707,-0.012517,-0.074720,-0.003046,-0.029250,-0.036150,0.031267,1.0
14101,2018-05-27T04:16:46,-1399468032,briansa28060006 miseleccionmx algo parecido decían en aquel partido de tercera ronda de fa cup hereford enfrentaba al poderoso newcastle de macdonald lo demás es historia httpstcopffkchpgn2,es,NEUTRAL,0.807122,0.038166,0.136453,0.018259,"['briansa28060006', 'miseleccionmx', 'algo', 'parecido', 'decían', 'en', 'aquel', 'partido', 'de', 'tercera', 'ronda', 'de', 'fa', 'cup', 'hereford', 'enfrentaba', 'al', 'poderoso', 'newcastle', 'de', 'macdonald', 'lo', 'demás', 'es', 'historia', 'httpstcopffkchpgn2']","['briansa28060006', 'miseleccionmx', 'parecido', 'decían', 'aquel', 'partido', 'tercera', 'ronda', 'fa', 'cup', 'hereford', 'enfrentaba', 'poderoso', 'newcastle', 'macdonald', 'demás', 'historia', 'httpstcopffkchpgn2']","['briansa28060006', 'miseleccionmx', 'parec', 'dec', 'aquel', 'part', 'tercer', 'rond', 'fa', 'cup', 'hereford', 'enfrent', 'poder', 'newcastl', 'macdonald', 'demas', 'histori', 'httpstcopffkchpgn2']",briansa28060006 miseleccionmx algo parecido decían en aquel partido de tercera ronda de fa cup hereford enfrentaba al poderoso newcastle de macdonald lo demás es historia httpstcopffkchpgn2,0.003865,0.049860,-0.099769,0.056782,0.151591,0.158307,0.055609,-0.054735,0.023316,-0.007576,0.103678,-0.028259,0.025342,-0.065894,0.026852,0.105057,-0.055998,-0.000320,0.050555,0.033259,0.037757,0.074706,-0.077085,-0.004926,0.057498,0.053739,0.097696,-0.061206,0.021930,0.113087,-0.000339,-0.142970,0.013738,0.006194,-0.120748,0.045394,0.077567,0.022510,-0.050040,0.051580,0.116538,-0.019421,0.109968,-0.027044,0.008548,-0.064142,0.002302,-0.024634,0.015827,-0.003306,-0.024176,-0.001803,0.034689,0.123042,-0.024925,0.073728,-0.053777,-0.036464,0.030956,0.076912,-0.080225,0.029142,0.015844,0.029858,0.014861,-0.004336,-0.010926,-0.009192,-0.038576,0.161950,0.019742,0.045517,-0.070968,-0.052140,0.047850,0.011796,-0.038979,0.015676,0.024589,0.071631,0.039785,0.038598,0.001736,0.020920,-0.041757,0.026955,0.077167,0.029651,-0.091344,0.032060,-0.028149,0.029808,0.077089,-0.008362,-0.057512,-0.081587,-0.023911,-0.031327,-0.024364,0.022909,1.0
25314,2018-06-25T17:34:21,-216539134,creo que llegamos acuartos lo siento pero creo que la sefutbol se cae antes de semis y tú liberaloquesientes en nuestracopa,es,NEUTRAL,0.404293,0.291237,0.103653,0.200817,"['creo', 'que', 'llegamos', 'acuartos', 'lo', 'siento', 'pero', 'creo', 'que', 'la', 'sefutbol', 'se', 'cae', 'antes', 'de', 'semis', 'y', 'tú', 'liberaloquesientes', 'en', 'nuestracopa']","['creo', 'llegamos', 'acuartos', 'siento', 'creo', 'sefutbol', 'cae', 'semis', 'liberaloquesientes', 'nuestracopa']","['cre', 'lleg', 'acuart', 'sient', 'cre', 'sefutbol', 'cae', 'semis', 'liberaloquesient', 'nuestracop']",creo que llegamos acuartos lo siento pero creo que la sefutbol se cae antes de semis y tú liberaloquesientes en nuestracopa,-0.012956,0.038656,-0.029059,0.085300,0.171135,0.203375,0.082519,-0.101758,0.047706,-0.034739,0.118921,-0.068021,-0.000702,-0.054345,0.027981,0.107607,-0.121319,-0.020568,0.040970,0.024316,0.084084,0.007137,-0.062161,0.006358,0.029319,0.052733,0.085000,-0.047910,-0.003158,0.132708,-0.029517,-0.159212,-0.016833,0.007452,-0.179993,0.048372,0.054405,0.057704,-0.052625,0.032404,0.051158,-0.004924,0.124484,-0.042853,-0.004547,-0.029379,0.018346,-0.014134,0.012154,-0.020898,-0.006938,-0.036874,0.042938,0.129090,-0.009299,0.026190,-0.081128,-0.020999,0.090503,0.061707,-0.021594,-0.029597,-0.071597,0.078393,-0.071793,0.026715,-0.102311,-0.084775,-0.012751,0.126799,0.042978,0.037369,-0.037034,-0.026358,0.023738,-0.007208,-0.030593,0.012254,0.016681,0.075934,0.089524,0.023083,0.016865,0.024628,0.029534,0.033512,0.057358,-0.002124,-0.002758,0.076347,0.030757,-0.002348,0.112446,-0.056201,-0.040363,-0.169861,-0.017810,-0.018384,-0.044457,0.015030,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326267,2018-05-16T02:47:49,-145313791,el domingo 27 argentina entrenará a puertas abiertas en el estadio amalfitani gran iniciativa para que ingresen alumnos de las escuelas de caba y gran bs as httpstcowvp2azjqyh,es,NEUTRAL,0.880786,0.059434,0.050053,0.009727,"['el', 'domingo', '27', 'argentina', 'entrenará', 'a', 'puertas', 'abiertas', 'en', 'el', 'estadio', 'amalfitani', 'gran', 'iniciativa', 'para', 'que', 'ingresen', 'alumnos', 'de', 'las', 'escuelas', 'de', 'caba', 'y', 'gran', 'bs', 'as', 'httpstcowvp2azjqyh']","['domingo', '27', 'argentina', 'entrenará', 'puertas', 'abiertas', 'estadio', 'amalfitani', 'gran', 'iniciativa', 'ingresen', 'alumnos', 'escuelas', 'caba', 'gran', 'bs', 'as', 'httpstcowvp2azjqyh']","['doming', '27', 'argentin', 'entren', 'puert', 'abiert', 'estadi', 'amalfitani', 'gran', 'inici', 'ingres', 'alumn', 'escuel', 'cab', 'gran', 'bs', 'as', 'httpstcowvp2azjqyh']",el domingo 27 argentina entrenará a puertas abiertas en el estadio amalfitani gran iniciativa para que ingresen alumnos de las escuelas de caba y gran bs as httpstcowvp2azjqyh,-0.051549,0.061011,-0.063376,0.041306,0.095232,0.132294,0.101941,-0.089577,0.025458,0.011424,0.121157,-0.025566,-0.025312,-0.072683,0.026776,0.142752,-0.075512,0.033897,0.070762,0.018415,0.086743,0.122432,-0.073439,0.008200,0.006707,0.047511,0.084368,-0.058080,0.037879,0.108489,-0.017745,-0.097516,0.048869,0.045748,-0.146609,0.058247,0.078037,0.067279,-0.093812,0.037309,0.038848,-0.017222,0.093509,0.017076,0.057393,-0.086929,0.060876,0.019618,-0.016461,-0.059271,-0.029777,0.057963,0.080524,0.069687,-0.007816,0.028449,-0.058005,-0.012977,0.008556,0.067330,-0.053717,0.053006,0.028911,0.014186,0.039478,-0.008043,-0.019827,-0.015984,-0.020956,0.193638,0.058800,0.018315,-0.062159,-0.072707,0.040598,0.007520,-0.032723,0.046036,0.059565,0.055371,0.102447,0.104503,0.039041,-0.013246,-0.019756,0.044394,0.019204,0.034734,-0.084133,-0.010270,0.016535,-0.009201,0.097082,0.011580,0.022317,-0.077825,-0.021497,-0.011386,-0.004569,-0.006631,1.0
4714,2018-06-19T20:48:17,379015170,ivanduque fcfseleccioncol juanferquinte10 fuerzas y mucha suerte desde argentina,es,POSITIVE,0.327346,0.004267,0.655717,0.012670,"['ivanduque', 'fcfseleccioncol', 'juanferquinte10', 'fuerzas', 'y', 'mucha', 'suerte', 'desde', 'argentina']","['ivanduque', 'fcfseleccioncol', 'juanferquinte10', 'fuerzas', 'mucha', 'suerte', 'argentina']","['ivanduqu', 'fcfseleccioncol', 'juanferquinte10', 'fuerz', 'much', 'suert', 'argentin']",ivanduque fcfseleccioncol juanferquinte10 fuerzas y mucha suerte desde argentina,-0.052975,0.113737,-0.046949,0.038394,0.132923,0.209435,0.082269,-0.062804,0.010835,-0.053656,0.101655,-0.040315,0.000141,-0.073440,-0.029260,0.075084,-0.111483,-0.005110,0.092333,0.020949,0.138343,0.089651,-0.028031,0.012767,0.030853,0.039336,0.029801,0.064977,0.044764,0.081725,-0.012474,-0.053643,0.015417,0.021400,-0.105155,0.119819,0.045169,0.078755,-0.053068,0.030883,0.056416,-0.005092,0.055392,0.019956,0.045628,-0.084516,0.083857,0.054090,0.010571,-0.036650,-0.046505,-0.029809,-0.077264,0.073066,-0.003076,0.002315,-0.129515,0.030657,-0.029623,0.003043,0.028775,-0.000667,0.036774,0.073651,0.093086,0.034433,-0.100212,0.017992,-0.005752,0.176074,-0.049380,0.082259,-0.119088,0.001600,-0.004346,-0.001778,-0.020699,0.045024,-0.004537,0.036926,0.053003,0.083997,0.007126,0.004815,0.000381,-0.070814,0.033139,0.084600,-0.056326,0.004177,0.026218,0.054509,0.066516,-0.011297,-0.006635,-0.028289,0.038306,-0.078549,0.003311,0.008494,2.0
255798,2018-06-16T13:06:56,-1256730622,enriquebermudez se tomó unos segundos de su tiempo y le deseo a panamá lo mejor en nuestro primer mundial fepafut eslalegion lajugadatd gracias httpstcoarggjxw2qx,es,NEUTRAL,0.689673,0.020716,0.269071,0.020540,"['enriquebermudez', 'se', 'tomó', 'unos', 'segundos', 'de', 'su', 'tiempo', 'y', 'le', 'deseo', 'a', 'panamá', 'lo', 'mejor', 'en', 'nuestro', 'primer', 'mundial', 'fepafut', 'eslalegion', 'lajugadatd', 'gracias', 'httpstcoarggjxw2qx']","['enriquebermudez', 'tomó', 'segundos', 'tiempo', 'deseo', 'panamá', 'mejor', 'primer', 'mundial', 'fepafut', 'eslalegion', 'lajugadatd', 'gracias', 'httpstcoarggjxw2qx']","['enriquebermudez', 'tom', 'segund', 'tiemp', 'dese', 'panam', 'mejor', 'prim', 'mundial', 'fepafut', 'eslalegion', 'lajugadatd', 'graci', 'httpstcoarggjxw2qx']",enriquebermudez se tomó unos segundos de su tiempo y le deseo a panamá lo mejor en nuestro primer mundial fepafut eslalegion lajugadatd gracias httpstcoarggjxw2qx,0.011584,0.072628,-0.080580,0.084259,0.133798,0.172166,0.063611,-0.063362,0.043030,0.013296,0.088480,-0.074086,0.017384,-0.079426,-0.012058,0.087223,-0.132494,-0.016969,0.069399,0.015764,0.087862,0.103415,-0.081591,0.014202,0.027131,0.036565,0.075011,-0.018536,0.022596,0.088980,0.005527,-0.103893,-0.015332,0.026034,-0.157355,0.056940,0.093018,0.028521,-0.053961,0.035633,0.081447,0.005088,0.090782,-0.001592,0.038897,-0.034664,0.062279,-0.014978,0.044641,-0.012816,-0.006859,-0.019898,-0.000478,0.088160,-0.006577,0.046745,-0.054783,-0.023887,0.021985,0.049629,-0.071106,0.008962,0.038463,-0.000367,0.021173,0.026625,-0.038849,-0.024515,-0.044346,0.169908,0.009813,-0.016181,-0.077721,-0.051051,0.001698,-0.003228,-0.021093,0.039607,0.012951,0.061180,0.066223,0.095677,0.000760,0.014096,0.008043,-0.016649,0.071006,0.017279,-0.100617,0.065277,-0.022160,0.021061,0.071284,-0.037081,-0.052708,-0.094497,-0.049616,-0.018925,0.012614,0.000158,1.0
188811,2018-05-14T16:46:27,651444226,miseleccionmx como no va estar optimista este tipo si con el 70 ni el 41 lo corrieron una burla la convocatoria,es,NEGATIVE,0.247989,0.548849,0.131675,0.071486,"['miseleccionmx', 'como', 'no', 'va', 'estar', 'optimista', 'este', 'tipo', 'si', 'con', 'el', '70', 'ni', 'el', '41', 'lo', 'corrieron', 'una', 'burla', 'la', 'convocatoria']","['miseleccionmx', 'va', 'optimista', 'tipo', 'si', '70', '41', 'corrieron', 'burla', 'convocatoria']","['miseleccionmx', 'va', 'optim', 'tip', 'si', '70', '41', 'corr', 'burl', 'convocatori']",miseleccionmx como no va estar optimista este tipo si con el 70 ni el 41 lo corrieron una burla la convocatoria,0.041479,0.026032,-0.065944,0.096180,0.152838,0.187604,0.109234,-0.050441,0.066923,-0.026683,0.148728,-0.073246,0.016967,-0.104371,0.035259,0.098890,-0.060566,-0.005775,0.073976,0.010041,0.118065,0.048675,-0.085861,0.024130,0.032981,0.094084,0.077959,0.002477,0.017067,0.081394,-0.023307,-0.148203,0.069944,0.002160,-0.124651,0.023923,0.082099,0.026353,-0.053300,0.073332,0.085179,0.017043,0.119843,-0.044009,0.082128,-0.032108,0.018469,0.012249,-0.008964,-0.018879,0.018671,0.006771,-0.007420,0.104694,-0.007673,0.054768,-0.065160,0.002904,0.004159,0.027826,-0.084077,-0.031280,-0.043090,0.036869,-0.020199,-0.009374,-0.032555,-0.041268,-0.048703,0.116350,0.056368,0.030932,-0.097704,-0.051632,0.021330,0.006523,-0.018077,0.000873,0.057959,0.073982,0.076530,0.044055,0.021522,0.021660,0.003672,0.036145,0.091235,0.027082,-0.094099,0.065697,0.004530,0.037694,0.088624,-0.032298,-0.044362,-0.131787,-0.024859,-0.038583,0.006688,0.037291,0.0


In [62]:
vocab_size = 282459  # number of distinct tokens
dimensions = 100  # dimensions of embedding fasttext
max_sequence_length = 35  # number of tokens per tweet

In [89]:
# Set the max_columns option to None to display all columns
pd.set_option("display.max_columns", None)

df

Unnamed: 0,tweet_date_created,tweet_id,tweet_text,language,sentiment,Neutral,Negative,Positive,Mixed,tokens,filtered_tokens,stemmed_tokens,tokens_as_string,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,sentiment_encoded
60151,2018-06-28T16:12:59.384000,1784107008,másunidosquenunca conlafeintacta vamoscolombia conlatricolorpuesta mundialrusia2018 vamosaganar nonosvamosarendir gracias colombia gracias x vencer el socialismo xxi gracias farid mondragon y gracias por avanzar estamosmelospaoctavos\nmlagder pilica7 mhoyos333 httpstcov2uudmircq,es,POSITIVE,0.350060,0.000235,0.644539,0.005166,"['másunidosquenunca', 'conlafeintacta', 'vamoscolombia', 'conlatricolorpuesta', 'mundialrusia2018', 'vamosaganar', 'nonosvamosarendir', 'gracias', 'colombia', 'gracias', 'x', 'vencer', 'el', 'socialismo', 'xxi', 'gracias', 'farid', 'mondragon', 'y', 'gracias', 'por', 'avanzar', 'estamosmelospaoctavos', 'mlagder', 'pilica7', 'mhoyos333', 'httpstcov2uudmircq']","['másunidosquenunca', 'conlafeintacta', 'vamoscolombia', 'conlatricolorpuesta', 'mundialrusia2018', 'vamosaganar', 'nonosvamosarendir', 'gracias', 'colombia', 'gracias', 'x', 'vencer', 'socialismo', 'xxi', 'gracias', 'farid', 'mondragon', 'gracias', 'avanzar', 'estamosmelospaoctavos', 'mlagder', 'pilica7', 'mhoyos333', 'httpstcov2uudmircq']","['masunidosquenunc', 'conlafeintact', 'vamoscolombi', 'conlatricolorpuest', 'mundialrusia2018', 'vamosagan', 'nonosvamosarend', 'graci', 'colombi', 'graci', 'x', 'venc', 'social', 'xxi', 'graci', 'far', 'mondragon', 'graci', 'avanz', 'estamosmelospaoctav', 'mlagd', 'pilica7', 'mhoyos333', 'httpstcov2uudmircq']",másunidosquenunca conlafeintacta vamoscolombia conlatricolorpuesta mundialrusia2018 vamosaganar nonosvamosarendir gracias colombia gracias x vencer el socialismo xxi gracias farid mondragon y gracias por avanzar estamosmelospaoctavos mlagder pilica7 mhoyos333 httpstcov2uudmircq,-0.029059,0.089238,-0.099062,0.060049,0.159505,0.221087,0.059089,-0.063663,0.054963,0.018248,0.054835,-0.024155,-0.028379,-0.075933,0.009605,0.123610,-0.075535,0.003957,0.056147,0.054395,0.069691,0.078577,-0.029189,-0.030220,0.070053,-0.000857,0.061832,-0.003341,0.054546,0.127368,-0.057978,0.000042,-0.023296,-0.004476,-0.012103,0.025943,0.053024,0.035395,-0.064056,0.107801,0.056701,-0.022982,0.092009,-0.009205,0.050853,-0.086181,0.092590,-0.032310,-0.062916,-0.067546,-0.064600,-0.068724,-0.021663,0.103924,-0.016985,0.005648,-0.071782,0.060273,0.007180,0.029772,0.006658,0.047375,-0.005312,-0.008307,0.084346,0.022886,-0.063585,-0.019063,-0.054511,0.139538,-0.012181,0.016962,-0.060934,-0.048208,0.066808,0.038369,-0.007719,0.060613,0.009058,0.046057,-0.009863,0.079630,0.017453,-0.006552,0.037371,-0.099145,0.089840,0.027956,-0.054526,0.010925,-0.047625,0.024872,0.041494,-0.017940,-0.021979,-0.050140,0.054040,-0.049168,-0.055790,0.049390,2.0
70375,2018-06-22T12:18:11,-1017774079,para cuándo la fcfseleccioncol jugándole como costa rica a brasil,es,NEUTRAL,0.978532,0.006588,0.012492,0.002388,"['para', 'cuándo', 'la', 'fcfseleccioncol', 'jugándole', 'como', 'costa', 'rica', 'a', 'brasil']","['cuándo', 'fcfseleccioncol', 'jugándole', 'costa', 'rica', 'brasil']","['cuand', 'fcfseleccioncol', 'jug', 'cost', 'ric', 'brasil']",para cuándo la fcfseleccioncol jugándole como costa rica a brasil,-0.025459,0.043306,-0.114674,0.107244,0.160624,0.195959,0.105956,-0.005502,0.058612,0.012669,0.129113,-0.093427,0.019399,-0.099694,0.053587,0.064284,-0.040992,-0.008989,0.063830,0.034736,0.054009,0.078707,-0.055019,0.002308,0.030089,0.043087,0.013144,-0.019816,0.046936,0.094541,-0.015879,-0.136571,0.051188,0.021494,-0.151703,0.084773,0.096507,-0.050720,-0.065546,0.039195,0.113884,-0.014540,0.049530,0.076624,0.025680,-0.085711,0.062923,0.006464,0.020467,-0.044417,-0.012207,-0.002950,0.023108,0.070480,-0.024376,0.044859,-0.110032,0.029538,-0.048885,0.060548,-0.048756,0.009855,-0.026975,0.031436,0.013617,-0.042241,-0.061825,-0.073113,-0.021133,0.176416,0.036064,0.068783,-0.089930,0.024474,0.020292,0.024166,-0.036761,-0.012150,0.006574,0.085138,0.089706,0.098435,-0.040762,0.068768,-0.022880,-0.020912,0.018412,0.052602,-0.076928,-0.003735,0.020007,0.018688,0.072371,0.000398,-0.049879,-0.071670,-0.006112,-0.068310,0.022114,0.012613,1.0
232645,2018-06-28T01:54:36.933000,815239173,sondeobahialinda fcfseleccioncol avanza a octavos de final de fifaworldcup_es gracias por participar si das retuit otras personas podrán opinar,es,NEUTRAL,0.751145,0.015220,0.207554,0.026081,"['sondeobahialinda', 'fcfseleccioncol', 'avanza', 'a', 'octavos', 'de', 'final', 'de', 'fifaworldcup_es', 'gracias', 'por', 'participar', 'si', 'das', 'retuit', 'otras', 'personas', 'podrán', 'opinar']","['sondeobahialinda', 'fcfseleccioncol', 'avanza', 'octavos', 'final', 'fifaworldcup_es', 'gracias', 'participar', 'si', 'das', 'retuit', 'personas', 'podrán', 'opinar']","['sondeobahialind', 'fcfseleccioncol', 'avanz', 'octav', 'final', 'fifaworldcup_', 'graci', 'particip', 'si', 'das', 'retuit', 'person', 'podran', 'opin']",sondeobahialinda fcfseleccioncol avanza a octavos de final de fifaworldcup_es gracias por participar si das retuit otras personas podrán opinar,-0.038498,0.092658,-0.085193,0.074903,0.172392,0.139334,0.070888,-0.030142,0.070441,-0.007076,0.123035,-0.087316,-0.011625,-0.089243,0.010598,0.139434,-0.112833,-0.029454,0.050642,0.014475,0.066005,0.074343,-0.020879,0.026893,0.072591,0.039455,0.064366,-0.005532,0.030542,0.076821,-0.008554,-0.079624,-0.000779,-0.010920,-0.075749,0.021891,0.087513,0.009929,-0.095223,0.062236,0.068670,0.030537,0.100545,0.000777,0.029045,-0.034543,0.024093,0.008222,-0.010788,-0.022825,-0.058130,-0.018610,0.020422,0.075978,0.000183,0.039558,-0.041861,0.016465,0.040309,0.075201,-0.011622,0.026906,-0.018503,0.041545,0.053020,0.008578,-0.040196,-0.020956,-0.058936,0.129149,0.041503,0.061106,-0.102242,-0.063055,0.041232,0.042524,0.004540,-0.033103,-0.002749,0.065801,0.086728,0.057673,0.038678,-0.001943,0.012085,-0.008174,0.107566,0.039698,-0.063723,0.004585,-0.051668,0.010095,0.108617,0.015707,-0.012517,-0.074720,-0.003046,-0.029250,-0.036150,0.031267,1.0
14101,2018-05-27T04:16:46,-1399468032,briansa28060006 miseleccionmx algo parecido decían en aquel partido de tercera ronda de fa cup hereford enfrentaba al poderoso newcastle de macdonald lo demás es historia httpstcopffkchpgn2,es,NEUTRAL,0.807122,0.038166,0.136453,0.018259,"['briansa28060006', 'miseleccionmx', 'algo', 'parecido', 'decían', 'en', 'aquel', 'partido', 'de', 'tercera', 'ronda', 'de', 'fa', 'cup', 'hereford', 'enfrentaba', 'al', 'poderoso', 'newcastle', 'de', 'macdonald', 'lo', 'demás', 'es', 'historia', 'httpstcopffkchpgn2']","['briansa28060006', 'miseleccionmx', 'parecido', 'decían', 'aquel', 'partido', 'tercera', 'ronda', 'fa', 'cup', 'hereford', 'enfrentaba', 'poderoso', 'newcastle', 'macdonald', 'demás', 'historia', 'httpstcopffkchpgn2']","['briansa28060006', 'miseleccionmx', 'parec', 'dec', 'aquel', 'part', 'tercer', 'rond', 'fa', 'cup', 'hereford', 'enfrent', 'poder', 'newcastl', 'macdonald', 'demas', 'histori', 'httpstcopffkchpgn2']",briansa28060006 miseleccionmx algo parecido decían en aquel partido de tercera ronda de fa cup hereford enfrentaba al poderoso newcastle de macdonald lo demás es historia httpstcopffkchpgn2,0.003865,0.049860,-0.099769,0.056782,0.151591,0.158307,0.055609,-0.054735,0.023316,-0.007576,0.103678,-0.028259,0.025342,-0.065894,0.026852,0.105057,-0.055998,-0.000320,0.050555,0.033259,0.037757,0.074706,-0.077085,-0.004926,0.057498,0.053739,0.097696,-0.061206,0.021930,0.113087,-0.000339,-0.142970,0.013738,0.006194,-0.120748,0.045394,0.077567,0.022510,-0.050040,0.051580,0.116538,-0.019421,0.109968,-0.027044,0.008548,-0.064142,0.002302,-0.024634,0.015827,-0.003306,-0.024176,-0.001803,0.034689,0.123042,-0.024925,0.073728,-0.053777,-0.036464,0.030956,0.076912,-0.080225,0.029142,0.015844,0.029858,0.014861,-0.004336,-0.010926,-0.009192,-0.038576,0.161950,0.019742,0.045517,-0.070968,-0.052140,0.047850,0.011796,-0.038979,0.015676,0.024589,0.071631,0.039785,0.038598,0.001736,0.020920,-0.041757,0.026955,0.077167,0.029651,-0.091344,0.032060,-0.028149,0.029808,0.077089,-0.008362,-0.057512,-0.081587,-0.023911,-0.031327,-0.024364,0.022909,1.0
25314,2018-06-25T17:34:21,-216539134,creo que llegamos acuartos lo siento pero creo que la sefutbol se cae antes de semis y tú liberaloquesientes en nuestracopa,es,NEUTRAL,0.404293,0.291237,0.103653,0.200817,"['creo', 'que', 'llegamos', 'acuartos', 'lo', 'siento', 'pero', 'creo', 'que', 'la', 'sefutbol', 'se', 'cae', 'antes', 'de', 'semis', 'y', 'tú', 'liberaloquesientes', 'en', 'nuestracopa']","['creo', 'llegamos', 'acuartos', 'siento', 'creo', 'sefutbol', 'cae', 'semis', 'liberaloquesientes', 'nuestracopa']","['cre', 'lleg', 'acuart', 'sient', 'cre', 'sefutbol', 'cae', 'semis', 'liberaloquesient', 'nuestracop']",creo que llegamos acuartos lo siento pero creo que la sefutbol se cae antes de semis y tú liberaloquesientes en nuestracopa,-0.012956,0.038656,-0.029059,0.085300,0.171135,0.203375,0.082519,-0.101758,0.047706,-0.034739,0.118921,-0.068021,-0.000702,-0.054345,0.027981,0.107607,-0.121319,-0.020568,0.040970,0.024316,0.084084,0.007137,-0.062161,0.006358,0.029319,0.052733,0.085000,-0.047910,-0.003158,0.132708,-0.029517,-0.159212,-0.016833,0.007452,-0.179993,0.048372,0.054405,0.057704,-0.052625,0.032404,0.051158,-0.004924,0.124484,-0.042853,-0.004547,-0.029379,0.018346,-0.014134,0.012154,-0.020898,-0.006938,-0.036874,0.042938,0.129090,-0.009299,0.026190,-0.081128,-0.020999,0.090503,0.061707,-0.021594,-0.029597,-0.071597,0.078393,-0.071793,0.026715,-0.102311,-0.084775,-0.012751,0.126799,0.042978,0.037369,-0.037034,-0.026358,0.023738,-0.007208,-0.030593,0.012254,0.016681,0.075934,0.089524,0.023083,0.016865,0.024628,0.029534,0.033512,0.057358,-0.002124,-0.002758,0.076347,0.030757,-0.002348,0.112446,-0.056201,-0.040363,-0.169861,-0.017810,-0.018384,-0.044457,0.015030,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326267,2018-05-16T02:47:49,-145313791,el domingo 27 argentina entrenará a puertas abiertas en el estadio amalfitani gran iniciativa para que ingresen alumnos de las escuelas de caba y gran bs as httpstcowvp2azjqyh,es,NEUTRAL,0.880786,0.059434,0.050053,0.009727,"['el', 'domingo', '27', 'argentina', 'entrenará', 'a', 'puertas', 'abiertas', 'en', 'el', 'estadio', 'amalfitani', 'gran', 'iniciativa', 'para', 'que', 'ingresen', 'alumnos', 'de', 'las', 'escuelas', 'de', 'caba', 'y', 'gran', 'bs', 'as', 'httpstcowvp2azjqyh']","['domingo', '27', 'argentina', 'entrenará', 'puertas', 'abiertas', 'estadio', 'amalfitani', 'gran', 'iniciativa', 'ingresen', 'alumnos', 'escuelas', 'caba', 'gran', 'bs', 'as', 'httpstcowvp2azjqyh']","['doming', '27', 'argentin', 'entren', 'puert', 'abiert', 'estadi', 'amalfitani', 'gran', 'inici', 'ingres', 'alumn', 'escuel', 'cab', 'gran', 'bs', 'as', 'httpstcowvp2azjqyh']",el domingo 27 argentina entrenará a puertas abiertas en el estadio amalfitani gran iniciativa para que ingresen alumnos de las escuelas de caba y gran bs as httpstcowvp2azjqyh,-0.051549,0.061011,-0.063376,0.041306,0.095232,0.132294,0.101941,-0.089577,0.025458,0.011424,0.121157,-0.025566,-0.025312,-0.072683,0.026776,0.142752,-0.075512,0.033897,0.070762,0.018415,0.086743,0.122432,-0.073439,0.008200,0.006707,0.047511,0.084368,-0.058080,0.037879,0.108489,-0.017745,-0.097516,0.048869,0.045748,-0.146609,0.058247,0.078037,0.067279,-0.093812,0.037309,0.038848,-0.017222,0.093509,0.017076,0.057393,-0.086929,0.060876,0.019618,-0.016461,-0.059271,-0.029777,0.057963,0.080524,0.069687,-0.007816,0.028449,-0.058005,-0.012977,0.008556,0.067330,-0.053717,0.053006,0.028911,0.014186,0.039478,-0.008043,-0.019827,-0.015984,-0.020956,0.193638,0.058800,0.018315,-0.062159,-0.072707,0.040598,0.007520,-0.032723,0.046036,0.059565,0.055371,0.102447,0.104503,0.039041,-0.013246,-0.019756,0.044394,0.019204,0.034734,-0.084133,-0.010270,0.016535,-0.009201,0.097082,0.011580,0.022317,-0.077825,-0.021497,-0.011386,-0.004569,-0.006631,1.0
4714,2018-06-19T20:48:17,379015170,ivanduque fcfseleccioncol juanferquinte10 fuerzas y mucha suerte desde argentina,es,POSITIVE,0.327346,0.004267,0.655717,0.012670,"['ivanduque', 'fcfseleccioncol', 'juanferquinte10', 'fuerzas', 'y', 'mucha', 'suerte', 'desde', 'argentina']","['ivanduque', 'fcfseleccioncol', 'juanferquinte10', 'fuerzas', 'mucha', 'suerte', 'argentina']","['ivanduqu', 'fcfseleccioncol', 'juanferquinte10', 'fuerz', 'much', 'suert', 'argentin']",ivanduque fcfseleccioncol juanferquinte10 fuerzas y mucha suerte desde argentina,-0.052975,0.113737,-0.046949,0.038394,0.132923,0.209435,0.082269,-0.062804,0.010835,-0.053656,0.101655,-0.040315,0.000141,-0.073440,-0.029260,0.075084,-0.111483,-0.005110,0.092333,0.020949,0.138343,0.089651,-0.028031,0.012767,0.030853,0.039336,0.029801,0.064977,0.044764,0.081725,-0.012474,-0.053643,0.015417,0.021400,-0.105155,0.119819,0.045169,0.078755,-0.053068,0.030883,0.056416,-0.005092,0.055392,0.019956,0.045628,-0.084516,0.083857,0.054090,0.010571,-0.036650,-0.046505,-0.029809,-0.077264,0.073066,-0.003076,0.002315,-0.129515,0.030657,-0.029623,0.003043,0.028775,-0.000667,0.036774,0.073651,0.093086,0.034433,-0.100212,0.017992,-0.005752,0.176074,-0.049380,0.082259,-0.119088,0.001600,-0.004346,-0.001778,-0.020699,0.045024,-0.004537,0.036926,0.053003,0.083997,0.007126,0.004815,0.000381,-0.070814,0.033139,0.084600,-0.056326,0.004177,0.026218,0.054509,0.066516,-0.011297,-0.006635,-0.028289,0.038306,-0.078549,0.003311,0.008494,2.0
255798,2018-06-16T13:06:56,-1256730622,enriquebermudez se tomó unos segundos de su tiempo y le deseo a panamá lo mejor en nuestro primer mundial fepafut eslalegion lajugadatd gracias httpstcoarggjxw2qx,es,NEUTRAL,0.689673,0.020716,0.269071,0.020540,"['enriquebermudez', 'se', 'tomó', 'unos', 'segundos', 'de', 'su', 'tiempo', 'y', 'le', 'deseo', 'a', 'panamá', 'lo', 'mejor', 'en', 'nuestro', 'primer', 'mundial', 'fepafut', 'eslalegion', 'lajugadatd', 'gracias', 'httpstcoarggjxw2qx']","['enriquebermudez', 'tomó', 'segundos', 'tiempo', 'deseo', 'panamá', 'mejor', 'primer', 'mundial', 'fepafut', 'eslalegion', 'lajugadatd', 'gracias', 'httpstcoarggjxw2qx']","['enriquebermudez', 'tom', 'segund', 'tiemp', 'dese', 'panam', 'mejor', 'prim', 'mundial', 'fepafut', 'eslalegion', 'lajugadatd', 'graci', 'httpstcoarggjxw2qx']",enriquebermudez se tomó unos segundos de su tiempo y le deseo a panamá lo mejor en nuestro primer mundial fepafut eslalegion lajugadatd gracias httpstcoarggjxw2qx,0.011584,0.072628,-0.080580,0.084259,0.133798,0.172166,0.063611,-0.063362,0.043030,0.013296,0.088480,-0.074086,0.017384,-0.079426,-0.012058,0.087223,-0.132494,-0.016969,0.069399,0.015764,0.087862,0.103415,-0.081591,0.014202,0.027131,0.036565,0.075011,-0.018536,0.022596,0.088980,0.005527,-0.103893,-0.015332,0.026034,-0.157355,0.056940,0.093018,0.028521,-0.053961,0.035633,0.081447,0.005088,0.090782,-0.001592,0.038897,-0.034664,0.062279,-0.014978,0.044641,-0.012816,-0.006859,-0.019898,-0.000478,0.088160,-0.006577,0.046745,-0.054783,-0.023887,0.021985,0.049629,-0.071106,0.008962,0.038463,-0.000367,0.021173,0.026625,-0.038849,-0.024515,-0.044346,0.169908,0.009813,-0.016181,-0.077721,-0.051051,0.001698,-0.003228,-0.021093,0.039607,0.012951,0.061180,0.066223,0.095677,0.000760,0.014096,0.008043,-0.016649,0.071006,0.017279,-0.100617,0.065277,-0.022160,0.021061,0.071284,-0.037081,-0.052708,-0.094497,-0.049616,-0.018925,0.012614,0.000158,1.0
188811,2018-05-14T16:46:27,651444226,miseleccionmx como no va estar optimista este tipo si con el 70 ni el 41 lo corrieron una burla la convocatoria,es,NEGATIVE,0.247989,0.548849,0.131675,0.071486,"['miseleccionmx', 'como', 'no', 'va', 'estar', 'optimista', 'este', 'tipo', 'si', 'con', 'el', '70', 'ni', 'el', '41', 'lo', 'corrieron', 'una', 'burla', 'la', 'convocatoria']","['miseleccionmx', 'va', 'optimista', 'tipo', 'si', '70', '41', 'corrieron', 'burla', 'convocatoria']","['miseleccionmx', 'va', 'optim', 'tip', 'si', '70', '41', 'corr', 'burl', 'convocatori']",miseleccionmx como no va estar optimista este tipo si con el 70 ni el 41 lo corrieron una burla la convocatoria,0.041479,0.026032,-0.065944,0.096180,0.152838,0.187604,0.109234,-0.050441,0.066923,-0.026683,0.148728,-0.073246,0.016967,-0.104371,0.035259,0.098890,-0.060566,-0.005775,0.073976,0.010041,0.118065,0.048675,-0.085861,0.024130,0.032981,0.094084,0.077959,0.002477,0.017067,0.081394,-0.023307,-0.148203,0.069944,0.002160,-0.124651,0.023923,0.082099,0.026353,-0.053300,0.073332,0.085179,0.017043,0.119843,-0.044009,0.082128,-0.032108,0.018469,0.012249,-0.008964,-0.018879,0.018671,0.006771,-0.007420,0.104694,-0.007673,0.054768,-0.065160,0.002904,0.004159,0.027826,-0.084077,-0.031280,-0.043090,0.036869,-0.020199,-0.009374,-0.032555,-0.041268,-0.048703,0.116350,0.056368,0.030932,-0.097704,-0.051632,0.021330,0.006523,-0.018077,0.000873,0.057959,0.073982,0.076530,0.044055,0.021522,0.021660,0.003672,0.036145,0.091235,0.027082,-0.094099,0.065697,0.004530,0.037694,0.088624,-0.032298,-0.044362,-0.131787,-0.024859,-0.038583,0.006688,0.037291,0.0


In [90]:
# Extract X (features) and y (target)
X = df.iloc[:, 13:-1].to_numpy()
y = df["sentiment_encoded"].to_numpy()

# Make sure y is in the correct shape
y = np.reshape(y, (-1, 1))

In [91]:
y

array([[2.],
       [1.],
       [1.],
       ...,
       [1.],
       [0.],
       [0.]])

In [92]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [93]:
X_train

array([[-0.02098395,  0.08056071, -0.03259998, ..., -0.05141212,
        -0.00999504, -0.01927253],
       [-0.04503735,  0.07792307, -0.08982849, ...,  0.01504025,
        -0.04177724,  0.08012953],
       [ 0.03346503,  0.07148388, -0.06787509, ..., -0.02803733,
         0.00856456, -0.00141171],
       ...,
       [-0.02056255,  0.08412792, -0.14409721, ...,  0.00446333,
        -0.03497283,  0.05015467],
       [-0.01023732,  0.09164905, -0.05616085, ..., -0.01642906,
         0.0237581 , -0.00789321],
       [-0.00020143,  0.02998669,  0.02280423, ..., -0.06988493,
        -0.01309495,  0.06138528]])

In [94]:
y_train

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]])

In [95]:
print(y_train[:1000])  # Print the first 10 label values

[[1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [0.]
 [2.]
 [2.]
 [1.]
 [0.]
 [1.]
 [1.]
 [1.]
 [2.]
 [1.]
 [1.]
 [1.]
 [2.]
 [1.]
 [1.]
 [1.]
 [2.]
 [2.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [0.]
 [1.]
 [2.]
 [1.]
 [2.]
 [1.]
 [2.]
 [1.]
 [2.]
 [1.]
 [1.]
 [1.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [2.]
 [1.]
 [1.]
 [0.]
 [1.]
 [1.]
 [1.]
 [1.]
 [2.]
 [1.]
 [0.]
 [1.]
 [1.]
 [2.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [2.]
 [2.]
 [2.]
 [2.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [2.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [2.]
 [2.]
 [1.]
 [2.]
 [1.]
 [0.]
 [1.]
 [1.]
 [2.]
 [1.]
 [0.]
 [1.]
 [2.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [2.]
 [2.]
 [1.]
 [2.]
 [2.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [2.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [2.]
 [1.]
 [1.]
 [1.]
 [1.]
 [2.]
 [0.]
 [2.]
 [1.]
 [2.]
 [1.]
 [1.

In [96]:
# Initialize a Tokenizer
tokenizer = Tokenizer()
# Fit the tokenizer on your tokenized text data
tokenizer.fit_on_texts(df["tokens"])

# Get the vocabulary size
vocab_size = len(tokenizer.word_index) + 1  # Add 1 for the zero padding

# Print the vocabulary size
print("Vocabulary size (input_dim):", vocab_size)

Vocabulary size (input_dim): 318203


In [97]:
# Define the model
model = Sequential()
model.add(
    Embedding(
        input_dim=vocab_size,
        output_dim=50,
        input_length=100,
    )
)
model.add(LSTM(100))
model.add(Dense(3, activation="softmax"))

In [None]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# Compile the model
model.compile(
    loss=SparseCategoricalCrossentropy(),
    optimizer=Adam(learning_rate=0.001),
    metrics=["accuracy"],
)

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test loss: {loss}, Test accuracy: {accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10