In [2]:
#Importing everything we need for EDA. Black for linting reasons.
import os
import black
import jupyter_black
from collections import Counter
from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.svm import SVC  # Mock model for testing 
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import seaborn as sns
import xml.etree.ElementTree as ET
import pandas as pd
import nltk
import json
import fasttext
pd.options.mode.chained_assignment = None  # default='warn'
nltk.download("stopwords")
nltk.download("punkt")
jupyter_black.load(
    lab=False,
    line_length=80,
    verbosity="DEBUG",
    target_version=black.TargetVersion.PY310,
)
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Cacu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Cacu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
DEBUG:jupyter_black:config: {'line_length': 80, 'target_versions': {<TargetVersion.PY310: 10>}}


In [None]:
csv_file_path = r"C:\Users\Cacu\Desktop\Universidad\Trabajo_Final\DataSets\open-dataset-for-sentiment-analysis-master\betsentiment-ES-tweets-sentiment-worldcup.csv"


# Function to extract sentiment scores from the json str
def extract_sentiment_scores(json_str):
    sentiment_data = json.loads(json_str)
    return (
        sentiment_data["Neutral"],
        sentiment_data["Negative"],
        sentiment_data["Positive"],
        sentiment_data["Mixed"],
    )


# Define data types
dtype_dict = {
    "tweet_date_created": str,
    "tweet_id": int,
    "tweet_text": str,
    "language": str,
    "sentiment": str,
}

try:
    df = pd.read_csv(csv_file_path, encoding="utf-8", dtype=dtype_dict)
except UnicodeDecodeError:
    df = pd.read_csv(csv_file_path, encoding="latin-1", dtype=dtype_dict)

sentiment_scores_list = df["sentiment_score"].map(extract_sentiment_scores)

sentiment_scores_df = pd.DataFrame(
    sentiment_scores_list.tolist(),
    columns=["Neutral", "Negative", "Positive", "Mixed"],
)

df = pd.concat([df, sentiment_scores_df], axis=1)

df.drop("sentiment_score", axis=1, inplace=True)

In [None]:
df

In [None]:
# Read the stopwords from the .txt file and store them in a list
custom_stopwords_file = (
    r"C:\Users\Cacu\Desktop\Universidad\Trabajo_Final\utilities\stopwords.txt"
)
with open(custom_stopwords_file, "r", encoding="utf-8") as file:
    custom_stopwords_list = [line.strip() for line in file]

# Create an empty set to hold the stopwords
custom_stopwords_set = set()

# Add the stopwords from the list to the set
custom_stopwords_set.update(custom_stopwords_list)

In [None]:
# Specify the Snowball stemmer for Spanish
stemmer = SnowballStemmer("spanish")

# Convert NaN values to an empty string
df["tweet_text"] = df["tweet_text"].fillna("")

# Lowercasing
df["tweet_text"] = df["tweet_text"].str.lower()

# Removing punctuation
df["tweet_text"] = df["tweet_text"].str.replace("[^\w\s]", "")

# Tokenization
df["tokens"] = df["tweet_text"].apply(word_tokenize)

# Removing stop words
stop_words = set(stopwords.words("spanish"))
df["filtered_tokens"] = df["tokens"].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)

# Stemming
df["stemmed_tokens"] = df["filtered_tokens"].apply(
    lambda tokens: [stemmer.stem(word) for word in tokens]
)

In [None]:
df

## 1. Embedding

In [None]:
df['tokens_as_string'] = df['tokens'].apply(' '.join)
# Save the stemmed tokens to a text file (one sentence per line)
with open('tokens.txt', 'w') as f:
    f.write('\n'.join(df['tokens_as_string']))

In [None]:
# Train a FastText model on the stemmed tokens
model = fasttext.train_unsupervised('tokens.txt', model='skipgram')

# Create tweet-level embeddings using the trained model
embeddings = []

In [None]:
for tokens in df['tokens_as_string']:
    vector = model.get_sentence_vector(tokens)
    embeddings.append(vector)

# Convert the embeddings to a DataFrame
embedding_df = pd.DataFrame(embeddings)


In [None]:
# Concatenate the embeddings DataFrame with the original dataset
df_worldcup_embeddings = pd.concat([df, embedding_df], axis=1)

# Save the dataset with embeddings to a new CSV file
df_worldcup_embeddings.to_csv('2018_dataset_with_embeddings.csv', index=False)

In [None]:
df_worldcup_embeddings

In [3]:
nrows_to_load = 100000  # Adjust this to the desired subset size

# FYI: start from here - csv already generated
df = pd.read_csv("2018_dataset_with_embeddings.csv", nrows=nrows_to_load)

# Inspect the first few rows of the dataset
df

Unnamed: 0,tweet_date_created,tweet_id,tweet_text,language,sentiment,Neutral,Negative,Positive,Mixed,tokens,...,90,91,92,93,94,95,96,97,98,99
0,2018-06-17T09:47:48,-136978431,joseameadek miseleccionmx a pero si estás en todo menos en tu campaña y todavía te preguntas porque vas en tercer lugar,es,NEUTRAL,0.597391,0.205260,0.085440,0.111909,"['joseameadek', 'miseleccionmx', 'a', 'pero', 'si', 'estás', 'en', 'todo', 'menos', 'en', 'tu', 'campaña', 'y', 'todavía', 'te', 'preguntas', 'porque', 'vas', 'en', 'tercer', 'lugar']",...,0.031300,0.058071,0.086869,0.008025,-0.057736,-0.109543,-0.057895,-0.055777,-0.044339,0.010082
1,2018-06-28T06:00:16.360000,-1076408320,mariopereyradt\ntenemos que jugarle a francia con el mismo equipo httpstco15ccsymtoi \n\nsomosargentina cadena3mundial httpstcoanuzmjdkar,es,NEUTRAL,0.878757,0.058990,0.049931,0.012322,"['mariopereyradt', 'tenemos', 'que', 'jugarle', 'a', 'francia', 'con', 'el', 'mismo', 'equipo', 'httpstco15ccsymtoi', 'somosargentina', 'cadena3mundial', 'httpstcoanuzmjdkar']",...,0.025621,0.027419,0.142908,0.004100,-0.038275,-0.059434,-0.024883,-0.064664,-0.022944,0.052050
2,2018-06-07T22:07:43,1008168960,miseleccionmx no me pidas eso mi selección sí a eso voy a los estadios a sacar el fua,es,NEGATIVE,0.315568,0.556793,0.056737,0.070902,"['miseleccionmx', 'no', 'me', 'pidas', 'eso', 'mi', 'selección', 'sí', 'a', 'eso', 'voy', 'a', 'los', 'estadios', 'a', 'sacar', 'el', 'fua']",...,0.004142,0.075021,0.072423,-0.052099,-0.049917,-0.081705,-0.069447,-0.050849,-0.051615,-0.011147
3,2018-05-31T21:02:10,-1550348286,si llega a ser la despedida no será la mejor\npero casi que quiero obligar a todos a mirar 4 años atrás y ver todo lo que atrás de este tipo lograron jugadores dirigentes y todos los que forman parte de argentina \ngracias por hacernos mejores y felices personas httpstcoamtgw75rdq,es,POSITIVE,0.244512,0.005207,0.686508,0.063773,"['si', 'llega', 'a', 'ser', 'la', 'despedida', 'no', 'será', 'la', 'mejor', 'pero', 'casi', 'que', 'quiero', 'obligar', 'a', 'todos', 'a', 'mirar', '4', 'años', 'atrás', 'y', 'ver', 'todo', 'lo', 'que', 'atrás', 'de', 'este', 'tipo', 'lograron', 'jugadores', 'dirigentes', 'y', 'todos', 'los', 'que', 'forman', 'parte', 'de', 'argentina', 'gracias', 'por', 'hacernos', 'mejores', 'y', 'felices', 'personas', 'httpstcoamtgw75rdq']",...,0.020229,0.032681,0.091121,-0.013145,-0.069782,-0.078864,-0.011197,-0.059799,-0.017021,0.020561
4,2018-06-26T11:02:06,-1604923392,no se les olvide que nuestro trabajo es construir la dictadura del proletariado\nvamoscolombia,es,NEUTRAL,0.424387,0.418878,0.085919,0.070817,"['no', 'se', 'les', 'olvide', 'que', 'nuestro', 'trabajo', 'es', 'construir', 'la', 'dictadura', 'del', 'proletariado', 'vamoscolombia']",...,-0.003167,0.053939,0.091963,-0.024918,-0.045017,-0.107317,0.037208,-0.101381,0.019652,0.031471
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2018-06-23T15:04:13,-489340926,me llena de emoción escuchar a todo el estadio cantar apoyando a la miseleccionmx,es,POSITIVE,0.086356,0.004706,0.897557,0.011380,"['me', 'llena', 'de', 'emoción', 'escuchar', 'a', 'todo', 'el', 'estadio', 'cantar', 'apoyando', 'a', 'la', 'miseleccionmx']",...,0.013920,0.031754,0.061160,-0.038237,-0.031001,-0.065419,-0.076649,-0.034038,-0.026635,-0.027900
99996,2018-06-25T04:47:40,290942982,adrianamonsalve miseleccionmx canal_estrellas td_deportes wao que linda,es,NEUTRAL,0.586192,0.011301,0.381318,0.021189,"['adrianamonsalve', 'miseleccionmx', 'canal_estrellas', 'td_deportes', 'wao', 'que', 'linda']",...,-0.003515,0.066548,0.043895,-0.122540,-0.106270,-0.082331,0.011476,0.059558,-0.062486,0.017741
99997,2018-06-25T04:47:40.478000,290942982,adrianamonsalve miseleccionmx canal_estrellas td_deportes wao que linda,es,NEUTRAL,0.586192,0.011301,0.381318,0.021189,"['adrianamonsalve', 'miseleccionmx', 'canal_estrellas', 'td_deportes', 'wao', 'que', 'linda']",...,-0.003515,0.066548,0.043895,-0.122540,-0.106270,-0.082331,0.011476,0.059558,-0.062486,0.017741
99998,2018-06-17T22:25:45,1947598849,hoy más que nunca vivamexico\n\nhttpstcoeobsxn0ftv\n\nrusia2018 rusia rus mex alemaniaméxico alemaniavsmexico worldcup mexicovsalemania vamosmexico nadanosdetiene nadaesimposible felizdiadelpadre sisepudo youtubemexico youtubeespanol miseleccionmx fifaworldcup httpstcoxuun0i1myv,es,NEUTRAL,0.855919,0.095914,0.027360,0.020807,"['hoy', 'más', 'que', 'nunca', 'vivamexico', 'httpstcoeobsxn0ftv', 'rusia2018', 'rusia', 'rus', 'mex', 'alemaniaméxico', 'alemaniavsmexico', 'worldcup', 'mexicovsalemania', 'vamosmexico', 'nadanosdetiene', 'nadaesimposible', 'felizdiadelpadre', 'sisepudo', 'youtubemexico', 'youtubeespanol', 'miseleccionmx', 'fifaworldcup', 'httpstcoxuun0i1myv']",...,0.005763,0.029519,0.010164,-0.039462,-0.023197,-0.074982,0.002951,-0.050439,-0.048486,0.000843


## Preparing for training

In [4]:
print(df.isnull().sum())

# Remove rows with missing values
df.dropna(inplace=True)

# Remove duplicates
df.drop_duplicates(subset=["tweet_text"], keep="first", inplace=True)

tweet_date_created    0
tweet_id              0
tweet_text            0
language              0
sentiment             0
                     ..
95                    0
96                    0
97                    0
98                    0
99                    0
Length: 113, dtype: int64


In [5]:
df.count()

tweet_date_created    84654
tweet_id              84654
tweet_text            84654
language              84654
sentiment             84654
                      ...  
95                    84654
96                    84654
97                    84654
98                    84654
99                    84654
Length: 113, dtype: int64

In [6]:
label_mapping = {"POSITIVE": 2, "NEUTRAL": 1, "NEGATIVE": 0}
df["sentiment_encoded"] = df["sentiment"].map(label_mapping)

In [7]:
# Remove rows with missing values
df.dropna(inplace=True)

In [8]:
embedding_columns = [str(i) for i in range(1, 100)]  # Columns '1' to '100'
X = df[embedding_columns].to_numpy()
Y = df["sentiment_encoded"]

In [None]:
# 1- stratified split:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html#sklearn.model_selection.train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, df["sentiment_encoded"], test_size=0.2, random_state=42, stratify=Y
)
print("1- Split dataset done")


# 2- cross-val on trian set
model = SVC(kernel="linear", C=1)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 3- 5-fold cross-val on train set
scores = cross_val_score(model, X_train, y_train, cv=kfold, n_jobs=-1)

print("Cross-Validation Scores:", scores)

# 4: mock model to test split
final_model = SVC(kernel="linear", C=1)
final_model.fit(X_train, y_train)

test_accuracy = final_model.score(X_test, y_test)
print("Final Model Test Accuracy:", test_accuracy)

Cross-Validation Scores: [0.76753701 0.77088448 0.76701629 0.76880161 0.76826365]
