In [3]:
from datetime import datetime, timedelta

import pandas as pd
import plotly.express as px

### Load twitters

In [6]:
# Read data from 23/10/2023 to 31/10/2023
# Define the start and end dates
start_date = datetime(2023, 10, 23)
end_date = datetime(2023, 10, 31)

# Generate the dates using a list comprehension
dates = [(start_date + timedelta(days=d)).strftime("%d_%m_%Y") for d in range((end_date - start_date).days + 1)]

# Path with json files
path_json = 'data/'
columns_to_read = ["tweet_id", "full_text"]
new_column_names = {"tweet_id" : "id", "full_text": "text"}

# Initialize an empty list to store the dataframes
dfs = []

for date in dates:
  timestamp = pd.to_datetime(date, format='%d_%m_%Y')

  df_massa =  pd.read_json(path_json + f"{date}_massa.json")[columns_to_read].rename(columns=new_column_names)
  df_milei =  pd.read_json(path_json + f"{date}_milei.json")[columns_to_read].rename(columns=new_column_names)

  # Add the timestamp column to the existing DataFrame
  df_massa['timestamp'] = timestamp
  df_milei['timestamp'] = timestamp

   # Append the dataframes to the list
  dfs.append(df_massa)
  dfs.append(df_milei)


# Read data from 1/11/2023 to 18/11/2023 except 09/11/2023
# Define the start and end dates
start_date = datetime(2023, 11, 1)
end_date = datetime(2023, 11, 18)

# Generate the dates using a list comprehension
dates = [(start_date + timedelta(days=d)).strftime("%d_%m_%Y") for d in range((end_date - start_date).days + 1)]
dates = [date for date in dates if date != "09_11_2023"]

# Path with json files
columns_to_read = ["id", "text", "timestamp"]

for date in dates:
  df_massa =  pd.read_json(path_json + f"{date}_massa.json")[columns_to_read]
  df_milei =  pd.read_json(path_json + f"{date}_milei.json")[columns_to_read]

   # Append the dataframes to the list
  dfs.append(df_massa)
  dfs.append(df_milei)

# Read data from 09/11/2023
# Define the path to your JSON file
path_massa = 'data/09_11_2023_massa.json'
path_milei = 'data/09_11_2023_milei.json'

# Read data frame
columns_to_read = ["conversation_id_str", "full_text", "created_at"]
new_column_names = {"conversation_id_str" : "id", "full_text": "text", "created_at" : "timestamp"}

df_massa = pd.read_json(path_massa)
df_massa = df_massa[columns_to_read].rename(columns=new_column_names) # choose columns
df_milei = pd.read_json(path_milei)
df_milei = df_milei[columns_to_read].rename(columns=new_column_names) # choose columns
# Append the dataframes to the list
dfs.append(df_massa)
dfs.append(df_milei)


# Concatenate the dataframes vertically
df = pd.concat(dfs, ignore_index=True)
df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)

In [7]:
df.head()

Unnamed: 0,id,text,timestamp
0,1716605183940022784,Massa ya ganó\n\nAhora hay que recuperar los ...,2023-10-23 00:00:00+00:00
1,1716605147164373248,Me burlaría de los Argentinos por votar por Se...,2023-10-23 00:00:00+00:00
2,1716601976471843072,Apoyar a Sergio Massa es apoyar al Kirchnerism...,2023-10-23 00:00:00+00:00
3,1716601291114189056,"Si yo quisiera que el país colapse, diría que ...",2023-10-23 00:00:00+00:00
4,1716599848156229888,"Cuando Massa se hace el serio o el estadista, ...",2023-10-23 00:00:00+00:00


### Clean and analysis of data

In [8]:
print(f"The number of rows is {df.shape[0]}")

The number of rows is 20739


In [9]:
# Check unicity of values

if df['id'].nunique() == len(df):
    print("All values are unique.")
else:
    num_duplicates = df.duplicated().sum()
    print(f"There are duplicate {num_duplicates} values.")

There are duplicate 2312 values.


In [10]:
# Delate duplicated values
df.drop_duplicates(inplace=True)

In [11]:
# Check unicity of ids

if df['id'].nunique() == len(df):
    print("All values are unique.")
else:
    num_duplicates = df.duplicated().sum()
    print(f"There are duplicate {num_duplicates} values.")

There are duplicate 0 values.


In [12]:
print(f"The number of rows is {df.shape[0]}")

The number of rows is 18427


In [19]:
import pandas as pd
import plotly.express as px

# Extract just the date part
df['date'] = df['timestamp'].dt.date

# Count the number of tweets per day
tweet_counts = df['date'].value_counts().reset_index()
tweet_counts.columns = ['Date', 'Number of Tweets']

# Sort by date
tweet_counts = tweet_counts.sort_values('Date')

# Plotting with Plotly
fig = px.bar(tweet_counts, x='Date', y='Number of Tweets', title='Distribution of Tweets Per Day')
fig.update_xaxes(title_text='Date', tickangle=50)
fig.update_yaxes(title_text='Number of Tweets')
fig.show()


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [18]:
!pip install nbformat==4.3.0

Collecting nbformat==4.3.0
  Downloading nbformat-4.3.0-py2.py3-none-any.whl.metadata (1.1 kB)
Collecting ipython-genutils (from nbformat==4.3.0)
  Downloading ipython_genutils-0.2.0-py2.py3-none-any.whl.metadata (755 bytes)
Downloading nbformat-4.3.0-py2.py3-none-any.whl (154 kB)
   ---------------------------------------- 0.0/154.8 kB ? eta -:--:--
   ----------------------------- ---------- 112.6/154.8 kB 2.2 MB/s eta 0:00:01
   ------------------------------- -------- 122.9/154.8 kB 2.4 MB/s eta 0:00:01
   ------------------------------- -------- 122.9/154.8 kB 2.4 MB/s eta 0:00:01
   -------------------------------------  153.6/154.8 kB 833.5 kB/s eta 0:00:01
   -------------------------------------- 154.8/154.8 kB 768.9 kB/s eta 0:00:00
Downloading ipython_genutils-0.2.0-py2.py3-none-any.whl (26 kB)
Installing collected packages: ipython-genutils, nbformat
  Attempting uninstall: nbformat
    Found existing installation: nbformat 5.9.2
    Uninstalling nbformat-5.9.2:
      Succe


[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


### Preprocess text

In [None]:
!pip install spanish_nlp

In [None]:
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.8


In [None]:
from spanish_nlp import preprocess

def preprocess_text(text):
    sp = preprocess.SpanishPreprocess(
        lower=True,
        remove_url=True,
        remove_hashtags=False,
        split_hashtags=True,
        normalize_breaklines=True,
        remove_emoticons=True,
        remove_emojis=True,
        convert_emoticons=False,
        convert_emojis=False,
        normalize_inclusive_language=True,
        reduce_spam=True,
        remove_vowels_accents=True,
        remove_multiple_spaces=True,
        remove_punctuation=True,
        remove_unprintable=True,
        remove_numbers=True,
        remove_stopwords=True,
        stopwords_list="default",
        lemmatize=False,
        stem=False,
        remove_html_tags=True,
    )
    return sp.transform(text, debug=False)

# Apply the preprocessing function to create a new column
df['preprocessed_text'] = df['text'].apply(preprocess_text)


In [None]:
# Preprocess for translation

from spanish_nlp import preprocess

def preprocessed_text_translation(text):
    text = text.replace("\n", "")
    sp = preprocess.SpanishPreprocess(
        lower=True,
        remove_url=True,
        remove_hashtags=False,
        split_hashtags=True,
        normalize_breaklines=True,
        remove_emoticons=True,
        remove_emojis=True,
        convert_emoticons=False,
        convert_emojis=False,
        normalize_inclusive_language=True,
        reduce_spam=False,
        remove_vowels_accents=True,
        remove_multiple_spaces=True,
        remove_punctuation=True,
        remove_unprintable=True,
        remove_numbers=True,
        remove_stopwords=False,
        stopwords_list="default",
        lemmatize=False,
        stem=False,
        remove_html_tags=True,
    )
    return sp.transform(text, debug=False)

test_text = """𝓣𝓮𝔁𝓽𝓸 𝓭𝓮 𝓹𝓻𝓾𝓮𝓫𝓪

<b>Holaaaaaaaa a todxs </b>, este es un texto de prueba :) a continuación\n les mostraré un poema de Roberto Bolaño llamado "Los perros románticos" 🤭👀😅

https://www.poesi.as/rb9301.htm

¡Me gustan los pingüinos! Sí, los PINGÜINOS 🐧🐧🐧 🐧 #VivanLosP\ninguinos #SíSeñor #PinguinosDelMundoUníos #ÑanduesDelMundoTambién

Si colaboras con este repositorio te puedes ganar $100.000 (en dinero falso). O tal vez 20 pingüinos. Mi teléfono es +561212121212"""

print("Example:\n",preprocessed_text_translation(test_text))

# Apply the preprocessing function to create a new column
df['preprocessed_text_translation'] = df['text'].apply(preprocessed_text_translation)


Example:
 hola a todos este es un texto de prueba a continuacion los mostrare un poema de roberto bolaño llamado los perros romanticos gustan los pinguinos si los pinguinos vivan los pinguinos si señor pinguinos del mundo unios ñandues del mundo tambien si colaboras con este repositorio te puedes ganar en dinero falso o tal vez pinguinos mi telefono es


In [None]:
# Create a new column to identify if text contains "massa", "milei", or both
df['mention'] = "None"
df.loc[df['preprocessed_text'].str.contains('massa', case=False), 'mention'] = 'Massa'
df.loc[df['preprocessed_text'].str.contains('milei', case=False), 'mention'] = 'Milei'
df.loc[df['preprocessed_text'].str.contains('massa', case=False) & df['text'].str.contains('milei', case=False), 'mention'] = 'Both'

In [None]:
import pandas as pd
import plotly.express as px


# Group by date and mention, then count the occurrences
mention_counts = df.groupby(['date', 'mention']).size().reset_index(name='counts')

# Plotting with Plotly
fig = px.bar(mention_counts, x='date', y='counts', color='mention', title='Distribution of Tweets Per Day')
fig.update_xaxes(title_text='Date', tickangle=50)
fig.update_yaxes(title_text='Number of Mentions')
fig.show()


In [None]:
import plotly.express as px

# Group by 'mention' and 'date', calculate mean of 'compound' column
# Then Pivot the DataFrame for plotting
grouped_df = df.groupby(['mention', 'date']).agg({'compound_ori': 'mean'}).reset_index()
pivot_df_ori = grouped_df.pivot(index='date', columns='mention', values='compound_ori')

# Group by 'mention' and 'date', calculate mean of 'compound_trans' column
# Then pivot the DataFrame for plotting
grouped_df_trans = df.groupby(['mention', 'date']).agg({'compound_trans': 'mean'}).reset_index()
pivot_df_trans = grouped_df_trans.pivot(index='date', columns='mention', values='compound_trans')

# Create a plotly figure
fig = px.line(title='Mean Compound Score per Day for Each Mention')

# Define color sequence and line dash pattern
color_sequence = px.colors.qualitative.Plotly

# Add rectangular shapes for -0.05 and 0.05 behind the lines
fig.add_shape(type='rect', x0=pivot_df_ori.index.min(), y0=-0.05, x1=pivot_df_ori.index.max(), y1=0.05,
              line=dict(color='rgba(0,0,0,0)', width=0), fillcolor='rgba(211,211,211,0.5)', layer='below')

# Add trace for Original Text
for i, column in enumerate(pivot_df_ori.columns):
    fig.add_scatter(x=pivot_df_ori.index, y=pivot_df_ori[column], mode='lines', name=f'{column} (Original Text)',
                    line=dict(color=color_sequence[i % len(color_sequence)], dash="dash"))

# Add trace for Preprocessed Translated Text
for i, column in enumerate(pivot_df_trans.columns):
    fig.add_scatter(x=pivot_df_trans.index, y=pivot_df_trans[column], mode='lines', name=f'{column} (Preprocess Text)',
                    line=dict(color=color_sequence[i % len(color_sequence)], dash="solid"))

# Update axis labels
fig.update_xaxes(title='Date', tickangle=45)
fig.update_yaxes(title='Mean Compound Score')

# Add vertical line at x = '2023-11-13'
fig.add_shape(type='line', x0='2023-11-13', y0=pivot_df_ori.min().min(), x1='2023-11-13', y1=pivot_df_ori.max().max(),
              line=dict(color='black', width=2))

# Add annotation for November 13th
fig.add_annotation(x='2023-11-13', y=pivot_df_ori.max().max(), text="Nov 13", showarrow=True,
                   arrowhead=2, arrowcolor='black', ax=-40, ay=5, font=dict(size=13, color='black'))

# Show the plot

fig.update_layout(legend=dict(
    orientation="h", yanchor="bottom", y=-0.4, xanchor="auto", x=0.5
))
fig.show()


In [None]:
print("Preprocess data")
display(pivot_df_trans.describe())
print("Original data")
display(pivot_df_ori.describe())

Preprocess data


mention,Both,Massa,Milei
count,27.0,27.0,27.0
mean,-0.023736,-0.036038,-0.007885
std,0.063456,0.052935,0.051549
min,-0.137604,-0.162428,-0.143486
25%,-0.075144,-0.056771,-0.031804
50%,-0.016498,-0.03682,0.01223
75%,0.01322,-0.010928,0.032933
max,0.088577,0.061977,0.064896


Original data


mention,Both,Massa,Milei
count,27.0,27.0,27.0
mean,-0.016448,-0.028308,-0.003315
std,0.063126,0.053809,0.048849
min,-0.122032,-0.156004,-0.112117
25%,-0.07165,-0.052654,-0.021733
50%,-0.023499,-0.030586,0.011638
75%,0.038892,0.001912,0.032988
max,0.099028,0.064806,0.062421


Example of changes

In [None]:
df.iloc[3].text

'Si yo quisiera que el país colapse, diría que no hay que ir a votar.\n\nPero cómo tengo la ESPERANZA que AL MENOS no sea Presidente Massa, jugaré mí última carta votando a @JMilei\n\n¿Estamos todos arruinados o estamos bien?\n\nLo peor es que el kirchnerismo gobierne 4 años más. ¿Okey?'

Delate values that have no presidents

In [None]:
df["mention"].value_counts()

Milei    7722
Massa    6470
Both     4208
None       27
Name: mention, dtype: int64

In [None]:
df[df["mention"]=="None"].head()

Unnamed: 0,id,text,timestamp,date,preprocessed_text,mention
5491,1718354855423516928,Hasta acá llegué? Ok pero dame un poco de nafta,2023-10-29 00:00:00+00:00,2023-10-29,aca llegue ok dame poco nafta,
8498,1720230870584369664,Quien es? Solamente respuestas correcta:,2023-11-03 00:06:00+00:00,2023-11-03,quien es solamente respuestas correcta,
11529,1722217754718306816,"Buenos días Libertarios, hoy habla ella hoy ha...",2023-11-08 11:41:00+00:00,2023-11-08,buenos dias libertarios hoy habla hoy habla vi...,
12111,1722975739367108864,EXPLICACIÓN DE VOUCHERS,2023-11-10 13:53:00+00:00,2023-11-10,explicacion vouchers,
12694,1723361701892931584,RT hasta el infinito.,2023-11-11 15:27:00+00:00,2023-11-11,rt infinito,


In [None]:
# Delate all

df = df[df["mention"]!="None"]
print("Number of rows:", df.shape[0])

Number of rows: 18400


In [None]:
df["mention"].value_counts()

Milei    7722
Massa    6470
Both     4208
Name: mention, dtype: int64

## Sentiment analisis with VANDER


In [None]:
!pip install vader-multi

Collecting vader-multi
  Downloading vader_multi-3.2.2.1-py2.py3-none-any.whl (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.7/126.7 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting translatte (from vader-multi)
  Downloading translatte-0.1-py3-none-any.whl (14 kB)
Installing collected packages: translatte, vader-multi
Successfully installed translatte-0.1 vader-multi-3.2.2.1


In [None]:
!pip install tqdm



From before analysis the column with the message "Javier Milei: "La casta son los políticos" had a problem. We will modify it before making the hole analysis

In [None]:
target_row = df[df['text'].str.startswith('Javier Milei: "La casta son los políticos')]
target_row

Unnamed: 0,id,text,timestamp,date,preprocessed_text,mention
14839,1724209788093940224,"Javier Milei: ""La casta son los políticos chor...",2023-11-13 23:37:00+00:00,2023-11-13,javier milei casta son politicos chorros chorr...,Milei


In [None]:
df["preprocessed_text"][14839] = df["preprocessed_text"][14839] + "."



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
df["preprocessed_text"][14839]

'javier milei casta son politicos chorros chorros kirchneristas no hay.'

### Make analysis with complete data preprocess

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from tqdm import tqdm

# Initialize the sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Initialize lists to store sentiment scores
neg = []
pos = []
neu = []
comp = []

error_rows = []

# Iterate over the text in the DataFrame and perform sentiment analysis
for index, row in tqdm(df.iterrows()):
  try:
    text = row["preprocessed_text"]
    if text[-2:] == "no":
      text += "."
    scores = analyzer.polarity_scores(text)
    neg.append(scores["neg"])
    pos.append(scores["pos"])
    neu.append(scores["neu"])
    comp.append(scores["compound"])
  except:
    error_rows.append([index, row])

18400it [26:28, 11.58it/s]


In [None]:
# Print errors
print(error_rows)

[]


In [None]:
# Assign sentiment scores to new columns in the DataFrame using .loc[]
df.loc[:, 'negative'] = neg
df.loc[:, 'positive'] = pos
df.loc[:, 'neutral'] = neu
df.loc[:, 'compound'] = comp

In [None]:
df.head()

Unnamed: 0,id,text,timestamp,date,preprocessed_text,mention,negative,positive,neutral,compound
0,1716605183940022784,Massa ya ganó\n\nAhora hay que recuperar los ...,2023-10-23 00:00:00+00:00,2023-10-23,massa gano ahora hay recuperar ss publicos nac...,Massa,0.0,0.172,0.828,0.7003
1,1716605147164373248,Me burlaría de los Argentinos por votar por Se...,2023-10-23 00:00:00+00:00,2023-10-23,me burlaria argentinos votar sergio massa lueg...,Massa,0.0,0.212,0.788,0.7351
2,1716601976471843072,Apoyar a Sergio Massa es apoyar al Kirchnerism...,2023-10-23 00:00:00+00:00,2023-10-23,apoyar sergio massa es apoyar al kirchnerismo ...,Massa,0.0,0.537,0.463,0.7003
3,1716601291114189056,"Si yo quisiera que el país colapse, diría que ...",2023-10-23 00:00:00+00:00,2023-10-23,si quisiera pais colapse diria no hay ir votar...,Both,0.225,0.134,0.642,-0.7269
4,1716599848156229888,"Cuando Massa se hace el serio o el estadista, ...",2023-10-23 00:00:00+00:00,2023-10-23,cuando massa se hace serio estadista causa rep...,Massa,0.126,0.0,0.874,-0.0772


In [None]:
print("Compound in dataframe:", df["compound"][20730])
text = df["preprocessed_text"][20730] # get text
score = analyzer.polarity_scores(text) # analyse
print("Calculated:", score)

Compound in dataframe: -0.296
Calculated: {'neg': 0.268, 'neu': 0.732, 'pos': 0.0, 'compound': -0.296}


### Make analysis with preprocess data for then translation

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from tqdm import tqdm

# Initialize the sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Initialize lists to store sentiment scores
neg = []
pos = []
neu = []
comp = []

error_rows = []

# Iterate over the text in the DataFrame and perform sentiment analysis
for index, row in tqdm(df.iterrows()):
  try:
    text = row["preprocessed_text_translation"]
    if text[-2:] == "no":
      text += "."
    scores = analyzer.polarity_scores(text)
    neg.append(scores["neg"])
    pos.append(scores["pos"])
    neu.append(scores["neu"])
    comp.append(scores["compound"])
  except:
    error_rows.append([index, row])

18400it [20:10, 15.21it/s]


In [None]:
print(len(error_rows))

1


In [None]:
neg_add = neg.copy()
pos_add = pos.copy()
neu_add = neu.copy()
comp_add = comp.copy()

for idx, row in error_rows:
  print(row.preprocessed_text_translation)
  scores = analyzer.polarity_scores(row.preprocessed_text_translation + ".")
  neg_add.insert(idx, scores["neg"])
  pos_add.insert(idx, scores["pos"])
  neu_add.insert(idx, scores["neu"])
  comp_add.insert(idx, scores["compound"])

vaya a gogle y vea lo que dice massa por si o por no sergio massa es cierto que usted es la misma persona que juro no volver mas al kirchnerismo mi columna en radiomitre ledoymipalabraledoymipalabra com por si o


In [None]:
# Assign sentiment scores to new columns in the DataFrame using .loc[]
df.loc[:, 'negative_trans'] = neg_add
df.loc[:, 'positive_trans'] = pos_add
df.loc[:, 'neutral_trans'] = neu_add
df.loc[:, 'compound_trans'] = comp_add

In [None]:
# Save df
#df.to_csv('/content/drive/MyDrive/DeepL project/Data/massa_milei_data.csv', index=False)

### Analyse non preprocess data

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from tqdm import tqdm

# Initialize the sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Initialize lists to store sentiment scores
neg = []
pos = []
neu = []
comp = []

error_rows = []

# Iterate over the text in the DataFrame and perform sentiment analysis
for index, row in tqdm(df.iterrows()):
  try:
    text = row["text"]
    if text[-2:] == "no":
      text += "."
    scores = analyzer.polarity_scores(text)
    neg.append(scores["neg"])
    pos.append(scores["pos"])
    neu.append(scores["neu"])
    comp.append(scores["compound"])
  except:
    error_rows.append([index, row])

18400it [42:07,  7.28it/s]


In [None]:
print(f"Number of analysed rows: {len(comp)}\nNumber of not analysed rows: {len(error_rows)}")

Number of analysed rows: 18379
Number of not analysed rows: 21


In [None]:
# Analyse missing rows
neg_add = neg.copy()
pos_add = pos.copy()
neu_add = neu.copy()
comp_add = comp.copy()

for idx, row in error_rows:
  score = analyzer.polarity_scores(row.text + ".")
  neg_add.insert(idx, score["neg"])
  pos_add.insert(idx, score["pos"])
  neu_add.insert(idx, score["neu"])
  comp_add.insert(idx, score["compound"])

In [None]:
# Check correct assignation
number = 8648
print(comp_add[number])
analyzer.polarity_scores(df.loc[number].text + ".")

0.4522


{'neg': 0.0, 'neu': 0.802, 'pos': 0.198, 'compound': 0.4522}

In [None]:
# Assign sentiment scores to new columns in the DataFrame using .loc[] we add ori from original
df.loc[:, 'negative_ori'] = neg_add
df.loc[:, 'positive_ori'] = pos_add
df.loc[:, 'neutral_ori'] = neu_add
df.loc[:, 'compound_ori'] = comp_add

In [None]:
df.head()

Unnamed: 0,id,text,timestamp,date,preprocessed_text,mention,negative,positive,neutral,compound,negative_ori,positive_ori,neutral_ori,compound_ori
0,1716605183940022784,Massa ya ganó\n\nAhora hay que recuperar los ...,2023-10-23 00:00:00+00:00,2023-10-23,massa gano ahora hay recuperar ss publicos nac...,Massa,0.0,0.172,0.828,0.7003,0.0,0.132,0.868,0.7003
1,1716605147164373248,Me burlaría de los Argentinos por votar por Se...,2023-10-23 00:00:00+00:00,2023-10-23,me burlaria argentinos votar sergio massa lueg...,Massa,0.0,0.212,0.788,0.7351,0.066,0.17,0.764,0.5647
2,1716601976471843072,Apoyar a Sergio Massa es apoyar al Kirchnerism...,2023-10-23 00:00:00+00:00,2023-10-23,apoyar sergio massa es apoyar al kirchnerismo ...,Massa,0.0,0.537,0.463,0.7003,0.0,0.492,0.508,0.7003
3,1716601291114189056,"Si yo quisiera que el país colapse, diría que ...",2023-10-23 00:00:00+00:00,2023-10-23,si quisiera pais colapse diria no hay ir votar...,Both,0.225,0.134,0.642,-0.7269,0.149,0.176,0.676,0.2679
4,1716599848156229888,"Cuando Massa se hace el serio o el estadista, ...",2023-10-23 00:00:00+00:00,2023-10-23,cuando massa se hace serio estadista causa rep...,Massa,0.126,0.0,0.874,-0.0772,0.355,0.0,0.645,-0.6697


In [None]:
# Save df
#df.to_csv('/content/drive/MyDrive/DeepL project/Data/massa_milei_data.csv', index=False)

### Load csv

In [None]:
# Load the CSV file into a DataFrame
df = pd.read_csv('/content/drive/MyDrive/DeepL project/Data/massa_milei_data.csv')

# Display the DataFrame
df.head()

Unnamed: 0,id,text,timestamp,date,preprocessed_text,mention,negative,positive,neutral,compound,negative_ori,positive_ori,neutral_ori,compound_ori,preprocessed_text_translation,negative_trans,positive_trans,neutral_trans,compound_trans
0,1716605183940022784,Massa ya ganó\n\nAhora hay que recuperar los ...,2023-10-23 00:00:00+00:00,2023-10-23,massa gano ahora hay recuperar ss publicos nac...,Massa,0.0,0.172,0.828,0.7003,0.0,0.132,0.868,0.7003,massa ya ganoahora hay que recuperar los ss pu...,0.0,0.054,0.946,0.2732
1,1716605147164373248,Me burlaría de los Argentinos por votar por Se...,2023-10-23 00:00:00+00:00,2023-10-23,me burlaria argentinos votar sergio massa lueg...,Massa,0.0,0.212,0.788,0.7351,0.066,0.17,0.764,0.5647,me burlaria de los argentinos por votar por se...,0.066,0.17,0.764,0.5647
2,1716601976471843072,Apoyar a Sergio Massa es apoyar al Kirchnerism...,2023-10-23 00:00:00+00:00,2023-10-23,apoyar sergio massa es apoyar al kirchnerismo ...,Massa,0.0,0.537,0.463,0.7003,0.0,0.492,0.508,0.7003,apoyar a sergio massa es apoyar al kirchnerism...,0.0,0.492,0.508,0.7003
3,1716601291114189056,"Si yo quisiera que el país colapse, diría que ...",2023-10-23 00:00:00+00:00,2023-10-23,si quisiera pais colapse diria no hay ir votar...,Both,0.225,0.134,0.642,-0.7269,0.149,0.176,0.676,0.2679,si yo quisiera que el pais colapse diria que n...,0.163,0.125,0.712,-0.5789
4,1716599848156229888,"Cuando Massa se hace el serio o el estadista, ...",2023-10-23 00:00:00+00:00,2023-10-23,cuando massa se hace serio estadista causa rep...,Massa,0.126,0.0,0.874,-0.0772,0.355,0.0,0.645,-0.6697,cuando massa se hace el serio o el estadista c...,0.355,0.0,0.645,-0.6697


### Results

In [None]:
# Count the frequency of each category in mention column
category_counts = df['mention'].value_counts()

# Create a pie chart
fig = px.pie(names=category_counts.index, values=category_counts.values, title='Pie Chart of Categories')
fig.show()

In [None]:
# Group by 'mention' and 'date', calculate mean of 'compound' column
grouped_df = df.groupby(['mention', 'date']).agg({'compound': 'mean'}).reset_index()

# Pivot the DataFrame for plotting
pivot_df = grouped_df.pivot(index='date', columns='mention', values='compound')


# Assuming pivot_df is your DataFrame obtained from pivoting
fig = px.line(pivot_df, x=pivot_df.index, y=pivot_df.columns, title='Mean Compound Score per Day for Each Mention (Preprocess Text)', markers=True)
fig.update_xaxes(title='Date', tickangle=45)
fig.update_yaxes(title='Mean Compound Score')
fig.show()

In [None]:
import plotly.express as px

# Group by 'mention' and 'date', calculate mean of 'compound' column
# Then Pivot the DataFrame for plotting
grouped_df = df.groupby(['mention', 'date']).agg({'compound': 'mean'}).reset_index()
pivot_df = grouped_df.pivot(index='date', columns='mention', values='compound')

# Group by 'mention' and 'date', calculate mean of 'compound_trans' column
# Then pivot the DataFrame for plotting
grouped_df = df.groupby(['mention', 'date']).agg({'compound_ori': 'mean'}).reset_index()
pivot_df_ori = grouped_df.pivot(index='date', columns='mention', values='compound_ori')

# Create a plotly figure
fig = px.line(title='Mean Compound Score per Day for Each Mention')

# Define color sequence and line dash pattern
color_sequence = px.colors.qualitative.Plotly
line_dash_sequence = ['solid', 'dot', 'dash', 'longdash', 'dashdot']

# Add trace for Preprocess Text
for i, column in enumerate(pivot_df.columns):
    fig.add_scatter(x=pivot_df.index, y=pivot_df[column], mode='lines', name=f'{column} (Preprocess Text)',
                    line=dict(color=color_sequence[i % len(color_sequence)], dash="solid"))

# Add trace for Original Text
for i, column in enumerate(pivot_df_ori.columns):
    fig.add_scatter(x=pivot_df_ori.index, y=pivot_df_ori[column], mode='lines', name=f'{column} (Original Text)',
                    line=dict(color=color_sequence[i % len(color_sequence)], dash="dash"))

# Update axis labels
fig.update_xaxes(title='Date', tickangle=45)
fig.update_yaxes(title='Mean Compound Score')

# Show the plot
fig.show()


In [None]:
import plotly.express as px

# Group by 'mention' and 'date', calculate mean of 'compound' column
# Then Pivot the DataFrame for plotting
grouped_df = df.groupby(['mention', 'date']).agg({'compound': 'mean'}).reset_index()
pivot_df = grouped_df.pivot(index='date', columns='mention', values='compound')

# Group by 'mention' and 'date', calculate mean of 'compound_trans' column
# Then pivot the DataFrame for plotting
grouped_df = df.groupby(['mention', 'date']).agg({'compound_trans': 'mean'}).reset_index()
pivot_df_trans = grouped_df.pivot(index='date', columns='mention', values='compound_trans')

# Create a plotly figure
fig = px.line(title='Mean Compound Score per Day for Each Mention')

# Define color sequence and line dash pattern
color_sequence = px.colors.qualitative.Plotly
line_dash_sequence = ['solid', 'dot', 'dash', 'longdash', 'dashdot']

# Add trace for Preprocess Text
for i, column in enumerate(pivot_df.columns):
    fig.add_scatter(x=pivot_df.index, y=pivot_df[column], mode='lines', name=f'{column} (Preprocess Text)',
                    line=dict(color=color_sequence[i % len(color_sequence)], dash="solid"))

# Add trace for Original Text
for i, column in enumerate(pivot_df_trans.columns):
    fig.add_scatter(x=pivot_df_trans.index, y=pivot_df_trans[column], mode='lines', name=f'{column} (Pre-Translated Text)',
                    line=dict(color=color_sequence[i % len(color_sequence)], dash="dash"))

# Update axis labels
fig.update_xaxes(title='Date', tickangle=45)
fig.update_yaxes(title='Mean Compound Score')

# Show the plot
fig.show()


In [None]:
import plotly.express as px

# Group by 'mention' and 'date', calculate mean of 'compound' column
# Then Pivot the DataFrame for plotting
grouped_df = df.groupby(['mention', 'date']).agg({'compound_ori': 'mean'}).reset_index()
pivot_df_ori = grouped_df.pivot(index='date', columns='mention', values='compound_ori')

# Group by 'mention' and 'date', calculate mean of 'compound_trans' column
# Then pivot the DataFrame for plotting
grouped_df = df.groupby(['mention', 'date']).agg({'compound_trans': 'mean'}).reset_index()

pivot_df_trans = grouped_df.pivot(index='date', columns='mention', values='compound_trans')

# Create a plotly figure
fig = px.line(title='Mean Compound Score per Day for Each Mention')

# Define color sequence and line dash pattern
color_sequence = px.colors.qualitative.Plotly
line_dash_sequence = ['solid', 'dot', 'dash', 'longdash', 'dashdot']

# Add trace for Original Text
for i, column in enumerate(pivot_df.columns):
    fig.add_scatter(x=pivot_df_ori.index, y=pivot_df_ori[column], mode='lines', name=f'{column} (Original Text)',
                    line=dict(color=color_sequence[i % len(color_sequence)], dash="solid"))

# Add trace for Preproces for translating Text
for i, column in enumerate(pivot_df_trans.columns):
    fig.add_scatter(x=pivot_df_trans.index, y=pivot_df_trans[column], mode='lines', name=f'{column} (Pre-Translated Text)',
                    line=dict(color=color_sequence[i % len(color_sequence)], dash="dash"))

# Update axis labels
fig.update_xaxes(title='Date', tickangle=45)
fig.update_yaxes(title='Mean Compound Score')

# Show the plot
fig.show()


In [None]:
import plotly.express as px

# Group by 'mention' and 'date', calculate mean of 'compound' column
# Then Pivot the DataFrame for plotting
grouped_df = df.groupby(['mention', 'date']).agg({'compound_ori': 'mean'}).reset_index()
pivot_df_ori = grouped_df.pivot(index='date', columns='mention', values='compound_ori')

# Group by 'mention' and 'date', calculate mean of 'compound_trans' column
# Then pivot the DataFrame for plotting
grouped_df_trans = df.groupby(['mention', 'date']).agg({'compound_trans': 'mean'}).reset_index()
pivot_df_trans = grouped_df_trans.pivot(index='date', columns='mention', values='compound_trans')

# Create a plotly figure
fig = px.line(title='Mean Compound Score per Day for Each Mention')

# Define color sequence and line dash pattern
color_sequence = px.colors.qualitative.Plotly

# Add rectangular shapes for -0.05 and 0.05 behind the lines
fig.add_shape(type='rect', x0=pivot_df_ori.index.min(), y0=-0.05, x1=pivot_df_ori.index.max(), y1=0.05,
              line=dict(color='rgba(0,0,0,0)', width=0), fillcolor='rgba(211,211,211,0.5)', layer='below')

# Add trace for Original Text
for i, column in enumerate(pivot_df_ori.columns):
    fig.add_scatter(x=pivot_df_ori.index, y=pivot_df_ori[column], mode='lines', name=f'{column} (Original Text)',
                    line=dict(color=color_sequence[i % len(color_sequence)], dash="dash"))

# Add trace for Preprocessed Translated Text
for i, column in enumerate(pivot_df_trans.columns):
    fig.add_scatter(x=pivot_df_trans.index, y=pivot_df_trans[column], mode='lines', name=f'{column} (Preprocess Text)',
                    line=dict(color=color_sequence[i % len(color_sequence)], dash="solid"))

# Update axis labels
fig.update_xaxes(title='Date', tickangle=45)
fig.update_yaxes(title='Mean Compound Score')

# Add vertical line at x = '2023-11-13'
fig.add_shape(type='line', x0='2023-11-13', y0=pivot_df_ori.min().min(), x1='2023-11-13', y1=pivot_df_ori.max().max(),
              line=dict(color='black', width=2))

# Add annotation for November 13th
fig.add_annotation(x='2023-11-13', y=pivot_df_ori.max().max(), text="Nov 13", showarrow=True,
                   arrowhead=2, arrowcolor='black', ax=-50, ay=20, font=dict(size=13, color='black'),
                   xanchor='left')

# Move legend to bottom and center
fig.update_layout(legend=dict(orientation="h", yanchor="bottom", y=-0.4, xanchor="center", x=0.5))

# Show the plot
fig.show()


## Create a Random Sample to label

In [None]:
# Set the random seed
seed = 42

# Take a random sample of 500 rows with the specified seed
random_sample = df.sample(n=150, random_state=seed)

# Add a new column at the third position
random_sample.insert(2, 'label', value=0)

# Export the DataFrame to a CSV file
#random_sample.to_csv('/content/drive/MyDrive/DeepL project/Data/random_sample.csv', index=False)

random_sample.head(2)

Unnamed: 0,id,text,label,timestamp,date,preprocessed_text,mention,negative,positive,neutral,compound,negative_ori,positive_ori,neutral_ori,compound_ori,preprocessed_text_translation,negative_trans,positive_trans,neutral_trans,compound_trans
13420,1724199231026602240,ESCHUCHEN COMO ESTE PIBE QUE VOTA A MILEI ATIE...,0,2023-11-13 22:55:00+00:00,2023-11-13,eschuchen pibe vota milei atiende al cronista ...,Milei,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,eschuchen como este pibe que vota a milei atie...,0.0,0.0,1.0,0.0
2360,1717226371666125312,"Pobre Viviana Canosa de amar a Milei, a odiar ...",0,2023-10-25 00:00:00+00:00,2023-10-25,pobre viviana canosa amar milei odiar milei am...,Milei,0.25,0.309,0.441,0.3818,0.199,0.242,0.559,0.34,pobre viviana canosa de amar a milei a odiar a...,0.205,0.25,0.545,0.34


In [None]:
# Once label, read the label
google_sheets_url = "/content/drive/MyDrive/DeepL project/Data/random_sample_label.csv"
random_sample_df = pd.read_csv(google_sheets_url)
random_sample_df.head(2)

Unnamed: 0,id,text,label,timestamp,date,preprocessed_text,mention,negative,positive,neutral,compound
0,1724199231026602240,ESCHUCHEN COMO ESTE PIBE QUE VOTA A MILEI ATIE...,0,2023-11-13 22:55:00+00:00,2023-11-13,eschuchen pibe vota milei atiende al cronista ...,Milei,0.0,0.0,1.0,0.0
1,1717226371666125312,"Pobre Viviana Canosa de amar a Milei, a odiar ...",-1,2023-10-25 00:00:00+00:00,2023-10-25,pobre viviana canosa amar milei odiar milei am...,Milei,0.25,309.0,441.0,3.818


In [None]:
# Copy labels given before (this because google sheets change numbers format)
random_sample_label = random_sample.copy()
random_sample_label["label"] = random_sample_df["label"].values.copy()
random_sample_label.head(2)

Unnamed: 0,id,text,label,timestamp,date,preprocessed_text,mention,negative,positive,neutral,compound,negative_ori,positive_ori,neutral_ori,compound_ori,preprocessed_text_translation,negative_trans,positive_trans,neutral_trans,compound_trans
13420,1724199231026602240,ESCHUCHEN COMO ESTE PIBE QUE VOTA A MILEI ATIE...,0,2023-11-13 22:55:00+00:00,2023-11-13,eschuchen pibe vota milei atiende al cronista ...,Milei,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,eschuchen como este pibe que vota a milei atie...,0.0,0.0,1.0,0.0
2360,1717226371666125312,"Pobre Viviana Canosa de amar a Milei, a odiar ...",-1,2023-10-25 00:00:00+00:00,2023-10-25,pobre viviana canosa amar milei odiar milei am...,Milei,0.25,0.309,0.441,0.3818,0.199,0.242,0.559,0.34,pobre viviana canosa de amar a milei a odiar a...,0.205,0.25,0.545,0.34


In [None]:
random_sample_label["label"].value_counts()

 0    69
-1    59
 1    22
Name: label, dtype: int64

In [None]:
# compare performance
total_neg = 59
total_pos = 22
total_neu = 69

neg_count = 0
pos_count = 0
neu_count = 0

neg_correct = 0
pos_correct = 0
neu_correct = 0

threshold = 0.05

for index, row in random_sample_label.iterrows():
  if  row["compound_trans"] <= -0.05:
    neg_count += 1
    if row["label"] == -1:
      neg_correct += 1

  elif row["compound_trans"] >= 0.05:
    pos_count += 1
    if row["label"] == 1:
      pos_correct += 1

  else:
    neu_count += 1
    if row["label"] == 0:
      neu_correct += 1


print(f"Precision of neg: {neg_correct/neg_count:.3f}")
print(f"Precision of pos: {pos_correct/pos_count:.3f}")
print(f"Precision of neu: {neu_correct/neu_count:.3f}")
print(f"Mean precision {(neu_correct/neu_count + pos_correct/pos_count + neg_correct/neg_count)/3:.3f}\n")

print(f"Recall of neg: {neg_correct/total_neg:.3f}")
print(f"Recall of pos: {pos_correct/total_pos:.3f}")
print(f"Recall of neu: {neu_correct/total_neu:.3f}")
print(f"Mean recall {(neu_correct/total_neu + pos_correct/total_pos + neg_correct/total_neg)/3:.3f}\n")

Precision of neg: 0.587
Precision of pos: 0.269
Precision of neu: 0.826
Mean precision 0.561

Recall of neg: 0.746
Recall of pos: 0.636
Recall of neu: 0.275
Mean recall 0.552



In [None]:
# compare performance
total_neg = 59
total_pos = 22
total_neu = 69

neg_count = 0
pos_count = 0
neu_count = 0

neg_correct = 0
pos_correct = 0
neu_correct = 0

threshold = 0.05

for index, row in random_sample_label.iterrows():
  if  row["compound_ori"] <= -0.475:
    neg_count += 1
    if row["label"] == -1:
      neg_correct += 1

  elif row["compound_ori"] >= 0.737:
    pos_count += 1
    if row["label"] == 1:
      pos_correct += 1

  else:
    neu_count += 1
    if row["label"] == 0:
      neu_correct += 1


print(f"Precision of neg: {neg_correct/neg_count:.3f}")
print(f"Precision of pos: {pos_correct/pos_count:.3f}")
print(f"Precision of neu: {neu_correct/neu_count:.3f}")
print(f"Mean precision {(neu_correct/neu_count + pos_correct/pos_count + neg_correct/neg_count)/3:.3f}\n")

print(f"Recall of neg: {neg_correct/total_neg:.3f}")
print(f"Recall of pos: {pos_correct/total_pos:.3f}")
print(f"Recall of neu: {neu_correct/total_neu:.3f}")
print(f"Mean recall {(neu_correct/total_neu + pos_correct/total_pos + neg_correct/total_neg)/3:.3f}\n")

Precision of neg: 0.727
Precision of pos: 0.500
Precision of neu: 0.602
Mean precision 0.610

Recall of neg: 0.542
Recall of pos: 0.409
Recall of neu: 0.768
Mean recall 0.573



In [None]:
import numpy as np
# compare performance
total_neg = 59
total_pos = 22
total_neu = 69

best_threshold_first = 0
best_threshold_second = 0
best_precision = 0
best_recall = 0

threshold_values = np.linspace(0.05, 1, num=30)

# for threshold_first in [-0.05, -0.1, -0.155, -0.16, -0.225, -0.4]:  # Varying threshold values for first compound
#     for threshold_second in [0.05, 0.13, 0.15, 0.17, 0.25, 0.7, 0.8]:  # Varying threshold values for second compound
for threshold_first in -threshold_values:  # Varying threshold values for first compound
    for threshold_second in threshold_values:  # Varying threshold values for second compound
        neg_count = 0
        pos_count = 0
        neu_count = 0

        neg_correct = 0
        pos_correct = 0
        neu_correct = 0

        for index, row in random_sample_label.iterrows():
            if row["compound_ori"] <= threshold_first:  # Varying threshold for first compound
                neg_count += 1
                if row["label"] == -1:
                    neg_correct += 1
            elif row["compound_ori"] >= threshold_second:  # Varying threshold for second compound
                pos_count += 1
                if row["label"] == 1:
                    pos_correct += 1
            else:
                neu_count += 1
                if row["label"] == 0:
                    neu_correct += 1

        precision_neg = neg_correct / neg_count if neg_count > 0 else 0
        precision_pos = pos_correct / pos_count if pos_count > 0 else 0
        precision_neu = neu_correct / neu_count if neu_count > 0 else 0

        recall_neg = neg_correct / total_neg
        recall_pos = pos_correct / total_pos
        recall_neu = neu_correct / total_neu

        # Calculate F1-score
        f1_neg = 2 * (precision_neg * recall_neg) / (precision_neg + recall_neg) if (precision_neg + recall_neg) > 0 else 0
        f1_pos = 2 * (precision_pos * recall_pos) / (precision_pos + recall_pos) if (precision_pos + recall_pos) > 0 else 0
        f1_neu = 2 * (precision_neu * recall_neu) / (precision_neu + recall_neu) if (precision_neu + recall_neu) > 0 else 0

        # Taking the average of F1-scores across all classes as the performance metric
        performance_metric = (f1_neg + f1_pos + f1_neu) / 3

        # Update best thresholds if the current performance is better
        if performance_metric > best_precision:
            best_precision = performance_metric
            best_threshold_first = threshold_first
            best_threshold_second = threshold_second
            best_recall = (recall_neg + recall_pos + recall_neu) / 3

print(f"Best threshold for the first compound: {best_threshold_first}")
print(f"Best threshold for the second compound: {best_threshold_second}")
print(f"Best precision: {best_precision:.3f}")
print(f"Best recall: {best_recall:.3f}")


Best threshold for the first compound: -0.4758620689655172
Best threshold for the second compound: 0.7379310344827585
Best precision: 0.582
Best recall: 0.573


In [None]:
# compare performance
total_neg = 59
total_pos = 22
total_neu = 69

neg_count = 0
pos_count = 0
neu_count = 0

neg_correct = 0
pos_correct = 0
neu_correct = 0

for index, row in random_sample_label.iterrows():
  if  row["negative"] > 0.15:
    if row["negative"] - row["positive"] > 0.01:
      neg_count += 1
      if row["label"] == -1:
        neg_correct += 1

  elif row["positive"]>0.05:
    if row["positive"] - row["negative"] > 0.01:
      pos_count += 1
      if row["label"] == 1:
        pos_correct += 1

  else:
    neu_count += 1
    if row["label"] == 0:
      neu_correct += 1


print(f"Precision of neg: {neg_correct/neg_count:.3f}")
print(f"Precision of pos: {pos_correct/pos_count:.3f}")
print(f"Precision of neu: {neu_correct/neu_count:.3f}\n")

print(f"Recall of neg: {neg_correct/total_neg:.3f}")
print(f"Recall of pos: {pos_correct/total_pos:.3f}")
print(f"Recall of neu: {neu_correct/total_neu:.3f}")

Precision of neg: 0.686
Precision of pos: 0.327
Precision of neu: 0.700

Recall of neg: 0.593
Recall of pos: 0.727
Recall of neu: 0.304


In [None]:
# compare performance
total_neg = 59
total_pos = 22
total_neu = 69

neg_count = 0
pos_count = 0
neu_count = 0

neg_correct = 0
pos_correct = 0
neu_correct = 0

threshold = 0.05
count = 0

for index, row in random_sample_label.iterrows():
  if  row["compound_trans"] <= -0.001 and row["neutral_trans"]>0.3:
    neg_count += 1
    if row["label"] == -1:
      neg_correct += 1
      # if count == 8:
      #   print(row["text"])
      #   print(row["compound_trans"], row["positive_trans"], row["neutral_trans"], row["negative_trans"])
      #   print(row["label"])
      #   break
      # count += 1
    else:
      if count == 7:
        print(row["text"])
        print(row["compound_trans"], row["positive_trans"], row["neutral_trans"], row["negative_trans"])
        print(row["label"])
        break
      count += 1

  elif row["compound_trans"] >= 0.8:
    pos_count += 1
    if row["label"] == 1:
      pos_correct += 1

  else:
    neu_count += 1
    if row["label"] == 0:
      neu_correct += 1


print(f"Precision of neg: {neg_correct/neg_count:.3f}")
print(f"Precision of pos: {pos_correct/pos_count:.3f}")
print(f"Precision of neu: {neu_correct/neu_count:.3f}\n")

print(f"Recall of neg: {neg_correct/total_neg:.3f}")
print(f"Recall of pos: {pos_correct/total_pos:.3f}")
print(f"Recall of neu: {neu_correct/total_neu:.3f}")

LA GENTE MUERTA DE HAMBRE Y EL CARTEL DE MASSA DE FONDO: ESTE VIDEO LE VA A HACER PERDER LA ELECCIÓN 😡😡😡

RT RT RT RT
-0.6705 0.0 0.8 0.2
0
Precision of neg: 0.680
Precision of pos: 0.500
Precision of neu: 0.500

Recall of neg: 0.288
Recall of pos: 0.045
Recall of neu: 0.145


### Analyse postive, negative and neutral languages

In [None]:
def classify_text(row):
  if  row["compound"] <= -0.148:
    return "negative"

  elif row["compound"] >= 0.475:
    return "positive"

  else:
    return "neutral"

df["label"] = df.apply(classify_text, axis=1)

In [None]:
df[["mention", "date", "label"]].head()

Unnamed: 0,mention,date,label
0,Massa,2023-10-23,positive
1,Massa,2023-10-23,positive
2,Massa,2023-10-23,positive
3,Both,2023-10-23,negative
4,Massa,2023-10-23,neutral


In [None]:
# Group by 'mention', 'date', and 'label' and count the occurrences
grouped_df = df.groupby(['mention', 'date', 'label']).size().reset_index(name='count')

# Plot using Plotly
fig = px.bar(grouped_df, x='date', y='count', color='label', barmode='group', facet_col='mention',
             category_orders={'label': ['positive', 'negative', 'neutral']})

# Update layout
fig.update_layout(title='Number of labels per type and mention per day',
                  xaxis_title='Date', yaxis_title='Count',
                  xaxis=dict(type='category', tickformat='%Y-%m-%d'))

# Show the plot
fig.show()

In [None]:
# Group by 'mention', 'date', and 'label' and count the occurrences
grouped_df = df.groupby(['mention', 'date', 'label']).size().reset_index(name='count')

# Calculate proportions
grouped_df['proportion'] = grouped_df.groupby(['mention', 'date'])['count'].transform(lambda x: x / x.sum())

# Plot using Plotly
fig = px.bar(grouped_df, x='date', y='proportion', color='label', barmode='group', facet_col='mention',
             category_orders={'label': ['positive', 'negative', 'neutral']})

# Update layout
fig.update_layout(title='Proportion of labels per type and mention per day',
                  xaxis_title='Date', yaxis_title='Proportion',
                  xaxis=dict(type='category', tickformat='%Y-%m-%d'))

# Show the plot
fig.show()

In [None]:
import plotly.express as px

# Group by 'mention', 'date', and 'label' and count the occurrences
grouped_df = df.groupby(['mention', 'date', 'label']).size().reset_index(name='count')

# Calculate proportions
grouped_df['proportion'] = grouped_df.groupby(['mention', 'date'])['count'].transform(lambda x: x / x.sum())

# Plot using Plotly
fig = px.line(grouped_df, x='date', y='proportion', color='label', facet_row='mention',
              category_orders={'label': ['positive', 'negative', 'neutral']},
              labels={'date': 'Date', 'proportion': 'Proportion'})

# Add scatter points
fig.update_traces(mode='markers+lines', marker=dict(size=8))

# Update layout
fig.update_layout(title='Proportion of labels per type and mention per day',
                  xaxis_title='Date', yaxis_title='Proportion')

# Show the plot
fig.show()


In [None]:
import plotly.express as px

# Group by 'mention', 'date', and 'label' and count the occurrences
grouped_df = df.groupby(['mention', 'date', 'label']).size().reset_index(name='count')

# Calculate proportions
grouped_df['proportion'] = grouped_df.groupby(['mention', 'date'])['count'].transform(lambda x: x / x.sum())

# Define color mapping
color_map = {'positive': 'green', 'negative': 'red', 'neutral': 'blue'}

# Plot using Plotly
fig = px.line(grouped_df, x='date', y='proportion', color='label', facet_row='mention',
              category_orders={'label': ['positive', 'negative', 'neutral']},
              color_discrete_map=color_map,
              labels={'date': 'Date', 'proportion': 'Proportion'})

# Add scatter points
fig.update_traces(mode='markers+lines', marker=dict(size=8))

# Update layout
fig.update_layout(title='Proportion of labels per type and mention per day',
                  xaxis_title='Date', yaxis_title='Proportion')

# Show the plot
fig.show()


### Try with Bert

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("Hate-speech-CNERG/dehatebert-mono-spanish")
model = AutoModelForSequenceClassification.from_pretrained("Hate-speech-CNERG/dehatebert-mono-spanish")

In [None]:
import numpy as np
import torch

text = "Eres una persona muy buena" #df.loc[0].text
print(text)
inputs = tokenizer(text, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs)

print(logits)

predicted_class_id = logits.logits.argmax().item()
print(logits.logits.sum())
label = "HATE" if logits.logits.sum() < 0 else "NON_HATE"
print(label)

model.config.id2label[predicted_class_id]

Eres una persona muy buena
SequenceClassifierOutput(loss=None, logits=tensor([[ 1.4009, -1.4690]]), hidden_states=None, attentions=None)
tensor(-0.0681)
HATE


'NON_HATE'

In [None]:
logits.logits

tensor([[ 0.6491, -0.8423]])

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="Hate-speech-CNERG/dehatebert-mono-spanish")

In [None]:
text = df.loc[5].text
print(text)

pipe("Te quiero")

Quieren campaña del miedo? 
Si gana Massa somos Venezuela.


[{'label': 'HATE', 'score': 0.5551392436027527}]

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="jorgeortizfuentes/spanish_hate_speech")

text = df.loc[5].text
print(text)

pipe(text)[0]["label"]

Quieren campaña del miedo? 
Si gana Massa somos Venezuela.


'no_hate'

In [None]:
# Load label sample
# Set the random seed
seed = 42

# Take a random sample of 500 rows with the specified seed
random_sample = df.sample(n=150, random_state=seed)

# Add a new column at the third position
random_sample.insert(2, 'label', value=0)

# Once label, read the label
google_sheets_url = "/content/drive/MyDrive/DeepL project/Data/random_sample_label.csv"
random_sample_df = pd.read_csv(google_sheets_url)
random_sample_df.head(2)

# Copy labels given before (this because google sheets change numbers format)
random_sample_label = random_sample.copy()
random_sample_label["label"] = random_sample_df["label"].values.copy()
random_sample_label.head(2)

Unnamed: 0,id,text,label,timestamp,date,preprocessed_text,mention,negative,positive,neutral,compound,negative_ori,positive_ori,neutral_ori,compound_ori
13420,1724199231026602240,ESCHUCHEN COMO ESTE PIBE QUE VOTA A MILEI ATIE...,0,2023-11-13 22:55:00+00:00,2023-11-13,eschuchen pibe vota milei atiende al cronista ...,Milei,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2360,1717226371666125312,"Pobre Viviana Canosa de amar a Milei, a odiar ...",-1,2023-10-25 00:00:00+00:00,2023-10-25,pobre viviana canosa amar milei odiar milei am...,Milei,0.25,0.309,0.441,0.3818,0.199,0.242,0.559,0.34


In [None]:
# compare performance
total_neg = 59
total_pos = 22
total_neu = 69

neg_count = 0
pos_count = 0
neu_count = 0

neg_correct = 0
pos_correct = 0
neu_correct = 0

for index, row in random_sample_label.iterrows():
  pred_label = pipe(row.text)[0]["label"]
  if pred_label == 'hate':
    neg_count += 1
    if row["label"] == -1:
      neg_correct += 1

  if pred_label == 'no_hate':
    pos_count += 1
    if row["label"] == 1:
      pos_correct += 1

  if row["neutral"]>0.4:
    neu_count += 1
    if row["label"] == 0:
      neu_correct += 1


print(f"Precision of neg: {neg_correct/neg_count:.3f}")
print(f"Precision of pos: {pos_correct/pos_count:.3f}")
print(f"Precision of neu: {neu_correct/neu_count:.3f}\n")

print(f"Recall of neg: {neg_correct/total_neg:.3f}")
print(f"Recall of pos: {pos_correct/total_pos:.3f}")
print(f"Recall of neu: {neu_correct/total_neu:.3f}")

ZeroDivisionError: division by zero

In [None]:
print(neg_correct, pos_correct)

0 22


In [None]:
!pip install tweetnlp



In [None]:
import tweetnlp
model = tweetnlp.Classifier("cardiffnlp/twitter-xlm-roberta-base-hate-spanish")
model.predict('Ismael es egocentrico porque se vuelve loca si le dicen que tiene el pelo bonito😂😂😂😂 eso se define con otro objetivo #FirstDates251')

Downloading config.json:   0%|          | 0.00/938 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading sentencepiece.bpe.model:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

{'label': 'NOT-HATE'}

In [None]:
model2 = tweetnlp.SentimentMultilingual()  # Or `model = tweetnlp.SentimentMultilingual()`
model2.sentiment("天気が良いとやっぱり気持ち良いなあ✨")  # Or `model.predict`

AttributeError: module 'tweetnlp' has no attribute 'SentimentMultilingual'