# Sesgos de tweets con/sin tildes

In [3]:
import pandas as pd
import numpy as np

In [4]:
if 'google.colab' in str(get_ipython()):
  from google.colab import drive
  drive.mount('/content/drive')
  tweets = pd.read_csv('/content/drive/Shareddrives/ETICA/TweetMod_csv/tweet_mods_accents.csv')
else:
  tweets = pd.read_csv('../data/tweet_mods_accents.csv')

## Análisis de todos los datos

In [5]:
tweets.head()

Unnamed: 0,twitter_id,original_text,slug,party_slug,toxic_original,very_toxic_original,toxic_noaccent,very_toxic_noaccent,most_toxic_grammar,most_verytoxic_grammar
0,1344934218145669120,Llegará un momento que los miembros de este Go...,juan-luis-steegmann-olmedillas,vox,0.995289,0.996132,0.996114,0.993316,sin tildes,con tildes
1,1344940715072221184,La frase “ Falsa sensación de seguridad” viene...,juan-luis-steegmann-olmedillas,vox,0.987947,0.056416,0.986203,0.040903,con tildes,con tildes
2,1344945712128266240,El Comisario político del Gobierno no descansa...,macarena-montesinos-de-miguel,partido-popular,0.995368,0.995748,0.996026,0.992559,sin tildes,con tildes
3,1344962035277320193,Desde @CiudadanosCs presentamos una enmienda p...,,,0.016391,0.000838,0.017184,0.000819,sin tildes,con tildes
4,1344967124264116230,Y que mejor forma de dar la bienvenida al 2021...,maria-angeles-marra-dominguez,psoe,0.004446,0.002959,0.004472,0.00284,sin tildes,con tildes


In [None]:
print('Con tildes toxic: ', np.mean(tweets['toxic_original']))
print('Con tildes very toxic: ', np.mean(tweets['very_toxic_original']))
print('Sin tildes toxic: ', np.mean(tweets['toxic_noaccent']))
print('Sin tildes very toxic: ', np.mean(tweets['very_toxic_noaccent']))
#Parece que en cuanto al valor "toxic", son más tóxicos tweets sin tildes, pero vemos el caso contrario con "very toxic". 
#Aun así, parece que las medias están bastante igualadas.

Con tildes toxic:  0.29571437127105915
Con tildes very toxic:  0.08084586595985283
Sin tildes toxic:  0.3086192665362216
Sin tildes very toxic:  0.0808059988280046


In [None]:
#Aquí calculamos las veces que un tweet es más tóxico con tildes que sin tildes y viceversa
print('Toxic con tildes', np.count_nonzero(tweets[tweets['most_toxic_grammar'] == 'con tildes']))
print('Toxic sin tildes', np.count_nonzero(tweets[tweets['most_toxic_grammar'] == 'sin tildes']))
print('Very toxic con tildes', np.count_nonzero(tweets[tweets['most_verytoxic_grammar'] == 'con tildes']))
print('Very toxic sin tildes', np.count_nonzero(tweets[tweets['most_verytoxic_grammar'] == 'sin tildes']))

Toxic con tildes 46490
Toxic sin tildes 73980
Very toxic con tildes 76300
Very toxic sin tildes 44180


## Análisis de datos acotados
Ahora vamos a analizar tweets donde la ausencia de tildes cambia de manera significante la toxicidad. Solo miraremos tweets donde la diferencia sea superior a 10.

In [None]:
column_names = []
for i in tweets:
  column_names.append(i)
tweets_filtered = pd.DataFrame(columns = column_names)

In [None]:
for idx, row in tweets.iterrows():
  if abs(row.toxic_original - row.toxic_noaccent) > 0.10:
    tweets_filtered = tweets_filtered.append(row)

In [None]:
tweets_filtered

Unnamed: 0,twitter_id,original_text,slug,party_slug,toxic_original,very_toxic_original,toxic_noaccent,very_toxic_noaccent,most_toxic_grammar,most_verytoxic_grammar
5,1344978091807035392,Un año de oposición: a la peor pandemia PP y V...,maria-angeles-marra-dominguez,psoe,0.525792,0.001039,0.918449,0.003510,sin tildes,sin tildes
14,1345133701643051009,Seguramente no esté ni un 10% de acuerdo con l...,maria-gloria-elizo-serrano,podemos,0.005645,0.002194,0.138315,0.000953,sin tildes,con tildes
24,1345462766212165632,La rapidez del contagio de la cepa británica e...,,,0.506995,0.001616,0.265326,0.001083,con tildes,con tildes
34,1345690120033349645,¿ Qué está pasando?\n\n➡️ Que los españoles ya...,javier-merino-martinez,partido-popular,0.316567,0.000900,0.968961,0.009733,sin tildes,sin tildes
44,1346018260161077248,Desde el 1 de enero están en vigor los presupu...,belen-hoyo-julia,partido-popular,0.288427,0.000770,0.025317,0.000835,con tildes,sin tildes
...,...,...,...,...,...,...,...,...,...,...
11994,1453014574450479112,"En la Comisión de Interior, hoy deconstruimos ...",ana-belen-vazquez-blanco,partido-popular,0.968546,0.011261,0.647917,0.001279,con tildes,con tildes
12011,1452567812501905410,Los españoles lo tienen claro: @pablocasado_ e...,,,0.060123,0.000780,0.187017,0.000939,sin tildes,sin tildes
12023,1450521151982157828,"Los jóvenes migrantes suelen ser noticia, solo...",jaume-asens-llodra,catalunya-en-comu,0.781455,0.002105,0.664364,0.001804,con tildes,con tildes
12039,1453748843078631445,🔵 Entre toda la desgracia que supone lo que es...,oscar-clavell-lopez,partido-popular,0.080677,0.000764,0.205307,0.000941,sin tildes,sin tildes


In [None]:
print('Con tildes toxic: ', np.mean(tweets_filtered['toxic_original']))
print('Con tildes very toxic: ', np.mean(tweets_filtered['very_toxic_original']))
print('Sin tildes toxic: ', np.mean(tweets_filtered['toxic_noaccent']))
print('Sin tildes very toxic: ', np.mean(tweets_filtered['very_toxic_noaccent'])
#Aquí tanto en "toxic" y en "very toxic" vemos que la toxicidad es más alta en tweets sin tildes

Con tildes toxic:  0.4201075223428939
Con tildes very toxic:  0.0031716618928559824
Sin tildes toxic:  0.5417214262054454
Sin tildes very toxic:  0.006511807569271818


In [None]:
print('Toxic con tildes', np.count_nonzero(tweets_filtered[tweets_filtered['most_toxic_grammar'] == 'con tildes']))
print('Toxic sin tildes', np.count_nonzero(tweets_filtered[tweets_filtered['most_toxic_grammar'] == 'sin tildes']))
print('Very toxic con tildes', np.count_nonzero(tweets_filtered[tweets_filtered['most_verytoxic_grammar'] == 'con tildes']))
print('Very toxic sin tildes', np.count_nonzero(tweets_filtered[tweets_filtered['most_verytoxic_grammar'] == 'sin tildes']))
#Parece que cuando solo miramos casos donde la diferencia de toxicidad es significante, es más tóxico en cuanto a ambos valores del modelo no usar
# tildes, y está más desigualado, favoreciendo más a los tweets con tildes.

Toxic con tildes 3930
Toxic sin tildes 7560
Very toxic con tildes 4730
Very toxic sin tildes 6760
