# Sesgos de tweets con/sin tildes

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Choose the CSV location by runtime: on Google Colab the drive is mounted and
# the file is read from the shared drive; otherwise it is read from the local
# ../data folder.
running_in_colab = 'google.colab' in str(get_ipython())
if running_in_colab:
  from google.colab import drive
  drive.mount('/content/drive')
  csv_path = '/content/drive/Shareddrives/ETICA/TweetMod_csv/tweet_mods_accents.csv'
else:
  csv_path = '../data/tweet_mods_accents.csv'
tweets = pd.read_csv(csv_path)

## Análisis de todos los datos

In [3]:
# Preview the first five rows to check which columns were loaded.
tweets.head(n=5)

Unnamed: 0,twitter_id,original_text,slug,party_slug,toxic_original,very_toxic_original,toxic_noaccent,very_toxic_noaccent,most_toxic_grammar,most_verytoxic_grammar
0,1344934218145669120,Llegará un momento que los miembros de este Go...,juan-luis-steegmann-olmedillas,vox,0.995289,0.996132,0.996114,0.993316,sin tildes,con tildes
1,1344940715072221184,La frase “ Falsa sensación de seguridad” viene...,juan-luis-steegmann-olmedillas,vox,0.987947,0.056416,0.985783,0.040629,con tildes,con tildes
2,1344945712128266240,El Comisario político del Gobierno no descansa...,macarena-montesinos-de-miguel,partido-popular,0.995368,0.995748,0.995438,0.995655,sin tildes,con tildes
3,1344962035277320193,Desde @CiudadanosCs presentamos una enmienda p...,,,0.016391,0.000838,0.017045,0.000813,sin tildes,con tildes
4,1344967124264116230,Y que mejor forma de dar la bienvenida al 2021...,maria-angeles-marra-dominguez,psoe,0.004446,0.002959,0.004253,0.003146,con tildes,sin tildes


In [4]:
# Mean "toxic" / "very toxic" score over every tweet, for the original text
# (with accents) and for the accent-stripped text.
mean_labels = {
  'Con tildes toxic: ': 'toxic_original',
  'Con tildes very toxic: ': 'very_toxic_original',
  'Sin tildes toxic: ': 'toxic_noaccent',
  'Sin tildes very toxic: ': 'very_toxic_noaccent',
}
for label, score_column in mean_labels.items():
  print(label, np.mean(tweets[score_column]))

Con tildes toxic:  0.29571437127105915
Con tildes very toxic:  0.08084586595985278
Sin tildes toxic:  0.300242413898577
Sin tildes very toxic:  0.08014870762158596


> Parece que, en cuanto al valor "toxic", son un poco más tóxicos los tweets sin tildes, pero vemos que en el caso de "very toxic" el valor es muy similar.    
> Aun así, parece que las medias están bastante igualadas.

In [5]:
# Count how many times a tweet is more toxic with accents than without, and vice versa.
count_specs = (
  ('Toxic con tildes', 'most_toxic_grammar', 'con tildes'),
  ('Toxic sin tildes', 'most_toxic_grammar', 'sin tildes'),
  ('Very toxic con tildes', 'most_verytoxic_grammar', 'con tildes'),
  ('Very toxic sin tildes', 'most_verytoxic_grammar', 'sin tildes'),
)
for label, grammar_column, version in count_specs:
  print(label, np.count_nonzero(tweets[grammar_column] == version))

Toxic con tildes 4825
Toxic sin tildes 7223
Very toxic con tildes 7712
Very toxic sin tildes 4336


> En general, la variable "toxic" aumenta al no haber tildes pero la variable "very_toxic" es mayor con tildes

## Análisis de datos acotados
Ahora vamos a analizar tweets donde la ausencia de tildes cambia de manera significativa la toxicidad. Solo miraremos tweets donde la diferencia (en valor absoluto) de la puntuación "toxic" sea superior a 0,10.

In [6]:
# Start from an empty frame that has the same columns, in the same order, as `tweets`.
column_names = list(tweets.columns)
tweets_filtered = pd.DataFrame(columns=column_names)

In [7]:
# Keep only the tweets whose "toxic" score changes by more than MIN_TOXIC_SHIFT
# (absolute difference) when the accents are removed.
#
# A boolean mask replaces the previous row-by-row `DataFrame.append` loop:
# `append` was removed in pandas 2.0, and it copied the whole frame on every
# matching row. The mask keeps the original index labels, column order and
# column dtypes. Rows with a missing score compare as False and are left out,
# just as they were by the loop.
MIN_TOXIC_SHIFT = 0.10
toxic_shift = (tweets['toxic_original'] - tweets['toxic_noaccent']).abs()
tweets_filtered = tweets.loc[toxic_shift > MIN_TOXIC_SHIFT].copy()

In [8]:
# Show the filtered subset as this cell's output.
tweets_filtered

Unnamed: 0,twitter_id,original_text,slug,party_slug,toxic_original,very_toxic_original,toxic_noaccent,very_toxic_noaccent,most_toxic_grammar,most_verytoxic_grammar
5,1344978091807035392,Un año de oposición: a la peor pandemia PP y V...,maria-angeles-marra-dominguez,psoe,0.525792,0.001039,0.894197,0.002838,sin tildes,sin tildes
24,1345462766212165632,La rapidez del contagio de la cepa británica e...,,,0.506995,0.001616,0.265326,0.001083,con tildes,con tildes
44,1346018260161077248,Desde el 1 de enero están en vigor los presupu...,belen-hoyo-julia,partido-popular,0.288427,0.000770,0.052053,0.000721,con tildes,con tildes
61,1346160087652556800,López Miras criticaba que la Ministra de Traba...,javier-sanchez-serna,podemos,0.796276,0.002114,0.939439,0.006534,sin tildes,sin tildes
65,1346168575032250369,"“Carrizosa, si ETA estuviera no existirías”. \...",guillermo-diaz-gomez,ciudadanos,0.406232,0.001017,0.029191,0.000962,con tildes,con tildes
...,...,...,...,...,...,...,...,...,...,...
11964,1453083115858038791,El consenso al descubierto. El PP con Podemos ...,rocio-de-meer-mendez,vox,0.442898,0.001002,0.549917,0.001133,sin tildes,sin tildes
11987,1452483511919824898,A Al Capone también le parecía que la policía ...,juan-ignacio-lopez-bas-valero,ciudadanos,0.902248,0.003599,0.710073,0.001535,con tildes,con tildes
11994,1453014574450479112,"En la Comisión de Interior, hoy deconstruimos ...",ana-belen-vazquez-blanco,partido-popular,0.968546,0.011261,0.670001,0.001336,con tildes,con tildes
12023,1450521151982157828,"Los jóvenes migrantes suelen ser noticia, solo...",jaume-asens-llodra,catalunya-en-comu,0.781455,0.002105,0.556372,0.001411,con tildes,con tildes


In [9]:
# Mean "toxic" / "very toxic" score over the filtered tweets only, for the
# original text (with accents) and for the accent-stripped text.
filtered_mean_labels = {
  'Con tildes toxic: ': 'toxic_original',
  'Con tildes very toxic: ': 'very_toxic_original',
  'Sin tildes toxic: ': 'toxic_noaccent',
  'Sin tildes very toxic: ': 'very_toxic_noaccent',
}
for label, score_column in filtered_mean_labels.items():
  print(label, np.mean(tweets_filtered[score_column]))

Con tildes toxic:  0.4433678603518351
Con tildes very toxic:  0.0031834938606548945
Sin tildes toxic:  0.49841069664810655
Sin tildes very toxic:  0.00299137524673699


> Aquí vemos que en "toxic" la toxicidad es más alta en tweets sin tildes, pero en "very_toxic" es más similar, siendo algo más alta en los tweets con tildes.

In [10]:
# Within the filtered tweets, count how many times the version with accents is
# the more toxic one and how many times the version without accents is.
filtered_count_specs = (
  ('Toxic con tildes', 'most_toxic_grammar', 'con tildes'),
  ('Toxic sin tildes', 'most_toxic_grammar', 'sin tildes'),
  ('Very toxic con tildes', 'most_verytoxic_grammar', 'con tildes'),
  ('Very toxic sin tildes', 'most_verytoxic_grammar', 'sin tildes'),
)
for label, grammar_column, version in filtered_count_specs:
  print(label, np.count_nonzero(tweets_filtered[grammar_column] == version))

Toxic con tildes 325
Toxic sin tildes 498
Very toxic con tildes 368
Very toxic sin tildes 455


> Parece que, cuando solo miramos casos donde la diferencia de toxicidad es significativa, en la mayoría de los casos los tweets resultan más tóxicos (tanto en "toxic" como en "very_toxic") cuando no usan tildes.