In [1]:
import pandas as pd
import requests
import io
import sklearn.model_selection as ms
from sklearn import svm
from sklearn.metrics import classification_report, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV


In [105]:
# Downloading the csv file from GitHub
url = "https://raw.githubusercontent.com/Aaron9812/Data_mining/main/data/220505_train_data_preprocessed.csv"
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on 404 / 5xx instead of silently parsing an HTML error page as CSV.
response.raise_for_status()
download = response.content

# Reading the downloaded content and turning it into a pandas dataframe
# (the file is semicolon-separated and decoded as UTF-8).
df = pd.read_csv(io.StringIO(download.decode('utf-8')), sep=";")
print(df.columns)

Index(['id', 'label', 'tweet', 'n_mentions', 'hashtags', 'without_puctioation',
       'tweet_lower', 'tweet_token', 'clean_token', 'clean_hashtags',
       'stemmed_tokens', 'stemmed_hashtags', 'lemmatized_tokens',
       'lemmatized_hashtags', 'tfidf_stemmed_tokens', 'tfidf_stemmed_hashtags',
       'tfidf_lemmatized_tokens', 'tfidf_lemmatized_hashtags'],
      dtype='object')


In [106]:
# Peek at the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,id,label,tweet,n_mentions,hashtags,without_puctioation,tweet_lower,tweet_token,clean_token,clean_hashtags,stemmed_tokens,stemmed_hashtags,lemmatized_tokens,lemmatized_hashtags,tfidf_stemmed_tokens,tfidf_stemmed_hashtags,tfidf_lemmatized_tokens,tfidf_lemmatized_hashtags
0,12139,0,keisha grey and robin sadie exposes her mature breasts and pussy on reddit: #robin #keisha #grey #mature ...,0,"['robin', 'keisha', 'grey', 'mature']",keisha grey and robin sadie exposes her mature breasts and pussy on reddit robin keisha grey mature,keisha grey and robin sadie exposes her mature breasts and pussy on reddit robin keisha grey mature,"['grey', 'robin', 'exposes', 'and', 'on', 'robin', 'grey', 'mature']","['grey', 'robin', 'exposes', 'robin', 'grey', 'mature']","['robin', 'keisha', 'grey', 'mature']","['grey', 'robin', 'expos', 'robin', 'grey', 'matur']","['robin', 'keisha', 'grey', 'matur']","['grey', 'robin', 'expose', 'robin', 'grey', 'mature']","['robin', 'keisha', 'grey', 'mature']","[-10.740796089172363, -14.836583137512207]","[2.0505874156951904, 14.331243515014648]","[61.946311950683594, -11.877785682678223]","[1.136780023574829, 0.6984049081802368]"
1,248,0,ð· themeowood: puppy,0,[],ð· themeowood puppy,ð· themeowood puppy,['themeowood'],['themeowood'],[],['themeowood'],[],['themeowood'],[],"[-0.7807413339614868, 2.77905535697937]","[-18.723024368286133, 16.274951934814453]","[1.2441691160202026, 0.8361891508102417]","[-0.12042795121669769, 11.407605171203613]"
2,22915,0,#angels #archangels from on high here to help you live #spiritual and anf lives,0,"['angels', 'archangels', 'spiritual']",angels archangels from on high here to help you live spiritual and anf lives,angels archangels from on high here to help you live spiritual and anf lives,"['archangels', 'on', 'here', 'help', 'live', 'and', 'lives']","['archangels', 'help', 'live', 'lives']","['angels', 'archangels', 'spiritual']","['archangel', 'help', 'live', 'live']","['angel', 'archangel', 'spiritu']","['archangel', 'help', 'live', 'life']","['angel', 'archangel', 'spiritual']","[-48.10258865356445, -14.9045991897583]","[42.10148239135742, -1.0181224346160889]","[21.608600616455078, -45.8383674621582]","[-9.25829792022705, -9.490367889404297]"
3,28879,0,i am thankful for being able to remember. #thankful #positive,0,"['thankful', 'positive']",i am thankful for being able to remember thankful positive,i am thankful for being able to remember thankful positive,"['am', 'for', 'able', 'remember', 'positive']","['able', 'remember', 'positive']","['thankful', 'positive']","['abl', 'rememb', 'posit']","['thank', 'posit']","['able', 'remember', 'positive']","['thankful', 'positive']","[-45.75589370727539, 16.056238174438477]","[-40.56792449951172, 12.00460147857666]","[50.502933502197266, -9.78880786895752]","[-12.563257217407227, 14.102527618408203]"
4,17066,0,rip anton yelchin. a young good actor from the star trek reboots. for a good kind person. #ripantonyelchin #actor #antonyelchin,0,"['ripantonyelchin', 'actor', 'antonyelchin']",rip anton yelchin a young good actor from the star trek reboots for a good kind person ripantonyelchin actor antonyelchin,rip anton yelchin a young good actor from the star trek reboots for a good kind person ripantonyelchin actor antonyelchin,"['anton', 'a', 'from', 'star', 'reboots', 'a', 'good', 'kind', 'ripantonyelchin', 'antonyelchin']","['anton', 'star', 'reboots', 'good', 'kind', 'ripantonyelchin', 'antonyelchin']","['ripantonyelchin', 'actor', 'antonyelchin']","['anton', 'star', 'reboot', 'good', 'kind', 'ripantonyelchin', 'antonyelchin']","['ripantonyelchin', 'actor', 'antonyelchin']","['anton', 'star', 'reboots', 'good', 'kind', 'ripantonyelchin', 'antonyelchin']","['ripantonyelchin', 'actor', 'antonyelchin']","[-65.92171478271484, -5.722066402435303]","[-31.891355514526367, -35.983734130859375]","[7.455405235290527, 12.02939510345459]","[28.641054153442383, -15.675145149230957]"


In [64]:
# Strip every preprocessing / feature column so that only
# "id", "label", and "tweet" remain for the manual inspection below.
df = df.drop(
    labels=[
        'n_mentions',
        'hashtags',
        'without_puctioation',
        'tweet_lower',
        'tweet_token',
        'clean_token',
        'clean_hashtags',
        'stemmed_tokens',
        'stemmed_hashtags',
        'lemmatized_tokens',
        'lemmatized_hashtags',
        'tfidf_stemmed_tokens',
        'tfidf_stemmed_hashtags',
        'tfidf_lemmatized_tokens',
        'tfidf_lemmatized_hashtags',
    ],
    axis="columns",
)
df.head()

Unnamed: 0,id,label,tweet
0,24090,0,best #lawofattraction #resources for #healing!...
1,15264,0,remembering to focus on the simplest happy mom...
2,19310,0,when you get as happy as your boyfriend to be ...
3,27244,0,why do you always try to make me happy? i don...
4,6633,0,omg is finally here!!! #ps4 #farcry4 #gtav #un...


In [96]:
# dropping all rows which are not labelled hate speech (label == 1)
df = df.loc[df['label'] == 1]
# Preview the filtered frame itself: `hatespeech` is only assigned in a later
# cell, so referencing it here raises NameError on a fresh kernel.
df.head()

Unnamed: 0,id,label,tweet
14,10468,1,@user absolutely disgusting
38,29912,1,stop trying to find where it isn't even happening! #airingofthegrievances
46,31864,1,#us refuses to veto resolution on #israel #zionism #illegalsettlements
52,15381,1,"""these are times of grave danger,with dissent recast as treason, unchained and intolerance on the march""&amp;itâ¦"
63,5817,1,"@user evolution's got a serious public relations problem, even among the well-educated. we need to de-stigmatize the factâ¦"


In [97]:
import random

# Allow up to 300 characters per column so tweets are not truncated when displayed.
pd.set_option("display.max_colwidth", 300)

# Draw 300 rows from the current frame; the fixed random_state makes the
# selection reproducible across runs.
hatespeech = df.sample(random_state=25, n=300)


In [108]:
# Lift pandas' row and column display limits so the whole sample is printed.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Plain-text dump of every sampled tweet for manual reading.
print(hatespeech["tweet"])

204                                         @user feds provide inadequate #mentalhealth care for #firstnations: internal memo  #thirdworldcanada 
4728                                                                      @user @user @user @user @user he's a  #zionist. what did anyone expect?
4912                                                buffalo school districts boots trump ally over racist comments. #birdsofafeather  #resist  : 
4940                                                                              you might be a libtard if... #libtard  #sjw #liberal #politics 
3301                                                                              you might be a libtard if... #libtard  #sjw #liberal #politics 
454                                      @user #allahsoil when the market crashed in march 2000, greenspan cut interest rates even more   â¦â¦ 
1288                                                                                          don't let the #bigot skate on 

In [110]:
# Disabled export of the sampled tweets to a semicolon-separated CSV.
# NOTE(review): the target is an absolute, machine-specific Windows path; switch to a
# relative / configurable path before re-enabling this line.
# hatespeech.to_csv(r'C:\Users\Mayte\Documents\Uni\Data Mining\Group Project\hatespeech.csv', index=False, sep=';')


## Results

### "Hot Topics" in the category of hatespeech:
- politics
- expressions of nationalism
- religion
- homophobic expressions
- explicit references to sexual activities
- "Trump"
- "KKK"
- "nazi", "Hitler"
- mentions of race or skin colour

### problematic or false labels
- explicitly mentioning some key words, such as "racism" or nations or "minority groups" (e.g. jews)
- "Trump"
- "black"
- irony and negations
- "allah" in hashtags
- tweets that are not hate speech in themselves, though their implications might be (depending on the context!), example 1:
    - "terry bradshaw &amp; george karl: at very least insensitive remarks, at most racist." (ID: 18638) As long as we do not know about the "insensitive remarks", we do not know if this user is defending racist remarks
- links/other media needed:  "women nude anal anal t girls" (ID: 3603) in itself not sexism, however the depiction of women as purely sexual "objects" might be problematic; explicit sexual content not allowed on platform regardless
- several layers of tweets:
    - example2:
    - "@user #trump ally carl paladino said first lady should go live in a cave with a gorilla   prick" (ID: 11143)
    - user repeats a problematic statement (problematic: sexism, personal insult)
    - then insults the speaker of said statement
    - --> overall: standing up against hate speech, whilst using it themselves
    - example 3:
    - "#anc is quick to #shit themselves over zille's comments as , but do noting about these black racists" (ID: 22052)
    - criticism against political party; unclear if reference to "these black racists" is justified or a false accusation
- difficulty of distinguishing between tweet itself and context; or in this case between truth and false (racist) accusations:
    - example 4: "body cam reveals black students threatening white frat boys..." (ID: 7305)
- example 5: "#feminism and #equality helps #everyone it helps #men and #women  hus everyone everywhere" (ID: 5014)
- example 6: "islam against racism" (ID: 12516)
    
- often more context needed or broad knowledge of scandals and people (e.g., terry bradshaw & george karl, ID: 18638, both involved in American sports; possibly connections to the protests by some teams in kneeling down during the national anthem)
- --> sensitive to mentions of minority groups regardless of context or meaning of utterance (cf. TOXIGEN, https://arxiv.org/pdf/2203.09509v1.pdf)

### clear examples of hate speech:
- example 7:  "#paladino is one of a rare breed of  that should be shot in the street and left pissing blood in the gutter." (ID: 31790). Invitation/glorification of brutal crimes; reference probably to Carl Paladino who was involved in a scandal of leaked emails containing racism and sexism; so double layer would again be someone protesting sexism and racism by using hate speech
- example 8: "@user - i bet if the @user hc was white, he would have been fired years ago!  @user #facts" (ID: 24316); clear case of racism. narrative: immigrants/"others" as parasites who get benefits and are abusing a benevolent state, where the "original citizens" have to suffer discrimination (cf. Hart 2010); nevertheless, probably label triggered by explicitly mentioning race
- example 9: "this. #treasonoustrump, unsatisfied with his #whiteisis  #terrorism, has moved into nuclear terror in a new… " (ID: 1318)

# TO DO:  
Find most important words / word cloud of sample or all tweets with hate speech