# Detector de Discurso de odio

In [23]:
#hate Speech with twitter comments

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [24]:
import re
import nltk

from nltk.util import pr
from nltk.corpus import stopwords

import string

# Configuraciones

In [25]:
#config
nltk.download( 'stopwords' ) 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aguil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
stemmer = nltk.SnowballStemmer( 'english' )
stopword = set( stopwords.words( 'english' ) )

# Desarrollo

## 1 Importacion de Datos

In [27]:
dataTrain = pd.read_csv('./DATA/Training/DataTrain.csv') 
dataTrain

Unnamed: 0,text,label
0,RT @ChrisWarcraft: @freebsdgirl Did you say [T...,not-abuse
1,"#MoreAcceptableThors: No Foreplay Thor, Garlic...",not-abuse
2,Feelings about the person involved need to be ...,not-abuse
3,"Oh, hell. Is Blizzard proxying tweets instead ...",not-abuse
4,Drasko talks a lot for a bloke who's instant r...,not-abuse
...,...,...
38873,No matter how hard you try to be rude mf`s the...,abuse
38874,Guys now and days act like they don't care bec...,abuse
38875,Truuuuuuuuuuu facts both then niggaz is pussy ...,abuse
38876,Tell a itty bitty bitch pipe down,abuse


## 2 Limpieza

In [28]:
def clean( text ):
    text = str( text ).lower()
    text = re.sub( '\[.*?\]','', text )    
    text = re.sub( 'https?://\S+|www\.\S+', '', text )    
    text = re.sub( '<.*?>+', '', text)
    
    text = re.sub( '[%s]' % re.escape( string.punctuation ), '', text )
    text = re.sub( '\n', '', text )
    text = re.sub( '\w*\d\w*', '', text)
    text = re.sub( '\w*\d\w*', '', text)
    
    #text = re.sub('rt*', '', text)
    #text = re.sub('  *', '', text)
    
    text = [ word for word in text.split(' ') if word not in stopword ]
    text = " ".join( text )
    text = [ stemmer.stem( word ) for word in text.split(' ') ]
    text = " ".join( text )
    
    return text

#aplicando transformación

dataTrain[ 'text' ] = dataTrain[ 'text' ].apply(clean)
dataTrain.head()

Unnamed: 0,text,label
0,rt chriswarcraft freebsdgirl say,not-abuse
1,moreacceptablethor foreplay thor garlic breath...,not-abuse
2,feel person involv need separ real issu advoca...,not-abuse
3,oh hell blizzard proxi tweet instead send twee...,not-abuse
4,drasko talk lot bloke whos instant restaur wen...,not-abuse


In [29]:
dataTrain[ 'Int label' ] = dataTrain[ 'label' ].map({ 'not-abuse':1 , 'abuse': 0 })
dataTrain.head(10)

Unnamed: 0,text,label,Int label
0,rt chriswarcraft freebsdgirl say,not-abuse,1
1,moreacceptablethor foreplay thor garlic breath...,not-abuse,1
2,feel person involv need separ real issu advoca...,not-abuse,1
3,oh hell blizzard proxi tweet instead send twee...,not-abuse,1
4,drasko talk lot bloke whos instant restaur wen...,not-abuse,1
5,like serious helen thoma obama wasnt even tech...,not-abuse,1
6,albertinho cant tell sarcasm clueless,not-abuse,1
7,hope sheri emili dont go sudden elimin pleas mkr,not-abuse,1
8,guess one includ entir comment thread,not-abuse,1
9,serious score uh hate kat andr mkr,not-abuse,1


## 3 Entrenamiento del modelo

In [30]:
#training
x = np.array( dataTrain[ 'text' ] )
y = np.array( dataTrain[ 'label' ] )

cv = CountVectorizer()
x = cv.fit_transform( x )

xt, xp, yt, xp = train_test_split( x, y, test_size=0.2, random_state=50 )

clf = DecisionTreeClassifier()
clf.fit( xt, yt )

DecisionTreeClassifier()

In [31]:
#test information
test1 = "i will kill you"
test2 = "you are stupid"
test3 = "nothing personal"
test4 = "You a fucking crack man!"
test5 = "Fuck you"

df = cv.transform([test5]).toarray()
print(clf.predict(df))

['abuse']


## 4 Testeando el modelo con otro set

In [32]:
dataTest2 = pd.read_csv( './DATA/Testing/twitter_data.csv' ) 
dataTest2.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [33]:
dataTest2[ 'labels' ] = dataTest2[ 'class' ].map({0:'abuse',
                                   1: 'offensive languaje',
                                   2: 'not-abuse'})
dataTest2

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,labels
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,not-abuse
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,offensive languaje
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,offensive languaje
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,offensive languaje
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,offensive languaje
...,...,...,...,...,...,...,...,...
19993,20435,3,0,3,0,1,RT @seansteiger31: Melo will be a bitch if he ...,offensive languaje
19994,20436,3,0,2,1,1,RT @seansteiger31: Sometimes I listen &amp; re...,offensive languaje
19995,20437,3,0,3,0,1,RT @semper_fiibitch: Farra from teen mom has 2...,offensive languaje
19996,20438,3,0,0,3,2,RT @sepinwall: Day 2 of FXX&#8217;s Simpsons m...,not-abuse


### 4.1 Limpiando datos de testeo

In [34]:
dataTest2= dataTest2.query('labels == "abuse" or labels == "not-abuse"');
dataTest2.index = pd.RangeIndex(len(dataTest2.index))
dataTest2.index = range(len(dataTest2.index))
dataTest2[ 'B_labels' ] = dataTest2[ 'labels' ].map({'abuse':0, 'not-abuse':1})
#_____________________________________________________
dataTest2[ 'tweet' ] = dataTest2[ 'tweet' ].apply(clean)
#_____________________________________________________

dataTest2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataTest2[ 'B_labels' ] = dataTest2[ 'labels' ].map({'abuse':0, 'not-abuse':1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataTest2[ 'tweet' ] = dataTest2[ 'tweet' ].apply(clean)


Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,labels,B_labels
0,0,3,0,0,3,2,rt mayasolov woman shouldnt complain clean ho...,not-abuse,1
1,40,3,0,1,2,2,momma said pussi cat insid doghous,not-abuse,1
2,63,3,0,0,3,2,simplyaddictedtoguy woof woof hot scalli lad,not-abuse,1
3,66,3,0,1,2,2,allaboutmanfeet woof woof hot sole,not-abuse,1
4,67,3,0,1,2,2,allyhaaaaa lemmi eat oreo amp dish one oreo lol,not-abuse,1
...,...,...,...,...,...,...,...,...,...
4605,20422,3,3,0,0,0,rt sblmnlcrmnl woodslamar shut nigger ass lamar,abuse,0
4606,20428,3,0,0,3,2,rt scottiken settl ding dong far superior twinki,not-abuse,1
4607,20429,3,0,0,3,2,rt scottvtraci abuyazidmuawiya congratul rein...,not-abuse,1
4608,20433,3,0,1,2,2,rt seanmdav sinc know your wonder blakehounshe...,not-abuse,1


### 4.2 Predicciones

In [35]:
data_result=[]
sentencesToPredict = dataTest2['tweet']
for sentence in sentencesToPredict:
    var = cv.transform([sentence])
    prediction = str(clf.predict(var)) #numpy.array type to string type
    data_result.append( [ sentence, prediction[2:-2]] ) # "['result']" ----> "result"

In [36]:
results2 = pd.DataFrame(data_result,columns=['Sentence', 'Clasification'])
results2

Unnamed: 0,Sentence,Clasification
0,rt mayasolov woman shouldnt complain clean ho...,not-abuse
1,momma said pussi cat insid doghous,not-abuse
2,simplyaddictedtoguy woof woof hot scalli lad,not-abuse
3,allaboutmanfeet woof woof hot sole,not-abuse
4,allyhaaaaa lemmi eat oreo amp dish one oreo lol,not-abuse
...,...,...
4605,rt sblmnlcrmnl woodslamar shut nigger ass lamar,abuse
4606,rt scottiken settl ding dong far superior twinki,not-abuse
4607,rt scottvtraci abuyazidmuawiya congratul rein...,not-abuse
4608,rt seanmdav sinc know your wonder blakehounshe...,not-abuse


In [37]:
results2[ 'B_Result' ] = results2[ 'Clasification' ].map({ 'not-abuse':1 , 'abuse': 0 })
results2

Unnamed: 0,Sentence,Clasification,B_Result
0,rt mayasolov woman shouldnt complain clean ho...,not-abuse,1
1,momma said pussi cat insid doghous,not-abuse,1
2,simplyaddictedtoguy woof woof hot scalli lad,not-abuse,1
3,allaboutmanfeet woof woof hot sole,not-abuse,1
4,allyhaaaaa lemmi eat oreo amp dish one oreo lol,not-abuse,1
...,...,...,...
4605,rt sblmnlcrmnl woodslamar shut nigger ass lamar,abuse,0
4606,rt scottiken settl ding dong far superior twinki,not-abuse,1
4607,rt scottvtraci abuyazidmuawiya congratul rein...,not-abuse,1
4608,rt seanmdav sinc know your wonder blakehounshe...,not-abuse,1


### 4.3  Calculo de accuracy, precision y recall

In [38]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [39]:
y_pred = results2['B_Result']
y_true = ['B_labels']

print('Accuracy score against', y_true,' is: {:.2f}'
      .format(accuracy_score( dataTest2[y_true], y_pred, normalize=True )*100),'%' )

print('Precision score against', y_true,' is: {:.2f}'
      .format(precision_score( dataTest2[y_true], y_pred )*100),'%' )

print('Recall score against', y_true,' is: {:.2f}'
      .format(recall_score( dataTest2[y_true], y_pred )*100),'%' )

Accuracy score against ['B_labels']  is: 94.69 %
Precision score against ['B_labels']  is: 98.23 %
Recall score against ['B_labels']  is: 94.51 %


# Obteniendo matriz de confusión

In [40]:
from sklearn.metrics import confusion_matrix

In [41]:
cdf=pd.DataFrame(confusion_matrix(dataTest2[y_true], y_pred), columns=['A','NA'], index=['A','NA'])
cdf

Unnamed: 0,A,NA
A,1145,58
,187,3220


In [42]:
tn, fp, fn, tp = confusion_matrix(dataTest2[y_true], y_pred).ravel()
(tn, fp, fn, tp)

(1145, 58, 187, 3220)