# Detector de Discurso de odio

In [1]:
#hate Speech with twitter comments

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
import re
import nltk

from nltk.util import pr
from nltk.corpus import stopwords

import string

# Configuraciones

In [3]:
#config
nltk.download( 'stopwords' ) 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aguil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
stemmer = nltk.SnowballStemmer( 'english' )
stopword = set( stopwords.words( 'english' ) )

# Desarrollo

## 1 Importacion de Datos

In [5]:
dataTrain = pd.read_csv('./DATA/Training/DataTrain.csv') 
dataTrain

Unnamed: 0,text,label
0,RT @ChrisWarcraft: @freebsdgirl Did you say [T...,not-abuse
1,"#MoreAcceptableThors: No Foreplay Thor, Garlic...",not-abuse
2,Feelings about the person involved need to be ...,not-abuse
3,"Oh, hell. Is Blizzard proxying tweets instead ...",not-abuse
4,Drasko talks a lot for a bloke who's instant r...,not-abuse
...,...,...
38873,No matter how hard you try to be rude mf`s the...,abuse
38874,Guys now and days act like they don't care bec...,abuse
38875,Truuuuuuuuuuu facts both then niggaz is pussy ...,abuse
38876,Tell a itty bitty bitch pipe down,abuse


## 2 Limpieza

In [6]:
def clean( text ):
    text = str( text ).lower()
    text = re.sub( '\[.*?\]','', text )    
    text = re.sub( 'https?://\S+|www\.\S+', '', text )    
    text = re.sub( '<.*?>+', '', text)
    
    text = re.sub( '[%s]' % re.escape( string.punctuation ), '', text )
    text = re.sub( '\n', '', text )
    text = re.sub( '\w*\d\w*', '', text)
    text = re.sub( '\w*\d\w*', '', text)
    
    #text = re.sub('rt*', '', text)
    #text = re.sub('  *', '', text)
    
    text = [ word for word in text.split(' ') if word not in stopword ]
    text = " ".join( text )
    text = [ stemmer.stem( word ) for word in text.split(' ') ]
    text = " ".join( text )
    
    return text

#aplicando transformación

dataTrain[ 'text' ] = dataTrain[ 'text' ].apply(clean)
dataTrain.head()

Unnamed: 0,text,label
0,rt chriswarcraft freebsdgirl say,not-abuse
1,moreacceptablethor foreplay thor garlic breath...,not-abuse
2,feel person involv need separ real issu advoca...,not-abuse
3,oh hell blizzard proxi tweet instead send twee...,not-abuse
4,drasko talk lot bloke whos instant restaur wen...,not-abuse


In [7]:
dataTrain[ 'Int label' ] = dataTrain[ 'label' ].map({ 'not-abuse':1 , 'abuse': 0 })
dataTrain.head(10)

Unnamed: 0,text,label,Int label
0,rt chriswarcraft freebsdgirl say,not-abuse,1
1,moreacceptablethor foreplay thor garlic breath...,not-abuse,1
2,feel person involv need separ real issu advoca...,not-abuse,1
3,oh hell blizzard proxi tweet instead send twee...,not-abuse,1
4,drasko talk lot bloke whos instant restaur wen...,not-abuse,1
5,like serious helen thoma obama wasnt even tech...,not-abuse,1
6,albertinho cant tell sarcasm clueless,not-abuse,1
7,hope sheri emili dont go sudden elimin pleas mkr,not-abuse,1
8,guess one includ entir comment thread,not-abuse,1
9,serious score uh hate kat andr mkr,not-abuse,1


## 3 Entrenamiento del modelo

In [8]:
#training
x = np.array( dataTrain[ 'text' ] )
y = np.array( dataTrain[ 'label' ] )

cv = CountVectorizer()
x = cv.fit_transform( x )

xt, xp, yt, xp = train_test_split( x, y, test_size=0.2, random_state=50 )

clf = DecisionTreeClassifier()
clf.fit( xt, yt )

DecisionTreeClassifier()

In [9]:
#test information
test1 = "i will kill you"
test2 = "you are stupid"
test3 = "nothing personal"
test4 = "You a fucking crack man!"
test5 = "Fuck you"

df = cv.transform([test5]).toarray()
print(clf.predict(df))

['abuse']


## 4 Testeando el modelo

In [10]:
dataTest = pd.read_csv( './DATA/Testing/Testingdata.csv' ) 
dataTest.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,2,1,1,RT @shaelynspacyyy: &#8220;@UglyAssDerrick: @s...
1,3,0,1,2,2,RT @shakiraevanss: Criticize Amanda for saying...
2,3,0,3,0,1,RT @shananigans__: 50 shades of shut your bitc...
3,3,0,3,0,1,RT @shanequabullock: Meet me and my bitches @ ...
4,3,0,3,0,1,RT @shantelb_: &#8220;@WeAre_XCI: I know bitch...


In [11]:
dataTest[ 'labels' ] = dataTest[ 'class' ].map({0:'abuse',
                                   1: 'offensive languaje',
                                   2: 'not-abuse'})
dataTest

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,labels
0,3,0,2,1,1,RT @shaelynspacyyy: &#8220;@UglyAssDerrick: @s...,offensive languaje
1,3,0,1,2,2,RT @shakiraevanss: Criticize Amanda for saying...,not-abuse
2,3,0,3,0,1,RT @shananigans__: 50 shades of shut your bitc...,offensive languaje
3,3,0,3,0,1,RT @shanequabullock: Meet me and my bitches @ ...,offensive languaje
4,3,0,3,0,1,RT @shantelb_: &#8220;@WeAre_XCI: I know bitch...,offensive languaje
...,...,...,...,...,...,...,...
4780,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,offensive languaje
4781,3,0,1,2,2,"you've gone and broke the wrong heart baby, an...",not-abuse
4782,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...,offensive languaje
4783,6,0,6,0,1,youu got wild bitches tellin you lies,offensive languaje


### 4.1 Limpiando datos de testeo

In [12]:
dataTest= dataTest.query('labels == "abuse" or labels == "not-abuse"');
dataTest.index = pd.RangeIndex(len(dataTest.index))
dataTest.index = range(len(dataTest.index))
dataTest[ 'B_labels' ] = dataTest[ 'labels' ].map({'abuse':0, 'not-abuse':1})
#_____________________________________________________
dataTest[ 'tweet' ] = dataTest[ 'tweet' ].apply(clean)
#_____________________________________________________


dataTest

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataTest[ 'B_labels' ] = dataTest[ 'labels' ].map({'abuse':0, 'not-abuse':1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataTest[ 'tweet' ] = dataTest[ 'tweet' ].apply(clean)


Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,labels,B_labels
0,3,0,1,2,2,rt shakiraevanss critic amanda say n word sure...,not-abuse,1
1,3,3,0,0,0,rt fuck islam state kill,abuse,0
2,3,0,0,3,2,rt sickipedia drove daughter guinea pig vet mo...,not-abuse,1
3,3,0,0,3,2,rt simplyforkick quan help everyon random ques...,not-abuse,1
4,3,0,0,3,2,rt skullmand call dark cold damn charli brown,not-abuse,1
...,...,...,...,...,...,...,...,...
978,3,0,1,2,2,know say earli bird get worm put gummi worm mo...,not-abuse,1
979,3,3,0,0,0,your nigger,abuse,0
980,3,2,1,0,0,your retard hope get type diabet die sugar ru...,abuse,0
981,3,0,1,2,2,youv gone broke wrong heart babi drove redneck...,not-abuse,1


### 4.2  Predicciones

In [13]:
data_result=[]
sentencesToPredict = dataTest['tweet']
for sentence in sentencesToPredict:
    var = cv.transform([sentence])
    prediction = str(clf.predict(var)) #numpy.array type to string type
    data_result.append( [ sentence, prediction[2:-2]] ) # "['result']" ----> "result"

In [14]:
results = pd.DataFrame(data_result,columns=['Sentence', 'Clasification'])
results

Unnamed: 0,Sentence,Clasification
0,rt shakiraevanss critic amanda say n word sure...,not-abuse
1,rt fuck islam state kill,not-abuse
2,rt sickipedia drove daughter guinea pig vet mo...,not-abuse
3,rt simplyforkick quan help everyon random ques...,not-abuse
4,rt skullmand call dark cold damn charli brown,not-abuse
...,...,...
978,know say earli bird get worm put gummi worm mo...,not-abuse
979,your nigger,abuse
980,your retard hope get type diabet die sugar ru...,abuse
981,youv gone broke wrong heart babi drove redneck...,not-abuse


In [15]:
results[ 'B_Result' ] = results[ 'Clasification' ].map({ 'not-abuse':1 , 'abuse': 0 })
results

Unnamed: 0,Sentence,Clasification,B_Result
0,rt shakiraevanss critic amanda say n word sure...,not-abuse,1
1,rt fuck islam state kill,not-abuse,1
2,rt sickipedia drove daughter guinea pig vet mo...,not-abuse,1
3,rt simplyforkick quan help everyon random ques...,not-abuse,1
4,rt skullmand call dark cold damn charli brown,not-abuse,1
...,...,...,...
978,know say earli bird get worm put gummi worm mo...,not-abuse,1
979,your nigger,abuse,0
980,your retard hope get type diabet die sugar ru...,abuse,0
981,youv gone broke wrong heart babi drove redneck...,not-abuse,1


### 4.3  Calculo de accuracy, precision y recall

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [17]:
y_pred = results['B_Result']
y_true = ['B_labels']

print('Accuracy score against', y_true,' is: {:.2f}'
      .format(accuracy_score( dataTest[y_true], y_pred, normalize=True )*100),'%' )

print('Precision score against', y_true,' is: {:.2f}'
      .format(precision_score( dataTest[y_true], y_pred )*100),'%' )

print('Recall score against', y_true,' is: {:.2f}'
      .format(recall_score( dataTest[y_true], y_pred )*100),'%' )

Accuracy score against ['B_labels']  is: 95.12 %
Precision score against ['B_labels']  is: 98.63 %
Recall score against ['B_labels']  is: 94.97 %


# Obteniendo matriz de confusión

In [18]:
from sklearn.metrics import confusion_matrix

In [21]:
cdf=pd.DataFrame(confusion_matrix(dataTest[y_true], y_pred), columns=['A','NA'], index=['A','NA'])
cdf

Unnamed: 0,A,NA
A,217,10
,38,718
