In [1]:
import pandas as pd


# Stream the CSV in pieces of `chunk_size` rows, then stitch the pieces back
# into a single DataFrame with a fresh 0..n-1 index.
chunk_size = 10000
chunks = pd.read_csv(
    "../data/HateSpeechDataset.csv",
    chunksize=chunk_size,
)

data = pd.concat(objs=chunks, ignore_index=True)

# Preview the first rows.
data.head()


Unnamed: 0,Content,Label,Content_int
0,denial of normal the con be asked to comment o...,1,"[146715, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,..."
1,just by being able to tweet this insufferable ...,1,"[146715, 14, 15, 16, 17, 7, 18, 19, 20, 21, 22..."
2,that is retarded you too cute to be single tha...,1,"[146715, 28, 29, 30, 26, 31, 32, 7, 5, 33, 28,..."
3,thought of a real badass mongol style declarat...,1,"[146715, 35, 1, 24, 36, 37, 38, 39, 40, 1, 41,..."
4,afro american basho,1,"[146715, 46, 47, 48, 146714]"


In [2]:
dataLabel = data["Label"]

# Rows labelled "0" (non-hateful)
print(dataLabel.loc[dataLabel.eq(0)].value_counts())

# Rows labelled "1" (hateful)
print(dataLabel.loc[dataLabel.eq(1)].value_counts())


Label
0    326491
Name: count, dtype: int64
Label
1    74415
Name: count, dtype: int64


In [2]:
# Data Preprocessing
import re
import spacy

class DataPreprocessing:
    """Cleaning helpers for a text DataFrame.

    Holds a DataFrame and provides row filtering (`missing_duplicates`) plus
    two per-string transforms (`clean_text`, `spacy_preprocess`) that can be
    passed to ``Series.apply``.
    """

    # Patterns are compiled once at class creation rather than on every call.
    # `http\S+` already matches https links, so no separate alternative is needed.
    _URL_RE = re.compile(r"http\S+|www\S+")
    _MENTION_HASHTAG_RE = re.compile(r'\@\w+|\#\w+')
    _NON_LETTER_RE = re.compile(r'[^A-Za-z\s]')
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self, data, model='en_core_web_sm', nlp=None):
        """Store the frame and set up the NLP pipeline.

        Args:
            data: DataFrame to preprocess.
            model: Name of the spaCy pipeline loaded when `nlp` is not given.
            nlp: Optional ready-made pipeline: any callable that maps a string
                to an iterable of tokens exposing `lemma_`, `is_stop`,
                `is_punct` and `is_digit`. When supplied, nothing is loaded.
        """
        self.data = data
        self.nlp = spacy.load(model) if nlp is None else nlp

    def missing_duplicates(self):
        """Drop rows containing missing values, then drop duplicate rows.

        The filtered frame replaces `self.data` and is also returned.
        """
        self.data = self.data.dropna().drop_duplicates()
        return self.data

    def clean_text(self, text):
        """Normalise one raw string with regular expressions.

        Steps, in order: lowercase, remove links, remove @mentions and
        #hashtags, remove every character that is not an ASCII letter or
        whitespace, collapse whitespace runs to a single space and trim.

        Args:
            text: The raw string. Any non-string value (e.g. None or NaN)
                is treated as missing.

        Returns:
            The cleaned string; "" for non-string input.
        """
        if not isinstance(text, str):
            # A missing cell would otherwise fail on `.lower()`.
            return ""

        # Lowercase everything
        text = text.lower()

        # Remove links
        text = self._URL_RE.sub('', text)

        # Remove mentions and hashtags
        text = self._MENTION_HASHTAG_RE.sub('', text)

        # Remove special characters and digits
        text = self._NON_LETTER_RE.sub('', text)

        # Collapse repeated whitespace
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def spacy_preprocess(self, text):
        """Lemmatise `text` with `self.nlp`, dropping unwanted tokens.

        Tokens flagged as stop words, punctuation or digits are skipped; the
        lemmas of the remaining tokens are joined with single spaces.

        Args:
            text: String handed to `self.nlp`.

        Returns:
            The space-joined lemmas.
        """
        doc = self.nlp(text)
        return " ".join(
            token.lemma_
            for token in doc
            if not (token.is_stop or token.is_punct or token.is_digit)
        )


In [3]:
dataPreprocessing = DataPreprocessing(data)
data = dataPreprocessing.missing_duplicates()

# Regex cleanup first, then spaCy lemmatisation and token filtering.
data["Content"] = (
    data["Content"]
    .apply(dataPreprocessing.clean_text)
    .apply(dataPreprocessing.spacy_preprocess)
)


In [5]:
# Write the preprocessed frame to disk, leaving out the row index.
data.to_csv(
    path_or_buf="../data/cleaned_hate_speech.csv",
    index=False,
)