## Import libraries

In [95]:
import gensim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mig\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mig\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Explore dataset

In [96]:
df = pd.read_csv('./data/hate-offensive-speech.csv', index_col=0)
df

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...
25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
25295,6,0,6,0,1,youu got wild bitches tellin you lies


## Split input/independent and output/dependent columns/features

In [97]:
init_X = df['tweet']
init_Y = df['class']
print(init_X.dtype)
print(init_Y.dtype)

object
int64


## Preprocess text
- remove trailing whitespaces
- remove non-alphanumeric characters
- lower sentences
- tokenize
- remove stop words
- lemmatize or stem word

<u>or use gensim.utils.simple_preprocess as callback of self.apply()</u>

In [98]:
import re

def view_sentence(sentences, phase='', limit=5):
  for sentence in sentences.iloc[:limit]:
    print(f'{phase} phase:\n{sentence}\n')

def simple_preprocess(text_string):
  # remove whitespaces
  temp = text_string.strip()
  # view_sentence('whitespace removal', temp)

  # match all non-alphabetic characters not being used by 
  # words then remove e.g. you're uses ' so do not remove '
  # temp = re.sub(r'\b\w*([^\w\s]|_)\w*\b|([^\w\s]|_)', lambda match: match.group(0) if match.group(1) else '', temp)
  temp = re.sub(r'[^a-zA-Z\s]*', '', temp)
  

  # turn sentences to lowercase
  temp = temp.lower()
  print(temp)
  # view_sentence('to lowercase',temp)

  # tokenize sentences and encode as well in unicode format
  words = temp.split(' ')
  # view_sentence('tokenization', temp)

  # remove stop words
  other_exclusions = ["#ff", "ff", "rt"]
  stop_words = stopwords.words('english')
  stop_words.extend(other_exclusions)
  
  words = [word for word in words if not word in stop_words]
  # view_sentence('stop word removal', temp)

  # lemmatize or stem words/tokens in each row
  # ps = PorterStemmer()
  wordnet = WordNetLemmatizer()
  words = [wordnet.lemmatize(word) for word in words]
  
  
  # encode to utf-8
  # temp = temp.apply(lambda words: [word.encode('utf-8') for word in words])

  
  return temp

def preprocess(text_string):
    print(text_string)
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    return parsed_text

In [99]:
temp = init_X.apply(preprocess)


!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...
!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!
!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit
!!!!!!!!! RT @C_G_Anderson: @viva_based she look like a tranny
!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;
!!!!!!!!!!!!!!!!!!"@T_Madison_x: The shit just blows me..claim you so faithful and down for somebody but still fucking with hoes! &#128514;&#128514;&#128514;"
!!!!!!"@__BrighterDays: I can not just sit up and HATE on another bitch .. I got too much shit going on!"
!!!!&#8220;@selfiequeenbri: cause I'm tired of you big bitches coming for us skinny girls!!&#8221;
" &amp; you might not get ya bitch back &amp; thats that "
" @rhythmixx_ :hobbies inclu

In [100]:
X = temp.apply(simple_preprocess)

 rt  as a woman you shouldnt complain about cleaning up your house amp as a man you should always take the trash out
 rt  boy dats coldtyga dwn bad for cuffin dat hoe in the st place
 rt  dawg rt  you ever fuck a bitch and she start to cry you be confused as shit
 rt   she look like a tranny
 rt  the shit you hear about me might be true or it might be faker than the bitch who told it to ya 
 the shit just blows meclaim you so faithful and down for somebody but still fucking with hoes 
 i can not just sit up and hate on another bitch  i got too much shit going on
 cause im tired of you big bitches coming for us skinny girls
 amp you might not get ya bitch back amp thats that 
  hobbies include fighting mariam bitch
 keeks is a bitch she curves everyone  lol i walked into a conversation like this smh
 murda gang bitch its gang land 
 so hoes that smoke are losers   yea  go on ig
 bad bitches is the only thing that i like 
 bitch get up off me 
 bitch nigga miss me with it 
 bitch plz wha