In [2]:
import re
import nltk
import pandas as pd 
from nltk.corpus import stopwords # to remove the stopwords
import demoji # demoji is used to remove pure emojis

### Twitter user detection (@xxx) test

In [14]:
# sample text
sample_text = [
    "this @hotstr bitch is bullying the hell out of @.ki.is23",
    "@332df you want to help @_7sad",
    "this @ is dumb things",
]

# regex to detect the username (@xxx)
regex = r"@([^ ]+)"

# before preprocess
for x in sample_text:
    print(x)

this @hotstr bitch is bullying the hell out of @.ki.is23
@332df you want to help @_7sad
this @ is dumb things


In [15]:
# execute the process
for idx, string in enumerate(sample_text):
    sample_text[idx] = re.sub(regex, "user", string)
    
# after preprocess
for x in sample_text:
    print(x)

this user bitch is bullying the hell out of user
user you want to help user
this @ is dumb things


### Emoji remove test
2 cases:
- if the emoji is pure emoji
- the emoji & ect is still on html entities like &#128584; or &#8217;

Case 1:

if the emoji is pure emoji

In [17]:
# this is when the emoji is pure emoji
sample_text = [
    "this @hotstr bitch is bullying some 🏳️‍🌈 kids 💀",
    "🎉🎉🎉 happy birthday @_7sad",
    "why my life is sad🥹, maybe i should rethink my decision🤔",
]

# before preprocess
for x in sample_text:
    print(x)

this @hotstr bitch is bullying some 🏳️‍🌈 kids 💀
🎉🎉🎉 happy birthday @_7sad
why my life is sad🥹, maybe i should rethink my decision🤔


In [18]:
# execute the emoji remover
for idx, text in enumerate(sample_text):
    sample_text[idx] = demoji.replace(text, "")

# after emoji is removed
for x in sample_text:
    print(x)

this @hotstr bitch is bullying some  kids 
 happy birthday @_7sad
why my life is sad🥹, maybe i should rethink my decision


Case 2:

The emoji is in the form of html entities, we can use regex to remove it

In [52]:
entities_text = 'got ya bitch tip toeing on my hardwood floors "" &#128514; http://t.co/cOU2WQ5L4q"'
entity_regex = r"&[^\s;]+;"
text = re.sub(entity_regex, "", entities_text)
print(text)

# after this you can use the demoji library to remove the emojis.

got ya bitch tip toeing on my hardwood floors ""  http://t.co/cOU2WQ5L4q"


### Url deletion test

In [37]:
text = "this a good resource https://www.google.com/search?client=firefox-b-d&q=k+means+clusterin, and i love it"
sample_text = [
    "this is some sketchy links tokopedia.com/cart idk if safe or n lol",
    "http://t.co/cOU2WQ5L4q, this is some good shit",
    "hi babe join my class in bit.ly/kemanggisansus99"
]
url_regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

## single test
# text = re.sub(url_regex, '', text)
# print(text)

# before preprocess
for x in sample_text:
    print(x)

this is some sketchy links tokopedia.com/cart idk if safe or n lol
http://t.co/cOU2WQ5L4q, this is some good shit
hi babe join my class in bit.ly/kemanggisansus99


In [38]:
for idx, text in enumerate(sample_text):
    sample_text[idx] = re.sub(url_regex, '', text)

# after preprocess
for x in sample_text:
    print(x)

this is some sketchy links  idk if safe or n lol
, this is some good shit
hi babe join my class in 


### Removing the trailing noises

In [29]:
sample_text = [
    "...Son of a bitch took my Tic Tacs.",
    '"""..All I wanna do is get money and fuck model bitches!"" - Russell Simmons"',
]

## if you want more 
# sample2 = [
#     'got ya bitch tip toeing on my hardwood floors ""  http://t.co/cOU2WQ5L4q"',
#     'pussy is a powerful drug @juanwmv   #HappyHumpDay http://t.co/R8jsymiB5b',
#     '...Son of a bitch took my Tic Tacs.',
#     '"@2015seniorprobs: I probably wouldn’t mind school as much if we didn’t have to deal with bitch ass teachers"". Retweet',
#     '"""..All I wanna do is get money and fuck model bitches!"" - Russell Simmons"',
#     "@AutoWorId: Hennessey Venom GT  http://t.co/i8eGMnKaJ9 that's one sexy bitch",    
# ]

In [30]:
for x in sample_text:
    ans = x.replace('"', '')
    ans = ans.replace("'", '')
    ans = ans.replace("..", '')
    print(ans)
    

.Son of a bitch took my Tic Tacs.
All I wanna do is get money and fuck model bitches! - Russell Simmons


### Tokenize the words and padding
Tokenize:
- changing text to number (idk why, but maybe to accelerate the calculation)

Padding:
- lstm input need to be consistent

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [8]:
# sample text for tokenization
sample_text = [
    "user woman shouldnt complain cleaning house . man always take trash .",
    "user boy dats cold.tyga dwn bad cuffin dat hoe 1st place",
    "user Dawg user ever fuck bitch start cry ? confused shit",
    "user user look like tranny",
    "user shit hear might true might faker bitch told ya",
    "user shit blows meclaim faithful somebody still fucking hoes",
    "user sit HATE another bitch got much shit going",
]

# Tokenize
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sample_text)
tokenized_words = tokenizer.texts_to_sequences(sample_text)
vocab_size = len(tokenizer.word_index) + 1

In [22]:
print(vocab_size)

52


In [18]:
## i forgot why i put this here, but i think its to prove something
# print(len(tokenized_words))
# print(len(sample_text))

for i in range(len(tokenized_words)):
    print("token len = ", len(tokenized_words[i]))
    print(f"real       = {sample_text[i]}\ntokenized  = {tokenized_words[i]}\n")

token len =  10
real       = user woman shouldnt complain cleaning house . man always take trash .
tokenized  = [1, 5, 6, 7, 8, 9, 10, 11, 12, 13]

token len =  12
real       = user boy dats cold.tyga dwn bad cuffin dat hoe 1st place
tokenized  = [1, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]

token len =  10
real       = user Dawg user ever fuck bitch start cry ? confused shit
tokenized  = [1, 25, 1, 26, 27, 3, 28, 29, 30, 2]

token len =  5
real       = user user look like tranny
tokenized  = [1, 1, 31, 32, 33]

token len =  10
real       = user shit hear might true might faker bitch told ya
tokenized  = [1, 2, 34, 4, 35, 4, 36, 3, 37, 38]

token len =  9
real       = user shit blows meclaim faithful somebody still fucking hoes
tokenized  = [1, 2, 39, 40, 41, 42, 43, 44, 45]

token len =  9
real       = user sit HATE another bitch got much shit going
tokenized  = [1, 46, 47, 48, 3, 49, 50, 2, 51]



In [19]:
# padding input uses the tokenized word
# Padding
max_length = max(len(seq) for seq in tokenized_words)
padded_sequences = pad_sequences(tokenized_words, maxlen=max_length)
max_sequence_length = max(len(seq) for seq in tokenized_words)

In [21]:
for x in padded_sequences:
    print(x)

[ 0  0  1  5  6  7  8  9 10 11 12 13]
[ 1 14 15 16 17 18 19 20 21 22 23 24]
[ 0  0  1 25  1 26 27  3 28 29 30  2]
[ 0  0  0  0  0  0  0  1  1 31 32 33]
[ 0  0  1  2 34  4 35  4 36  3 37 38]
[ 0  0  0  1  2 39 40 41 42 43 44 45]
[ 0  0  0  1 46 47 48  3 49 50  2 51]


### Testing all
1. change the user tags
2. remove all html entity
3. remove urls
4. remove stopwords
5. remove trailing
6. padding and tokenizing

In [3]:
test_list = [
    '"""&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;',
    "pussy is a powerful drug @juanwmv "" &#128517; #HappyHumpDay http://t.co/R8jsymiB5b",
    '"""@Almightywayne__: @JetsAndASwisher @Gook____ bitch fuck u http://t.co/pXmGA68NC1"" maybe youll get better. Just http://t.co/TPreVwfq0S""',
    '"@2015seniorprobs: I probably wouldn&#8217;t mind school as much if we didn&#8217;t have to deal with bitch ass teachers"". Retweet',
    '"""..All I wanna do is get money and fuck model bitches!"" - Russell Simmons"',
    "@BestProAdvice: The facts on tattoos...tattoo http://t.co/ZwnbhpDZ8e"" he's a pussy with not tattooing them nipples",
]

# before
for x in test_list:
    print(x)
# to contain the clean shit
clean = []

"""&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;
pussy is a powerful drug @juanwmv  &#128517; #HappyHumpDay http://t.co/R8jsymiB5b
"""@Almightywayne__: @JetsAndASwisher @Gook____ bitch fuck u http://t.co/pXmGA68NC1"" maybe youll get better. Just http://t.co/TPreVwfq0S""
"@2015seniorprobs: I probably wouldn&#8217;t mind school as much if we didn&#8217;t have to deal with bitch ass teachers"". Retweet
"""..All I wanna do is get money and fuck model bitches!"" - Russell Simmons"
@BestProAdvice: The facts on tattoos...tattoo http://t.co/ZwnbhpDZ8e he's a pussy with not tattooing them nipples


In [53]:
stop_words = set(stopwords.words('english'))
# add rt to remove retweet in dataset (noise)
stop_words.add("rt")

def remove_emojis(raw_text):
    entity_regex = r"&[^\s;]+;"
    text = re.sub(entity_regex, "", raw_text)
    return text

def remove_stopwords(raw_text):
    tokenize = nltk.word_tokenize(raw_text)
    text = [word for word in tokenize if not word.lower() in stop_words]
    text = " ".join(text)

    return text

def remove_url(raw_text):
    url_regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    text = re.sub(url_regex, '', raw_text)

    return text

def change_user(raw_text):
    regex = r"@([^ ]+)"
    text = re.sub(regex, "user", raw_text)

    return text

def remove_trailing_noise(raw_text):
    text = raw_text.replace('"', '')
    text = text.replace("'", '')
    text = text.replace("!", '')
    text = text.replace("`", '')
    # text = text.replace("”", '')
    text = text.replace("..", '')

    return text

def remove_noise(datas):
    clean = []
    # change the @xxx into "user"
    clean = [change_user(text) for text in datas]
    # remove emojis (specifically unicode emojis)
    clean = [remove_emojis(text) for text in clean]
    # remove urls
    clean = [remove_url(text) for text in clean]
    # remove trailing stuff
    clean = [remove_trailing_noise(text) for text in clean]
    # remove stopwords
    clean = [remove_stopwords(text) for text in clean]
    return clean
        

In [54]:
df = pd.read_csv("./labeled_data.csv")
raw_dataset = df['tweet']
raw_dataset = list(raw_dataset[:20])

for x in raw_dataset:
    print(x)

!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...
!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!
!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit
!!!!!!!!! RT @C_G_Anderson: @viva_based she look like a tranny
!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;
!!!!!!!!!!!!!!!!!!"@T_Madison_x: The shit just blows me..claim you so faithful and down for somebody but still fucking with hoes! &#128514;&#128514;&#128514;"
!!!!!!"@__BrighterDays: I can not just sit up and HATE on another bitch .. I got too much shit going on!"
!!!!&#8220;@selfiequeenbri: cause I'm tired of you big bitches coming for us skinny girls!!&#8221;
" &amp; you might not get ya bitch back &amp; thats that "
" @rhythmixx_ :hobbies inclu

In [55]:
good_tweet = remove_noise(raw_dataset)
for x in good_tweet:
    print(x)

user woman shouldnt complain cleaning house . man always take trash .
user boy dats cold.tyga dwn bad cuffin dat hoe 1st place
user Dawg user ever fuck bitch start cry ? confused shit
user user look like tranny
user shit hear might true might faker bitch told ya
user shit blows meclaim faithful somebody still fucking hoes
user sit HATE another bitch got much shit going
user cause Im tired big bitches coming us skinny girls
might get ya bitch back thats
user : hobbies include : fighting Mariam bitch
Keeks bitch curves everyone lol walked conversation like . Smh
Murda Gang bitch Gang Land
hoes smoke losers ? yea . go IG
bad bitches thing like
bitch get
bitch nigga miss
bitch plz whatever
bitch love
bitches get cut everyday B
black bottle bad bitch
