# SVM

## Imports

In [103]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [104]:
from google.colab import files
import pandas as pd
import io
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import svm
from sklearn.metrics import average_precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re

## Preparing Data

In [105]:
# Pre-process function

nDAY = r'(?:[0-3]?\d)'  # day can be from 1 to 31 with a leading zero 
nMNTH = r'(?:11|12|10|0?[1-9])' # month can be 1 to 12 with a leading zero
nYR = r'(?:(?:19|20)\d\d)'  # I've restricted the year to being in 20th or 21st century on the basis 
                            # that people doon't generally use all number format for old dates, but write them out 
nDELIM = r'(?:[\/\-\._])?'  # 
NUM_DATE = f"""
    (?P<num_date>
        (?:^|\D) # new bit here
        (?:
        # YYYY-MM-DD
        (?:{nYR}(?P<delim1>[\/\-\._]?){nMNTH}(?P=delim1){nDAY})
        |
        # YYYY-DD-MM
        (?:{nYR}(?P<delim2>[\/\-\._]?){nDAY}(?P=delim2){nMNTH})
        |
        # DD-MM-YYYY
        (?:{nDAY}(?P<delim3>[\/\-\._]?){nMNTH}(?P=delim3){nYR})
        |
        # MM-DD-YYYY
        (?:{nMNTH}(?P<delim4>[\/\-\._]?){nDAY}(?P=delim4){nYR})
        )
        (?:\D|$) # new bit here
    )"""
DAY = r"""
(?:
    # search 1st 2nd 3rd etc, or first second third
    (?:[23]?1st|2{1,2}nd|\d{1,2}th|2?3rd|first|second|third|fourth|fifth|sixth|seventh|eighth|nineth)
    |
    # or just a number, but without a leading zero
    (?:[123]?\d)
)"""
MONTH = r'(?:january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)'
YEAR = r"""(?:(?:[12]?\d|')?\d\d)"""
DELIM = r'(?:\s*(?:[\s\.\-\\/,]|(?:of))\s*)'

YEAR_4D = r"""(?:[12]\d\d\d)"""
DATE_PATTERN = f"""(?P<wordy_date>
    # non word character or start of string
    (?:^|\W)
        (?:
            # match various combinations of year month and day 
            (?:
                # 4 digit year
                (?:{YEAR_4D}{DELIM})?
                    (?:
                    # Day - Month
                    (?:{DAY}{DELIM}{MONTH})
                    |
                    # Month - Day
                    (?:{MONTH}{DELIM}{DAY})
                    )
                # 2 or 4 digit year
                (?:{DELIM}{YEAR})?
            )
            |
            # Month - Year (2 or 3 digit)
            (?:{MONTH}{DELIM}{YEAR})
        )
    # non-word character or end of string
    (?:$|\W)
)"""

TIME = r"""(?:
(?:
# first number should be 0 - 59 with optional leading zero.
[012345]?\d
# second number is the same following a colon
:[012345]\d
)
# next we add our optional seconds number in the same format
(?::[012345]\d)?
# and finally add optional am or pm possibly with . and spaces
(?:\s*(?:a|p)\.?m\.?)?
)"""

COMBINED = f"""(?P<combined>
    (?:
        # time followed by date, or date followed by time
        {TIME}?{DATE_PATTERN}{TIME}?
        |
        # or as above but with the numeric version of the date
        {TIME}?{NUM_DATE}{TIME}?
    ) 
    # or a time on its own
    |
    (?:{TIME})
)"""
myDate = re.compile(COMBINED, re.IGNORECASE | re.VERBOSE | re.UNICODE)


# Pre-Processing before Data Generation
def preProcess(X):  
  X = X.split()
  X_removeWebsites = [x for x in X if not (".com" in x)]
  X_removeBracketedValues = [x for x in X_removeWebsites if not (x.startswith("[") or x.startswith('<'))]
  X_removeAndSymbolWords = [x for x in X_removeBracketedValues if not (x.startswith("&#") or ("$#" in x))]
  X_removeAtSymbolWords = [x for x in X_removeAndSymbolWords if not (x.startswith("@") or x.startswith('"@') or x.startswith('.@') or x.startswith('(@'))]  
  X_removeHyperlinks = [x for x in X_removeAtSymbolWords if not (x.startswith("http") or x.startswith("https") or ("http" in x) or ("https" in x))]  
  X_removeRT = [x for x in X_removeHyperlinks if not x=="RT"]
  X_removeHash = [x for x in X_removeRT if not x.startswith("#")]
  X_removeAmpersand = [x for x in X_removeHash if not x.startswith("&")]
  X_removeDates = [ x for x in X_removeAmpersand if not myDate.match(x)]
  X_removeNumbers = [ x for x in X_removeDates if not x.isnumeric()]
  ret = ' '.join(X_removeNumbers)
  return ret

In [166]:
# ORIGINAL DATA SET
# -------------------------------------------------------------------------------------------------------------------
# Get original data frame
dfOriginal = pd.read_csv("/content/drive/Shareddrives/CS522 term project - bullying hatespeech/labeled_data_trunc.csv")
dfOriginal = dfOriginal[dfOriginal['class']!=1].reset_index(drop=True)

# Preprocess Original data frame
dfOriginal['tweet'] = dfOriginal['tweet'].apply(preProcess)



# GENERATED DATA SET
# -------------------------------------------------------------------------------------------------------------------
# Get preProcessed data
dfGEN = pd.read_csv("/content/drive/Shareddrives/CS522 term project - bullying hatespeech/Datasets/preProcessed_Final_2000.csv")
dfGEN = dfGEN[["class", "tweet"]]

# Get pre-postProcessed data
# dfGEN = pd.read_csv("/content/drive/Shareddrives/CS522 term project - bullying hatespeech/Datasets/prePostProcessed_Final_2000.csv")
# dfGEN = dfGEN[["class", "tweet"]]

# Define base 200 rows
dfBase200 = dfGEN[0:200]

# Shuffle the remainder for better distribution
dfGEN = shuffle(dfGEN[200:]).reset_index(drop=True)


In [162]:
dfHate = dfOriginal[dfOriginal['class']==0].reset_index(drop=True)
dfHate

Unnamed: 0,class,tweet
0,0,"queer"" gaywad"
1,0,"alsarabsss"" hes a beaner smh you can tell hes ..."
2,0,"you're fucking gay, blacklisted hoe"" Holding o..."
3,0,LMFAOOOO I HATE BLACK PEOPLE This is why there...
4,0,"""At least I'm not a nigger"" Lmfao"
...,...,...
1425,0,this guy is the biggest faggot omfg
1426,0,which one of these names is more offensive kik...
1427,0,you a pussy ass nigga and I know it nigga.
1428,0,you're all niggers


In [108]:
dfNeither = dfOriginal[dfOriginal['class']==2].reset_index(drop=True)
dfNeither

Unnamed: 0,class,tweet
0,2,!!! As a woman you shouldn't complain about cl...
1,2,""" momma said no pussy cats inside my doghouse """
2,2,-SimplyAddictedToGuys woof woof hot scally lad
3,2,woof woof and hot soles
4,2,"Lemmie eat a Oreo do these dishes."" One oreo? Lol"
...,...,...
4158,2,yaya ho.. cute avi tho I had no idea she was s...
4159,2,yea so about 's new friend.. all my friends kn...
4160,2,"you know what they say, the early bird gets th..."
4161,2,"you've gone and broke the wrong heart baby, an..."


In [109]:
dfBase200

Unnamed: 0,class,tweet
0,2,my pops is from New York. My family is Yankee ...
1,2,Thanks Carolina fans the flipped birds+few bot...
2,2,Far right hero and all-around RWNJ D'Souza too...
3,2,When your phone goes off in class and the teac...
4,0,I saw a great chimpout summers ago. A middle a...
...,...,...
195,2,LMAO Joe was calling whitehouse offering advic...
196,2,online soon? I didn't have enough to get it af...
197,0,"lmao, soft ass nigga trying to act all tough, ..."
198,0,"Bro everybody is a ho then, fuck it, your favo..."


In [110]:
dfGEN

Unnamed: 0,class,tweet
0,2,"r as the girl who wears the hoodies."" The foll..."
1,2,"just came out.Well that is my question,But I ..."
2,2,However people with a great observational ski...
3,2,I had to do some more look at this.I am very ...
4,2,"Any thoughts?(also, why do we even send the w..."
...,...,...
1796,2,I love to believe it.There was no shortage of...
1797,2,to watch the Yankees lose to the Marlins a ga...
1798,2,All should come with the right documentation....
1799,2,A local yokel went out of his way to get my a...


In [167]:
# Distribution of the classes --> Uncomment your line to run tests

# TEST DATA
# --------------------------------------------------
# Original
# testData = df[df['class']!=1][3000:3200]

# Preprocessed
testData = dfOriginal[3000:3200]

# frames = [dfHate[1300:1400], dfNeither[3800:3900]]
# testData = pd.concat(frames)


# TRAINING DATA
# --------------------------------------------------
# Uncomment your line

# 200
# Only use base 200 - no change needed

# 500
# dfGEN = dfGEN[dfGEN['class']!=1][0:300]

# 1000
# dfGEN = dfGEN[dfGEN['class']!=1][0:800]

# 2000
dfGEN = dfGEN[dfGEN['class']!=1][0:1800]

# Reset index
dfGEN = dfGEN.reset_index(drop=True)
dfGEN

testData

Unnamed: 0,class,tweet
3000,2,"Mace For Your Wife, Charlie Brown?"
3001,2,Mad respect to Affleck for refusing to wear a ...
3002,2,Madea and this old hillbilly are hilarious tog...
3003,2,Magellan? You suck at trash talk.
3004,2,Majority of it is trash anyway.
...,...,...
3195,2,On the secret Party mission! Live tweaking to ...
3196,2,Once the brainwork is done on pacemakers at me...
3197,2,One man's trash is another's Transformer
3198,2,One mans trash is another mans treasure


In [168]:
# Combining the original 200 + the generated data

frames = [dfBase200, dfGEN]
df_concat = pd.concat(frames)


# df_concat = df_concat.drop(columns=['index'])
df_concat = df_concat.reset_index(drop=True)
# df_concat = df_concat.rename(columns = {"level_0":"index"})


# # Fix any NaN values
df_concat["tweet"].fillna("", inplace = True)

df_concat

Unnamed: 0,class,tweet
0,2,my pops is from New York. My family is Yankee ...
1,2,Thanks Carolina fans the flipped birds+few bot...
2,2,Far right hero and all-around RWNJ D'Souza too...
3,2,When your phone goes off in class and the teac...
4,0,I saw a great chimpout summers ago. A middle a...
...,...,...
1995,2,"3-7;8-33, 3.0-rpg, .977 batting average and 4 ..."
1996,2,- After you have killed two birdsDukePair 4: ...
1997,2,II. Enjoy the full cover and all of the artwo...
1998,2,"The fact is this won't be a coincidence. Cruz,..."


In [169]:
# Divide data as Test/Train set

# TEST SET
# --------------------------------------------------
test_X = testData['tweet']
test_Y = testData['class']


# TRAIN SET
# --------------------------------------------------
# Baseline --> Uncomment to run baseline experiment
# train_X = dfOriginal['tweet'][0:2000]
# train_Y = dfOriginal['class'][0:2000]

# Generated data added --> Uncomment your lines to run
# Training Size = 200
# train_X = dfBase200['tweet'][0:200]
# train_Y = dfBase200['class'][0:200]

# Training Size = 500
# train_X = df_concat['tweet'][0:500]
# train_Y = df_concat['class'][0:500]

# Training Size = 1000
# train_X = df_concat['tweet'][0:1000]
# train_Y = df_concat['class'][0:1000]



# Training Size = 2000
train_X = df_concat['tweet'][0:2000]
train_Y = df_concat['class'][0:2000]


# train_X = df_concat['tweet']
# train_Y = df_concat['class']

In [156]:
train_Y

0      2
1      2
2      2
3      2
4      0
      ..
995    2
996    2
997    2
998    2
999    0
Name: class, Length: 1000, dtype: int64

In [170]:
# Label Encoding

Encoder = LabelEncoder()
train_Y = Encoder.fit_transform(train_Y)
test_Y = Encoder.fit_transform(test_Y)


In [171]:
# Word Vectorization

Tfidf_vector = TfidfVectorizer(max_features=5000)

# BASELINE
# Tfidf_vector.fit(dfOriginal['tweet'])

# GENERATED
Tfidf_vector.fit(df_concat['tweet'])

train_X_tfidf = Tfidf_vector.transform(train_X)
test_X_tfidf = Tfidf_vector.transform(test_X)

## Using SVM Classifier

In [172]:
# Create a classifier
clf = svm.SVC()

# Train the model
clf.fit(train_X_tfidf, train_Y)

# Predict labels on test data
predictions = clf.predict(test_X_tfidf)

# Get the Precision, Recall, F1
print("SVM Precision Score -> ", average_precision_score(test_Y, predictions))
print("SVM Recall Score -> ", recall_score(test_Y, predictions, average='macro'))
print("SVM F1 Score -> ", f1_score(test_Y, predictions, average='macro'))

SVM Precision Score ->  0.8144329896907216
SVM Recall Score ->  0.5714285714285714
SVM F1 Score ->  0.5738636363636364


In [173]:
num_rows = 0
#rowArr = []
for row_index, (input, prediction, label) in enumerate(zip (test_X, predictions, test_Y)):
  if prediction != label:
    num_rows += 1
    #print('Row', row_index, 'has been classified as ', prediction, 'and should be ', label)
    
print(num_rows)

36


In [None]:
import csv
testData.reset_index(inplace=True, drop=True)
testData.to_csv('testData2.csv')
testData

Unnamed: 0,class,tweet
0,2,"Mace For Your Wife, Charlie Brown?"
1,2,Mad respect to Affleck for refusing to wear a ...
2,2,Madea and this old hillbilly are hilarious tog...
3,2,Magellan? You suck at trash talk.
4,2,Majority of it is trash anyway.
...,...,...
195,2,On the secret Party mission! Live tweaking to ...
196,2,Once the brainwork is done on pacemakers at me...
197,2,One man's trash is another's Transformer
198,2,One mans trash is another mans treasure


# GPT2

## Imports

In [None]:
! pip install tensorflow
! pip install transformers
from transformers import pipeline, set_seed



## Text Generation

In [None]:
#Initialize pipeline
generator = pipeline('text-generation', model='gpt2')

#Test case using generator
generator("Hello, I am a language model,", max_length=30, num_return_sequences=5)

generator("Hello, I am a language model,", max_length=60, num_return_sequences=5)

generator(df['tweet'][120], max_length=60, num_return_sequences=5)

# CODE FROM ABOVE FOR REFERENCE
# Hate Speech (200 Records out of 19,190 Total)
# hs_df = df[df['class']==0][0:200]

# Offensive Language (200 Records out of 2,163 Total)
# ol_df = df[df['class']==1][0:200]

# Neither (200 Records out of 1,430 Total)
# neither_df = df[df['class']==2][0:200]

# *** Note: The indeces correspond to the row number from the original data (df)
#           and not the "index" of the row in the new data set

# Generate a hate speech tweet (uses first row):
# generator(hs_df['tweet'][85], max_length=60, num_return_sequences=5)

# Generate an offensive language tweet  (uses first row):
# generator(ol_df['tweet'][0], max_length=60, num_return_sequences=5)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': '"@El_Grillo1: Pit Bulls Photographed As Lovely Fairy Tale Creatures http://t.co/Q0Sm89oOLh&#8221;\n\nThey *are* fairy tale creatures. pic.twitter.com/ZWcLxY0MQ'},
 {'generated_text': '"@El_Grillo1: Pit Bulls Photographed As Lovely Fairy Tale Creatures http://t.co/Q0Sm89oOLh&#8221;\n\nThey *are* fairy tale creatures. pic.twitter.com/vYhwkM0Y9'},
 {'generated_text': '"@El_Grillo1: Pit Bulls Photographed As Lovely Fairy Tale Creatures http://t.co/Q0Sm89oOLh&#8221;\n\nThey *are* fairy tale creatures.\n\n\nThey seem just as charming as those cats & dogs.\n\n\n'},
 {'generated_text': '"@El_Grillo1: Pit Bulls Photographed As Lovely Fairy Tale Creatures http://t.co/Q0Sm89oOLh&#8221;\n\nThey *are* fairy tale creatures. Their whole body is covered! #wtf1\n\n@Bryan'},
 {'generated_text': '"@El_Grillo1: Pit Bulls Photographed As Lovely Fairy Tale Creatures http://t.co/Q0Sm89oOLh&#8221;\n\nThey *are* fairy tale creatures. That\'s some bullshit that\'s so incredibly important. T

In [None]:
hs_df = df[df['class']==0][0:5]
hs_df = hs_df.reset_index()

In [None]:
res_df = generate_data(hs_df, 10)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
res_df

Unnamed: 0,index,class,tweet
0,85,0,"""@Blackman38Tide: @WhaleLookyHere @HowdyDowdy1..."
1,89,0,"""@CB_Baby24: @white_thunduh alsarabsss"" hes a ..."
2,110,0,"""@DevilGrimz: @VigxRArts you're fucking gay, b..."
3,184,0,"""@MarkRoundtreeJr: LMFAOOOO I HATE BLACK PEOPL..."
4,202,0,"""@NoChillPaz: ""At least I'm not a nigger"" http..."
5,0,0,"@gmail.com/contact/comments\n\n[25-Feb-2009, 0..."
6,1,0,adz.com\n\n\nHere's a link to my website: http...
7,2,0,#mwb @DawgBurningBlackWomen #BlackWomenWomens...
8,3,0,"""@CharityMate2 @HairyBabies, @Migaliskr2"" @pi..."
9,4,0,: https://twitter.com/sjhullech/status/7636642...


# function to generate GPT2 text
### Elliot

In [None]:
# this method takes a dataframe and generates data based on it
def generate_data(df, rows_to_generate=200):
  # make an empty copy of data frame
  # new_df = pd.DataFrame(columns=df.columns)
  generator = pipeline('text-generation', model='gpt2')
  total_row_num = len(df.index)
  cur_row = 0
  # reset the row number 
  for i in range(rows_to_generate):
    if cur_row >= total_row_num:
      cur_row = 0
    # print("original: \n")
    # print(hs_df['tweet'][i])
    class_val = hs_df['class'][cur_row]
    length = len(hs_df['tweet'][cur_row])
    gen = generator(hs_df['tweet'][cur_row], max_length=(length*2), num_return_sequences=1)
    long_tweet = gen[0]['generated_text']
    # get rid of original tweet
    new_tweet = long_tweet[length:]
    # print("new: \n")
    # print(new_tweet)
    new_row = {'index': i, 'class':class_val, 'tweet':new_tweet}
    df = df.append(new_row, ignore_index=True)
    # print(df)

  return df

# BERT

### Imports

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import transformers as ppb
import torch
import warnings
warnings.filterwarnings('ignore')

### Loading Pre-Trained BERT Model

In [None]:
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

### Tokenization

In [None]:
tokenized = df['tweet'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

### Padding

In [None]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

### Masking

In [None]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(24783, 481)

### Running the Model

In [None]:
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

# Old SVM cells

In [None]:
# ORIGINAL DATA SET
# -------------------------------------------------------------------------------------------------------------------
# Get original data frame
#df = pd.read_csv("/content/drive/Shareddrives/CS522 term project - bullying hatespeech/labeled_data_trunc.csv")

# Get original data with preprocessing
# dfPRE = pd.read_csv("/content/drive/Shareddrives/CS522 term project - bullying hatespeech/originalPreProcess.csv")

# GENERATED DATA SET
# -------------------------------------------------------------------------------------------------------------------
# Get noProcess generated data frame
dfGEN = pd.read_csv("/content/drive/Shareddrives/CS522 term project - bullying hatespeech/genData200_1800.csv")
dfBase200 = dfGEN[0:200]
dfGEN = dfGEN[200:]
dfGEN = dfGEN.drop(columns=['index'])
dfGEN = dfGEN.reset_index()
dfGEN = dfGEN.rename(columns = {"level_0":"index"})
dfGEN = shuffle(dfGEN).reset_index(drop=True)


#Get preProcessed data
# dfPRE = pd.read_csv("/content/drive/Shareddrives/CS522 term project - bullying hatespeech/Datasets/generated_data_with_long_tweets_2000.csv")
# dfPRE = shuffle(dfGEN[200:]).reset_index(drop=True)

# #Get pre-postProcessed data
# dfPOST = pd.read_csv("/content/drive/Shareddrives/CS522 term project - bullying hatespeech/Datasets/generated_data_with_long_tweets_2000.csv")
# dfPOST = dfPOST[["class", "tweet"]]

# dfBase200 = dfPOST[0:200]
# dfPOST = shuffle(dfPOST[200:]).reset_index(drop=True)