# Import Basic Libraries

In [1]:
import sklearn
import numpy as np
import pandas as pd

# Import Data

In [21]:
# training data
train = pd.read_csv("cyberbullying_tweets.csv")

# test data
test = pd.read_csv("./content/test.csv")

# Data Exploration (Exploratory Data Analysis)

In [4]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [5]:
test.tail()

Unnamed: 0,id,tweet
17192,49155,thought factory: left-right polarisation! #tru...
17193,49156,feeling like a mermaid ð #hairflip #neverre...
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...
17195,49158,"happy, at work conference: right mindset leads..."
17196,49159,"my song ""so glad"" free download! #shoegaze ..."


In [6]:
# non-racist/sexist related tweets
sum(train["label"] == 0)

29720

In [7]:
# racist/sexist related tweets
sum(train["label"] == 1)

2242

In [8]:
# check if there are any missing values
train.isnull().sum()
#train.isnull().values.any()

id       0
label    0
tweet    0
dtype: int64

# Data cleaning

In [0]:
#install tweet-preprocessor to clean tweets
!pip install tweet-preprocessor



In [9]:
# remove special characters using the regular expression library
import re

#set up punctuations we want to be replaced
REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\|)|(\()|(\))|(\[)|(\])|(\%)|(\$)|(\>)|(\<)|(\{)|(\})")
REPLACE_WITH_SPACE = re.compile("(<br\s/><br\s/?)|(-)|(/)|(:).")

In [10]:
import preprocessor as p

# custum function to clean the dataset (combining tweet_preprocessor and reguar expression)
def clean_tweets(df):
  tempArr = []
  for line in df:
    # send to tweet_processor
    tmpL = p.clean(line)
    # remove puctuation
    tmpL = REPLACE_NO_SPACE.sub("", tmpL.lower()) # convert all tweets to lower cases
    tmpL = REPLACE_WITH_SPACE.sub(" ", tmpL)
    tempArr.append(tmpL)
  return tempArr

In [11]:
# clean training data
train_tweet = clean_tweets(train["tweet"])
train_tweet = pd.DataFrame(train_tweet)

In [13]:
# append cleaned tweets to the training data
train["clean_tweet"] = train_tweet

# compare the cleaned and uncleaned tweets
train.head(10)

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for credit i cant use cause they dont o...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,i love u take with u all the time in ur
4,5,0,factsguide: society now #motivation,factsguide society now
5,6,0,[2/2] huge fan fare and big talking before the...,2 2 huge fan fare and big talking before they ...
6,7,0,@user camping tomorrow @user @user @user @use...,camping tomorrow danny
7,8,0,the next school year is the year for exams.ð...,the next school year is the year for exams can...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,we won love the land
9,10,0,@user @user welcome here ! i'm it's so #gr...,welcome here im its so


In [14]:
# clean the test data and append the cleaned tweets to the test data
test_tweet = clean_tweets(test["tweet"])
test_tweet = pd.DataFrame(test_tweet)
# append cleaned tweets to the training data
test["clean_tweet"] = test_tweet

# compare the cleaned and uncleaned tweets
test.tail()

Unnamed: 0,id,tweet,clean_tweet
17192,49155,thought factory: left-right polarisation! #tru...,thought factory left right polarisation &gt3
17193,49156,feeling like a mermaid ð #hairflip #neverre...,feeling like a mermaid
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...,today in omg &amp used words like assets&ampli...
17195,49158,"happy, at work conference: right mindset leads...",happy at work conference right mindset leads t...
17196,49159,"my song ""so glad"" free download! #shoegaze ...",my song so glad free download


In [25]:
import pickle as pkl
import pycld2

def sample_encode(df):

    df = df.loc[df['tweet_text'].apply(lambda x: safe_detect(x) == 'en')]

    # Sample 3,740 observation for each category
    sample_religion = df.loc[df['cyberbullying_type']=='religion'].sample\
        (n=3450, random_state=42)

    sample_age = df.loc[df['cyberbullying_type']=='age'].sample\
        (n=3450, random_state=42)

    sample_other_cyberbullying = df.loc[df['cyberbullying_type']=='other_cyberbullying'].sample\
        (n=3450, random_state=42)

    sample_ethnicity = df.loc[df['cyberbullying_type']=='ethnicity'].sample\
        (n=3450, random_state=42)

    sample_gender = df.loc[df['cyberbullying_type']=='gender'].sample\
        (n=3450, random_state=42)

    samples_df = pd.concat([sample_religion,sample_age,
                            sample_other_cyberbullying,sample_ethnicity,sample_gender])

    # Manually encode the different types of cyberbullying/non-cyberbullying
    samples_df.loc[:, 'cyberbullying_type'] = 1

    df.loc[df['cyberbullying_type'] == 'not_cyberbullying', 'cyberbullying_type'] = 0
    non_cyberbullying_df = df.loc[df['cyberbullying_type']==0].copy()

    # Ensure target column is of type int
    non_cyberbullying_df['cyberbullying_type'] = non_cyberbullying_df['cyberbullying_type']\
        .astype(int)
    samples_df['cyberbullying_type'] = samples_df['cyberbullying_type'].astype(int)

    df = pd.concat([non_cyberbullying_df,samples_df])

    # Check Unique y values
    y_values = df['cyberbullying_type'].value_counts()
    print(f'Check y values and counts \n {y_values}')

    return df

###Intergrated code from project
def safe_detect(text):
    try:
        return pycld2.detect(text)[2][0][1]
    except Exception:
        return 'unknown'

def load_pkl_data(path):
    with open(path, 'rb') as f:
        data = pkl.load(f)
    return data

def preprocess_data(df):

    # Check for missing values
    missing_values = df.isna().sum()
    print(f'Missing Values:\n{missing_values}')

    # Find all tweets that are less than 3 words and remove them
    df['tweet_length'] = [len(text.split()) for text in df.tweet_text]
    df = df.loc[df['tweet_length'] >= 3]
    df.drop('tweet_length', axis=1, inplace=True)

    # Clean the text by removing special characters and converting all text to lower case
    df['tweet_text'] = df['tweet_text'].apply(clean_tweets)

    return df

def driver():
    data = load_pkl_data('./data 2/formspring_data.pkl')
    formspring_df = pd.DataFrame(data)

    # Check the raw formspring data
    print(f'Check raw formspring_df: {formspring_df.head(5)}')
    formspring_df.rename(columns={'text':'tweet_text', 'label':
        'cyberbullying_type'}, inplace=True)
    formspring_df = preprocess_data(formspring_df)

    # Remove any non english entries
    formspring_df = formspring_df.loc[formspring_df['tweet_text'].apply(lambda x: safe_detect(x) == 'en')]

    # Check cleaned formspring data
    print(f'Check cleaned formspring_df: {formspring_df.head(5)}')

    df = pd.read_csv("cyberbullying_tweets.csv")

    # Check raw cyberbullying tweet data
    print(f'Check raw cyberbullying tweet data: {df.head(5)}')

    df = preprocess_data(df)

    # Check cleaned cyberbullying tweet data
    print(f'Check cleaned cyberbullying tweet data: {df.head(5)}')
    df = sample_encode(df)

    # Merge both dataframes for data balancing
    merged_df = pd.concat([df, formspring_df])

    # Check merged dataframe data
    # print(f'Check merged dataframe {merged_df.head(10)}')
    count_cyberbullying = len(merged_df[merged_df['cyberbullying_type'] == 1])
    count_non_cyberbullying = len\
        (merged_df[merged_df['cyberbullying_type'] == 0])

    print(f'Check here {count_cyberbullying}')
    print(f'Check here {count_non_cyberbullying}')

    return merged_df

# Test and Train split


In [None]:
from sklearn.model_selection import train_test_split

merged_df = driver()
# extract the labels from the train data
# y = train.label.values
y = merged_df.cyberbullying_type.values

# use 70% for the training and 30% for the test
x_train, x_test, y_train, y_test = train_test_split(merged_df.tweet_text.values,
                                                    y,
                                                    stratify=y, 
                                                    random_state=1, 
                                                    test_size=0.3, shuffle=True)

Check raw formspring_df:                                                 text  label
0  what is your favorite song ? d i like too many...      0
1                                 3 3 ? haha jk ! 33      0
2  hey angel you duh sexy really ? ! ? ! thanks ?...      0
3                                                         0
4                                      meowww rawr ?      0
Missing Values:
tweet_text            0
cyberbullying_type    0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tweet_text'] = df['tweet_text'].apply(clean_tweets)


Check cleaned formspring_df: Empty DataFrame
Columns: [tweet_text, cyberbullying_type]
Index: []
Check raw cyberbullying tweet data:                                           tweet_text cyberbullying_type
0  In other words #katandandre, your food was cra...  not_cyberbullying
1  Why is #aussietv so white? #MKR #theblock #ImA...  not_cyberbullying
2  @XochitlSuckkks a classy whore? Or more red ve...  not_cyberbullying
3  @Jason_Gio meh. :P  thanks for the heads up, b...  not_cyberbullying
4  @RudhoeEnglish This is an ISIS account pretend...  not_cyberbullying
Missing Values:
tweet_text            0
cyberbullying_type    0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


# Vectorize tweets using CountVectorizer

CountVectorizer Example

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
documents = ["This is Import Data's Youtube channel",
             "Data science is my passion and it is fun!",
             "Please subscribe to my channel"]

# initializing the countvectorizer
vectorizer = CountVectorizer()

# tokenize and make the document into a matrix
document_term_matrix = vectorizer.fit_transform(documents)

# check the result
pd.DataFrame(document_term_matrix.toarray(), columns = vectorizer.get_feature_names())

Unnamed: 0,and,channel,data,fun,import,is,it,my,passion,please,science,subscribe,this,to,youtube
0,0,1,1,0,1,1,0,0,0,0,0,0,1,0,1
1,1,0,1,1,0,2,1,1,1,0,1,0,0,0,0
2,0,1,0,0,0,0,0,1,0,1,0,1,0,1,0


In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# vectorize tweets for model building
vectorizer = CountVectorizer(binary=True, stop_words='english')

# learn a vocabulary dictionary of all tokens in the raw documents
vectorizer.fit(list(x_train) + list(x_test))

# transform documents to document-term matrix
x_train_vec = vectorizer.transform(x_train)
x_test_vec = vectorizer.transform(x_test)

# Model building

Apply Support Vetor Classifier (SVC)

In [19]:
from sklearn import svm
# classify using support vector classifier
svm = svm.SVC(kernel = 'linear', probability=True)

# fit the SVC model based on the given training data
prob = svm.fit(x_train_vec, y_train).predict_proba(x_test_vec)

# perform classification and prediction on samples in x_test
y_pred_svm = svm.predict(x_test_vec)

# Accuracy score for SVC


In [20]:
from sklearn.metrics import accuracy_score
print("Accuracy score for SVC is: ", accuracy_score(y_test, y_pred_svm) * 100, '%')

Accuracy score for SVC is:  94.86912086766085 %
