In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import random
import re


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2

from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the raw tweet dataset (path is relative to the notebook's working directory).
df = pd.read_csv("cyberbullying_tweets.csv")

# Class names used as labels for the cyberbullying categories.
my_tags = [
    'religion',
    'age',
    'gender',
    'ethnicity',
    'not_cyberbullying',
    'other_cyberbullying',
]

In [3]:
# Display the loaded frame (the notebook renders a truncated view of the rows).
df

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying
...,...,...
47687,"Black ppl aren't expected to do anything, depe...",ethnicity
47688,Turner did not withhold his disappointment. Tu...,ethnicity
47689,I swear to God. This dumb nigger bitch. I have...,ethnicity
47690,Yea fuck you RT @therealexel: IF YOURE A NIGGE...,ethnicity


In [5]:
# Keep only the label and text columns, dropping rows whose tweet text is missing.
col = ['cyberbullying_type', 'tweet_text']
# Select rows and columns in a single .loc call and take an explicit copy:
# the result is then an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning (pandas versions without copy-on-write).
df = df.loc[df['tweet_text'].notna(), col].copy()

In [6]:
# Set the column names explicitly; these are the same two names, in the same
# order, that the column selection above already produced.
df.columns = ['cyberbullying_type', 'tweet_text']

In [7]:
# Encode each distinct cyberbullying_type as an integer code, numbered in
# order of first appearance, and store it alongside the text.
category_codes, _ = pd.factorize(df['cyberbullying_type'])
df['category_id'] = category_codes

In [11]:
# Preview the first rows to check the newly added category_id column.
df.head()

Unnamed: 0,cyberbullying_type,tweet_text,category_id
0,not_cyberbullying,"In other words #katandandre, your food was cra...",0
1,not_cyberbullying,Why is #aussietv so white? #MKR #theblock #ImA...,0
2,not_cyberbullying,@XochitlSuckkks a classy whore? Or more red ve...,0
3,not_cyberbullying,"@Jason_Gio meh. :P thanks for the heads up, b...",0
4,not_cyberbullying,@RudhoeEnglish This is an ISIS account pretend...,0


(47692, 19580)

In [10]:
# Preview the first rows of df again (same display as the previous preview cell).
df.head()

Unnamed: 0,cyberbullying_type,tweet_text,category_id
0,not_cyberbullying,"In other words #katandandre, your food was cra...",0
1,not_cyberbullying,Why is #aussietv so white? #MKR #theblock #ImA...,0
2,not_cyberbullying,@XochitlSuckkks a classy whore? Or more red ve...,0
3,not_cyberbullying,"@Jason_Gio meh. :P thanks for the heads up, b...",0
4,not_cyberbullying,@RudhoeEnglish This is an ISIS account pretend...,0


In [12]:
# One row per distinct (label, id) pair, ordered by id.
category_id_df = (
    df[['cyberbullying_type', 'category_id']]
    .drop_duplicates()
    .sort_values('category_id')
)

# Lookup tables in both directions: label -> id and id -> label.
category_to_id = dict(
    zip(category_id_df['cyberbullying_type'], category_id_df['category_id'])
)
id_to_category = dict(
    zip(category_id_df['category_id'], category_id_df['cyberbullying_type'])
)
df.head()

Unnamed: 0,cyberbullying_type,tweet_text,category_id
0,not_cyberbullying,"In other words #katandandre, your food was cra...",0
1,not_cyberbullying,Why is #aussietv so white? #MKR #theblock #ImA...,0
2,not_cyberbullying,@XochitlSuckkks a classy whore? Or more red ve...,0
3,not_cyberbullying,"@Jason_Gio meh. :P thanks for the heads up, b...",0
4,not_cyberbullying,@RudhoeEnglish This is an ISIS account pretend...,0


In [13]:
# Build TF-IDF features over unigrams and bigrams of the tweet text.
#   sublinear_tf=True   -> use 1 + log(tf) instead of raw term frequency
#   min_df=5            -> ignore terms that occur in fewer than 5 tweets
#   norm='l2'           -> scale each document vector to unit Euclidean length
#   stop_words='english'-> drop scikit-learn's built-in English stop words
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

# Keep the result as a SciPy sparse matrix. Densifying it with .toarray() would
# allocate rows x columns float64 values (several GB at this corpus size),
# and chi2 as well as scikit-learn's linear models accept sparse input directly.
# If a dense view is ever needed, call .toarray() on a small slice only.
features = tfidf.fit_transform(df.tweet_text)
labels = df.category_id
features.shape

(47692, 19580)

In [15]:
# For every category, print the N unigrams and N bigrams whose presence is most
# strongly associated (by chi-squared statistic) with that category vs. the rest.
N = 2

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# get_feature_names_out() and fall back only on older versions that lack it.
_get_feature_names = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names

# The vocabulary and each term's n-gram length do not depend on the category,
# so compute them once instead of on every loop iteration.
all_feature_names = np.asarray(_get_feature_names())
ngram_sizes = np.array([len(name.split(' ')) for name in all_feature_names])

for cyberbullying_type, category_id in sorted(category_to_id.items()):
    # One-vs-rest target: True for tweets in this category, False otherwise.
    features_chi2 = chi2(features, labels == category_id)
    # Ascending order of the chi2 statistic, so the strongest terms come last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    ranked_sizes = ngram_sizes[indices]
    unigrams = feature_names[ranked_sizes == 1]
    bigrams = feature_names[ranked_sizes == 2]
    print("# '{}':".format(cyberbullying_type))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))



# 'age':
  . Most correlated unigrams:
. high
. school
  . Most correlated bigrams:
. bullied high
. high school




# 'ethnicity':
  . Most correlated unigrams:
. nigger
. dumb
  . Most correlated bigrams:
. ass nigger
. dumb ass




# 'gender':
  . Most correlated unigrams:
. gay
. rape
  . Most correlated bigrams:
. rape joke
. rape jokes




# 'not_cyberbullying':
  . Most correlated unigrams:
. bullying
. mkr
  . Most correlated bigrams:
. high school
. kat andre




# 'other_cyberbullying':
  . Most correlated unigrams:
. school
. https
  . Most correlated bigrams:
. dumb ass
. high school
# 'religion':
  . Most correlated unigrams:
. muslim
. muslims
  . Most correlated bigrams:
. christian woman
. islamic terrorism




In [19]:
# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split
# reproducible between runs.
tweet_texts = df['tweet_text']
tweet_labels = df['cyberbullying_type']
xTrain, xTest, yTrain, yTest = train_test_split(
    tweet_texts,
    tweet_labels,
    test_size=0.2,
    random_state=42,
)

# Bag-of-words counts, with the vocabulary learned from the training split only.
count_vect = CountVectorizer()
xTrain_counts = count_vect.fit_transform(xTrain)

# Re-weight the raw counts into TF-IDF scores.
tfidf_transformer = TfidfTransformer()
xTrain_tfidf = tfidf_transformer.fit_transform(xTrain_counts)

# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
clf = MultinomialNB()
clf.fit(xTrain_tfidf, yTrain)