In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Import Libraries**

In [2]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import nltk
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all


    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Unzipping corpora/abc.zip.
       | Downloading package alpino to /root/nltk_data...
       |   Unzipping corpora/alpino.zip.
       | Downloading package averaged_perceptron_tagger to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger.zip.
       | Downloading package averaged_perceptron_tagger_ru to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger_ru.zip.
       | Downloading package basque_grammars to /root/nltk_data...
       |   Unzipping grammars/basque_grammars.zip.
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Unzipping corpora/biocreative_ppi.zip.
       | Downloading package bllip_wsj_no_aux to /root/nltk_data...
       |   Unzipping models/bllip_wsj_no_aux.zip.
       | Downloading package book_grammars to /root/nltk_data...
       |   Unzipping grammars/book_gr


---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

# **Pre-processing**

In [4]:
import gensim
import re
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer, LancasterStemmer
from nltk.stem.porter import *
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from string import punctuation
np.random.seed(2018)

In [5]:
def transformed_data(text): 
    stop_words = stopwords.words('english')
    stemmer = SnowballStemmer('english')
    tokens = word_tokenize(text)
    cleaned_tokens = []

    for tok, tag in pos_tag(tokens):
        tok = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\)]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', tok)
        tok = re.sub("(@[A-Za-z0-9_]+)","", tok)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        if len(tok) > 3 and tok not in punctuation and tok.lower() not in gensim.parsing.preprocessing.STOPWORDS:
          tok = stemmer.stem(WordNetLemmatizer().lemmatize(tok, pos))
          cleaned_tokens.append(tok.lower())
          
    return cleaned_tokens

# **Mendeley Dataset**

In [6]:
aggr = pd.read_csv('/content/drive/MyDrive/Capstone Project/Data/aggression_parsed_dataset.csv')
kaggle = pd.read_csv('/content/drive/MyDrive/Capstone Project/Data/kaggle_parsed_dataset.csv')
toxicity= pd.read_csv('/content/drive/MyDrive/Capstone Project/Data/toxicity_parsed_dataset.csv')
twitter= pd.read_csv('/content/drive/MyDrive/Capstone Project/Data/twitter_parsed_dataset.csv')
twitter_racism = pd.read_csv('/content/drive/MyDrive/Capstone Project/Data/twitter_racism_parsed_dataset.csv')
twitter_sexism = pd.read_csv('/content/drive/MyDrive/Capstone Project/Data/twitter_sexism_parsed_dataset.csv')
youtube = pd.read_csv('/content/drive/MyDrive/Capstone Project/Data/youtube_parsed_dataset.csv')
pan12 = pd.read_csv('/content/drive/MyDrive/Capstone Project/Data/pan12_chatlog.csv')

In [7]:
datasets = {'Aggression': aggr, 
            'Kaggle': kaggle, 
            'Toxicity' : toxicity, 
            'Twitter' : twitter, 
            'Twitter_Racism' : twitter_racism, 
            'Twitter_Sexism' : twitter_sexism, 
            'Youtube' : youtube, 
            'PAN12' : pan12}

In [8]:
for i in datasets:
  print(f'{i}: \n{datasets[i].columns}\n\n')

Aggression: 
Index(['index', 'Text', 'ed_label_0', 'ed_label_1', 'oh_label'], dtype='object')


Kaggle: 
Index(['index', 'oh_label', 'Date', 'Text'], dtype='object')


Toxicity: 
Index(['index', 'Text', 'ed_label_0', 'ed_label_1', 'oh_label'], dtype='object')


Twitter: 
Index(['index', 'id', 'Text', 'Annotation', 'oh_label'], dtype='object')


Twitter_Racism: 
Index(['index', 'id', 'Text', 'Annotation', 'oh_label'], dtype='object')


Twitter_Sexism: 
Index(['index', 'id', 'Text', 'Annotation', 'oh_label'], dtype='object')


Youtube: 
Index(['index', 'UserIndex', 'Text', 'Number of Comments',
       'Number of Subscribers', 'Membership Duration', 'Number of Uploads',
       'Profanity in UserID', 'Age', 'oh_label'],
      dtype='object')


PAN12: 
Index(['Author ID', 'OG_Text', 'Clean_Text', 'Preprocessed_Text', 'Sentiment',
       'Compound Score', 'Positive Score', 'Negative Score', 'Neutral Score'],
      dtype='object')




In [9]:
!pip install tweet-preprocessor

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [10]:
import spacy
import preprocessor as p

#data cleaning
def cleaning(s):
  text = s
  #find all hashtags    
  hashtag=re.findall(r'#(\w+)', text)
  #remove hashtags, URL, emojis, mention, number, etc 
  clean=p.clean(text)
  return clean

###### Aggression

In [11]:
aggr.head()

Unnamed: 0,index,Text,ed_label_0,ed_label_1,oh_label
0,0,`- This is not ``creative``. Those are the di...,1.0,0.0,0
1,1,` :: the term ``standard model`` is itself le...,1.0,0.0,0
2,2,"True or false, the situation as of March 200...",1.0,0.0,0
3,3,"Next, maybe you could work on being less cond...",0.555556,0.444444,0
4,4,This page will need disambiguation.,1.0,0.0,0


In [12]:
aggr = aggr.dropna()

In [13]:
label = {0:'None', 1:'Aggression'}
aggr['Bully'] = aggr['oh_label'].apply(lambda x: label[x])

In [14]:
aggr = aggr.drop(['index', 'ed_label_0', 'ed_label_1', 'oh_label'], axis = 1)

In [15]:
aggr.rename(columns = {'Text':'OG_Text'}, inplace = True)

In [16]:
aggr['Clean_Text'] = aggr['OG_Text'].apply(lambda x: cleaning(x))

In [17]:
aggr['Preprocessed_Text'] = [' '.join(transformed_data(i)) for i in aggr.Clean_Text]
aggr.head()

Unnamed: 0,OG_Text,Bully,Clean_Text,Preprocessed_Text
0,`- This is not ``creative``. Those are the di...,,`- This is not ``creative``. Those are the dic...,creativ dictionari definit term insur ensur pr...
1,` :: the term ``standard model`` is itself le...,,` :: the term ``standard model`` is itself les...,term standard model npov think prefer new-ag s...
2,"True or false, the situation as of March 200...",,"True or false, the situation as of March was s...",true fals situat march saudi propos land peac ...
3,"Next, maybe you could work on being less cond...",,"Next, maybe you could work on being less conde...",mayb work condescend suggest read name convent...
4,This page will need disambiguation.,,This page will need disambiguation.,page need disambigu


In [18]:
aggr.columns

Index(['OG_Text', 'Bully', 'Clean_Text', 'Preprocessed_Text'], dtype='object')

###### Kaggle

In [19]:
kaggle.head()

Unnamed: 0,index,oh_label,Date,Text
0,0,1,20120618192155Z,"""You fuck your dad."""
1,1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ..."
2,2,0,,"""A\\xc2\\xa0majority of Canadians can and has ..."
3,3,0,,"""listen if you dont wanna get married to a man..."
4,4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd..."


In [20]:
kaggle = kaggle.dropna()

In [21]:
label = {0:'None', 1:'Misconduct'}
kaggle['Bully'] = kaggle['oh_label'].apply(lambda x: label[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [22]:
kaggle = kaggle.drop(['index', 'Date', 'oh_label'], axis = 1)

In [23]:
kaggle.rename(columns = {'Text':'OG_Text'}, inplace = True)

In [24]:
kaggle['Clean_Text'] = kaggle['OG_Text'].apply(lambda x: cleaning(x))

In [25]:
kaggle['Preprocessed_Text'] = [' '.join(transformed_data(i)) for i in kaggle.Clean_Text]
kaggle.head()

Unnamed: 0,OG_Text,Bully,Clean_Text,Preprocessed_Text
0,"""You fuck your dad.""",Misconduct,"""You fuck your dad.""",fuck
1,"""i really don't understand your point.\xa0 It ...",,"""i really don't understand your point.\xa0 It ...",understand point.\xa0 mix appl orang
4,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd...",,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd...",c\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1eddn...
5,"""@SDL OK, but I would hope they'd sign him to ...",,""" OK, but I would hope they'd sign him to a on...",hope sign one-year contract start chanc reliab...
6,"""Yeah and where are you now?""",,"""Yeah and where are you now?""",yeah


In [26]:
kaggle.columns

Index(['OG_Text', 'Bully', 'Clean_Text', 'Preprocessed_Text'], dtype='object')

###### Toxicity

In [27]:
toxicity.head()

Unnamed: 0,index,Text,ed_label_0,ed_label_1,oh_label
0,0,This: :One can make an analogy in mathematical...,0.9,0.1,0
1,1,` :Clarification for you (and Zundark's righ...,1.0,0.0,0
2,2,Elected or Electoral? JHK,1.0,0.0,0
3,3,`This is such a fun entry. Devotchka I once...,1.0,0.0,0
4,4,Please relate the ozone hole to increases in c...,0.8,0.2,0


In [28]:
toxicity = toxicity.dropna()

In [29]:
label = {0:'None', 1:'Toxic'}
toxicity['Bully'] = toxicity['oh_label'].apply(lambda x: label[x])

In [30]:
toxicity = toxicity.drop(['index', 'ed_label_0', 'ed_label_1', 'oh_label'], axis = 1)

In [31]:
toxicity.rename(columns = {'Text':'OG_Text'}, inplace = True)

In [32]:
toxicity['Clean_Text'] = toxicity['OG_Text'].apply(lambda x: cleaning(x))

In [33]:
toxicity['Preprocessed_Text'] = [' '.join(transformed_data(i)) for i in toxicity.Clean_Text]
toxicity.head()

Unnamed: 0,OG_Text,Bully,Clean_Text,Preprocessed_Text
0,This: :One can make an analogy in mathematical...,,This: ne can make an analogy in mathematical t...,analog mathemat term envis distribut opinion p...
1,` :Clarification for you (and Zundark's righ...,,"` :Clarification for you (and Zundark's right,...",clarif zundark right check wikipedia bug page ...
2,Elected or Electoral? JHK,,Elected or Electoral? JHK,elect elector
3,`This is such a fun entry. Devotchka I once...,,`This is such a fun entry. Devotchka I once ha...,entri devotchka cowork korea tell differ usa-e...
4,Please relate the ozone hole to increases in c...,,Please relate the ozone hole to increases in c...,relat ozon hole increas cancer provid figur ar...


In [34]:
toxicity.columns

Index(['OG_Text', 'Bully', 'Clean_Text', 'Preprocessed_Text'], dtype='object')

###### Twitter

In [35]:
twitter.head()

Unnamed: 0,index,id,Text,Annotation,oh_label
0,5.74948705591165e+17,5.74948705591165e+17,@halalflaws @biebervalue @greenlinerzjm I read...,none,0.0
1,5.71917888690393e+17,5.71917888690393e+17,@ShreyaBafna3 Now you idiots claim that people...,none,0.0
2,3.90255841338601e+17,3.90255841338601e+17,"RT @Mooseoftorment Call me sexist, but when I ...",sexism,1.0
3,5.68208850655916e+17,5.68208850655916e+17,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0
4,5.75596338802373e+17,5.75596338802373e+17,#mkr No No No No No No,none,0.0


In [36]:
twitter = twitter.dropna()

In [37]:
label = {'none':'None', 'sexism':'Sexism', 'racism': 'Racism'}
twitter['Bully'] = twitter['Annotation'].apply(lambda x: label[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [38]:
twitter = twitter.drop(['index', 'id', 'Annotation', 'oh_label'], axis = 1)

In [39]:
twitter.rename(columns = {'Text':'OG_Text'}, inplace = True)

In [40]:
twitter['Clean_Text'] = twitter['OG_Text'].apply(lambda x: cleaning(x))

In [41]:
twitter['Preprocessed_Text'] = [' '.join(transformed_data(i)) for i in twitter.Clean_Text]
twitter.head()

Unnamed: 0,OG_Text,Bully,Clean_Text,Preprocessed_Text
0,@halalflaws @biebervalue @greenlinerzjm I read...,,I read them in context.No change in meaning. T...,read context.no chang mean histori islam slaveri
1,@ShreyaBafna3 Now you idiots claim that people...,,Now you idiots claim that people who tried to ...,idiot claim peopl tri stop terrorist terrorist...
2,"RT @Mooseoftorment Call me sexist, but when I ...",Sexism,"Call me sexist, but when I go to an auto place...",sexist auto place talk
3,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",Racism,"Wrong, ISIS follows the example of Mohammed an...",wrong isi follow exampl moham quran exact
4,#mkr No No No No No No,,No No No No No No,


In [42]:
twitter.columns

Index(['OG_Text', 'Bully', 'Clean_Text', 'Preprocessed_Text'], dtype='object')

###### Twitter Racism 

In [43]:
twitter_racism.head()

Unnamed: 0,index,id,Text,Annotation,oh_label
0,5.767493e+17,5.767493e+17,@AAlwuhaib1977 Muslim mob violence against Hin...,racism,1
1,5.408905e+17,5.408905e+17,@Te4m_NiGhtM4Re http://t.co/5Ih7MkDbQG,none,0
2,5.678433e+17,5.678433e+17,@jncatron @isra_jourisra @AMPalestine Islamoph...,racism,1
3,5.766462e+17,5.766462e+17,"Finally I'm all caught up, and that sudden dea...",none,0
4,5.713492e+17,5.713492e+17,@carolinesinders @herecomesfran *hugs*,none,0


In [44]:
twitter_racism = twitter_racism.dropna()

In [45]:
label = {'none':'None', 'racism':'Racism'}
twitter_racism['Bully'] = twitter_racism['Annotation'].apply(lambda x: label[x])

In [46]:
twitter_racism = twitter_racism.drop(['index', 'id', 'Annotation', 'oh_label'], axis = 1)

In [47]:
twitter_racism.rename(columns = {'Text':'OG_Text'}, inplace = True)

In [48]:
twitter_racism['Clean_Text'] = twitter_racism['OG_Text'].apply(lambda x: cleaning(x))

In [49]:
twitter_racism['Preprocessed_Text'] = [' '.join(transformed_data(i)) for i in twitter_racism.Clean_Text]
twitter_racism.head()

Unnamed: 0,OG_Text,Bully,Clean_Text,Preprocessed_Text
0,@AAlwuhaib1977 Muslim mob violence against Hin...,Racism,Muslim mob violence against Hindus in Banglade...,muslim violenc hindus bangladesh continu
1,@Te4m_NiGhtM4Re http://t.co/5Ih7MkDbQG,,,
2,@jncatron @isra_jourisra @AMPalestine Islamoph...,Racism,Islamophobia is like the idea of Naziphobia. I...,islamophobia like idea naziphobia islam religi...
3,"Finally I'm all caught up, and that sudden dea...",,"Finally I'm all caught up, and that sudden dea...",final catch sudden death cook look like intens
4,@carolinesinders @herecomesfran *hugs*,,*hugs*,hug


In [50]:
twitter_racism.columns

Index(['OG_Text', 'Bully', 'Clean_Text', 'Preprocessed_Text'], dtype='object')

###### Twitter Sexism

In [51]:
twitter_sexism.head()

Unnamed: 0,index,id,Text,Annotation,oh_label
0,5.35198627292254e+17,5.35198627292254e+17,RT @BeepsS: @senna1 @BeepsS: I'm not sexist bu...,sexism,1.0
1,5.75984924030714e+17,5.75984924030714e+17,There's some very hate able teams this year #MKR,none,0.0
2,5.7233536016588e+17,5.7233536016588e+17,"RT @The_Eccles: ""Everyone underestimated us"" \...",none,0.0
3,5.72337925708374e+17,5.72337925708374e+17,RT @NOTLukeDarcy: did @Channel7 or #MKR actual...,none,0.0
4,4.43033024528011e+17,4.43033024528011e+17,"No, you don't. @Shut_Up_Jeff: I thought of a r...",sexism,1.0


In [52]:
twitter_sexism = twitter_sexism.dropna()

In [53]:
label = {'none':'None', 'sexism':'Sexism'}
twitter_sexism['Bully'] = twitter_sexism['Annotation'].apply(lambda x: label[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [54]:
twitter_sexism = twitter_sexism.drop(['index', 'id', 'Annotation', 'oh_label'], axis = 1)

In [55]:
twitter_sexism.rename(columns = {'Text':'OG_Text'}, inplace = True)

In [56]:
twitter_sexism['Clean_Text'] = twitter_sexism['OG_Text'].apply(lambda x: cleaning(x))

In [57]:
twitter_sexism['Preprocessed_Text'] = [' '.join(transformed_data(i)) for i in twitter_sexism.Clean_Text]
twitter_sexism.head()

Unnamed: 0,OG_Text,Bully,Clean_Text,Preprocessed_Text
0,RT @BeepsS: @senna1 @BeepsS: I'm not sexist bu...,Sexism,: : I'm not sexist but fuck if you're a woman ...,sexist fuck woman cook shit
1,There's some very hate able teams this year #MKR,,There's some very hate able teams this year,hate abl team year
2,"RT @The_Eccles: ""Everyone underestimated us"" \...",,": ""Everyone underestimated us"" We still do, as...",underestim underestim judg
3,RT @NOTLukeDarcy: did @Channel7 or #MKR actual...,,: did or actually check if any of these people...,actual check peopl cook
4,"No, you don't. @Shut_Up_Jeff: I thought of a r...",Sexism,"No, you don't. : I thought of a really funny j...",think funni joke promis sexist


In [58]:
twitter_sexism.columns

Index(['OG_Text', 'Bully', 'Clean_Text', 'Preprocessed_Text'], dtype='object')

###### YouTube

In [59]:
youtube.head()

Unnamed: 0,index,UserIndex,Text,Number of Comments,Number of Subscribers,Membership Duration,Number of Uploads,Profanity in UserID,Age,oh_label
0,0,X1,Does N.e.bodyelse Hear her Crazy ass Screamin ...,10,1,3,3,0,15,0
1,1,X2,There are so many things that are incorrect wi...,3,0,6,5,0,31,0
2,2,X3,3:26 hahah my boyfriend showed this song to me...,7,0,3,5,0,43,1
3,3,X2218,dick beyonce fuck y a ass hole you are truely ...,34,0,3,5,0,44,1
4,4,X5,DongHaeTaemin and Kai ;A; luhansehun and bacon...,11,173,5,5,0,21,0


In [60]:
youtube = youtube.dropna()

In [61]:
label = {0:'None', 1:'Misconduct'}
youtube['Bully'] = youtube['oh_label'].apply(lambda x: label[x])

In [62]:
youtube = youtube.drop(['index', 'UserIndex', 'Number of Comments', 'Number of Subscribers', 'Membership Duration', 
                        'Number of Uploads', 'Profanity in UserID', 'Age', 'oh_label'], axis = 1)

In [63]:
youtube.rename(columns = {'Text':'OG_Text'}, inplace = True)

In [64]:
youtube['Clean_Text'] = youtube['OG_Text'].apply(lambda x: cleaning(x))

In [65]:
youtube['Preprocessed_Text'] = [' '.join(transformed_data(i)) for i in youtube.Clean_Text]
youtube.head()

Unnamed: 0,OG_Text,Bully,Clean_Text,Preprocessed_Text
0,Does N.e.bodyelse Hear her Crazy ass Screamin ...,,Does N.e.bodyelse Hear her Crazy ass Screamin ...,n.e.bodyels hear crazi screamin everytim stupi...
1,There are so many things that are incorrect wi...,,There are so many things that are incorrect wi...,thing incorrect comment unbeliev gun kill peop...
2,3:26 hahah my boyfriend showed this song to me...,Misconduct,:26 hahah my boyfriend showed this song to me ...,hahah boyfriend show song love tooo have seizu...
3,dick beyonce fuck y a ass hole you are truely ...,Misconduct,dick beyonce fuck y a ass hole you are truely ...,dick beyonc fuck hole trueli bitch look like d...
4,DongHaeTaemin and Kai ;A; luhansehun and bacon...,,DongHaeTaemin and Kai ;A; luhansehun and bacon...,donghaetaemin luhansehun bacon taemindongha do...


In [66]:
youtube.columns

Index(['OG_Text', 'Bully', 'Clean_Text', 'Preprocessed_Text'], dtype='object')

###### PAN12

In [67]:
pan12.head()

Unnamed: 0,Author ID,OG_Text,Clean_Text,Preprocessed_Text,Sentiment,Compound Score,Positive Score,Negative Score,Neutral Score
0,97964e7a9e8eb9cf78f2e4d7b2ff34c7,Hola.,Hola.,hola,Neutral,0.0,0.0,0.0,1.0
1,0158d0d6781fc4d493f243d4caa49747,hi.,hi.,,Neutral,0.0,0.0,0.0,1.0
2,0158d0d6781fc4d493f243d4caa49747,whats up?,whats up?,what,Neutral,0.0,0.0,0.0,1.0
3,97964e7a9e8eb9cf78f2e4d7b2ff34c7,not a ton.,not a ton.,,Neutral,0.0,0.0,0.0,1.0
4,97964e7a9e8eb9cf78f2e4d7b2ff34c7,you?,you?,,Neutral,0.0,0.0,0.0,1.0


In [68]:
pan12 = pan12.dropna()

In [69]:
label = {'Positive':np.nan, 'Neutral':np.nan, 'Negative':'Negative'}
pan12['Bully'] = pan12['Sentiment'].apply(lambda x: label[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [70]:
pan12 = pan12.drop(['Author ID', 'Sentiment',	'Compound Score', 'Positive Score', 'Negative Score', 'Neutral Score'], axis = 1)

In [71]:
pan12.columns

Index(['OG_Text', 'Clean_Text', 'Preprocessed_Text', 'Bully'], dtype='object')

In [72]:
pan12.Bully.value_counts()

Negative    20717
Name: Bully, dtype: int64

In [73]:
mendeley_datasets = {'Aggression': aggr, 
            'Kaggle': kaggle, 
            'Toxicity' : toxicity, 
            'Twitter' : twitter, 
            'Twitter_Racism' : twitter_racism, 
            'Twitter_Sexism' : twitter_sexism, 
            'Youtube' : youtube}

In [74]:
all_datasets = {'Aggression': aggr, 
            'Kaggle': kaggle, 
            'Toxicity' : toxicity, 
            'Twitter' : twitter, 
            'Twitter_Racism' : twitter_racism, 
            'Twitter_Sexism' : twitter_sexism, 
            'Youtube' : youtube, 
            'PAN12' : pan12}

# **Mendeley Data**

In [75]:
men_data = pd.DataFrame()
for i in mendeley_datasets:
  men_data = pd.concat([men_data, mendeley_datasets[i]], ignore_index=True)

In [76]:
men_data.head()

Unnamed: 0,OG_Text,Bully,Clean_Text,Preprocessed_Text
0,`- This is not ``creative``. Those are the di...,,`- This is not ``creative``. Those are the dic...,creativ dictionari definit term insur ensur pr...
1,` :: the term ``standard model`` is itself le...,,` :: the term ``standard model`` is itself les...,term standard model npov think prefer new-ag s...
2,"True or false, the situation as of March 200...",,"True or false, the situation as of March was s...",true fals situat march saudi propos land peac ...
3,"Next, maybe you could work on being less cond...",,"Next, maybe you could work on being less conde...",mayb work condescend suggest read name convent...
4,This page will need disambiguation.,,This page will need disambiguation.,page need disambigu


In [77]:
tot_len = 0
for i in mendeley_datasets:
  tot_len = tot_len + len(mendeley_datasets[i])

print(tot_len)

331768


In [78]:
len(men_data)

331768

In [79]:
men_data.Bully.value_counts()

None          289056
Toxic          15362
Aggression     13590
Sexism          6754
Racism          3940
Misconduct      3066
Name: Bully, dtype: int64

In [80]:
men_data.to_csv('/content/drive/MyDrive/Capstone Project/Data/MENDELEY_DATA.csv', index = False)

# **Active Learning Data**

In [81]:
men_data = pd.read_csv('/content/drive/MyDrive/Capstone Project/Data/MENDELEY_DATA.csv')

In [82]:
al_data = pd.DataFrame()
al_data = pd.concat([men_data, pan12], ignore_index=True)

In [83]:
al_data.head()

Unnamed: 0,OG_Text,Bully,Clean_Text,Preprocessed_Text
0,`- This is not ``creative``. Those are the di...,,`- This is not ``creative``. Those are the dic...,creativ dictionari definit term insur ensur pr...
1,` :: the term ``standard model`` is itself le...,,` :: the term ``standard model`` is itself les...,term standard model npov think prefer new-ag s...
2,"True or false, the situation as of March 200...",,"True or false, the situation as of March was s...",true fals situat march saudi propos land peac ...
3,"Next, maybe you could work on being less cond...",,"Next, maybe you could work on being less conde...",mayb work condescend suggest read name convent...
4,This page will need disambiguation.,,This page will need disambiguation.,page need disambigu


In [84]:
tot_len = 0
for i in all_datasets:
  tot_len = tot_len + len(all_datasets[i])

print(tot_len)

1007071


In [85]:
len(al_data)

1007071

In [86]:
al_data.Bully.value_counts()

None          289056
Negative       20717
Toxic          15362
Aggression     13590
Sexism          6754
Racism          3940
Misconduct      3066
Name: Bully, dtype: int64

In [87]:
al_data.isna().sum()

OG_Text                   0
Bully                654586
Clean_Text              737
Preprocessed_Text      2845
dtype: int64

In [None]:
al_data.to_csv('/content/drive/MyDrive/Capstone Project/Data/ACTLEARN_DATA.csv', index = False)

In [None]:
#al_data['Bully'].mask(al_data['Bully'] == 'Predator', 'Negative', inplace=True)