## Importing libraries and gathering basic information

In [246]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [189]:
df=pd.read_csv('cyberbullying_tweets.XLS',header=None)
df.head(3)

Unnamed: 0,0,1
0,tweet_text,cyberbullying_type
1,"In other words #katandandre, your food was cra...",not_cyberbullying
2,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying


In [190]:
df.columns = df.iloc[0]
df = df[1:].reset_index(drop=True)

In [191]:
df.shape

(47692, 2)

In [192]:
df['cyberbullying_type'].value_counts()

cyberbullying_type
religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: count, dtype: int64

In [193]:
df.isnull().sum()

0
tweet_text            0
cyberbullying_type    0
dtype: int64

In [194]:
df[df.duplicated()]

Unnamed: 0,tweet_text,cyberbullying_type
1758,Our pancakes are selling like hotcakes Shaz - ...,not_cyberbullying
3833,But you all respect him....Pete hasn't read tw...,not_cyberbullying
3939,This is the opportunity to prove ourselves lik...,not_cyberbullying
5684,Strategicscoring should be classed as cheating...,not_cyberbullying
6975,"If we're at the bottom of the leaderboard, we'...",not_cyberbullying
7227,It wouldn't be fair. Kat knows NOTHING of fair...,not_cyberbullying
7278,@TVWEEKmag: There is only 1 way to stay in the...,not_cyberbullying
7822,@Ima_TV_Junkie: What the hell were Annie and L...,not_cyberbullying
9672,@victorymonk: #sorryitsaboy joke means more bo...,gender
9835,@thisonesakillaa: In my opinion? All jokes are...,gender


In [195]:
df=df.drop_duplicates()

In [196]:
df.duplicated().sum()

0

### Gathering info about hashtags and mentions

In [198]:
import re

# Check if 'tweet_text' column exists
if 'tweet_text' in df.columns:
    # Count hashtags per tweet
    df['num_hashtags'] = df['tweet_text'].apply(lambda x: len(re.findall(r'#\w+', str(x))))

    # Count mentions per tweet
    df['num_mentions'] = df['tweet_text'].apply(lambda x: len(re.findall(r'@\w+', str(x))))

    # Calculate total counts
    total_hashtags = df['num_hashtags'].sum()
    total_mentions = df['num_mentions'].sum()

    print(f"Total number of hashtags in dataset: {total_hashtags}")
    print(f"Total number of mentions in dataset: {total_mentions}")

    # Display sample rows with counts
    print(df[['tweet_text', 'num_hashtags', 'num_mentions']].head(5))
else:
    print("Column 'tweet_text' not found in the dataset. Please verify the column name.")


Total number of hashtags in dataset: 11222
Total number of mentions in dataset: 26981
0                                         tweet_text  num_hashtags  \
0  In other words #katandandre, your food was cra...             2   
1  Why is #aussietv so white? #MKR #theblock #ImA...            10   
2  @XochitlSuckkks a classy whore? Or more red ve...             0   
3  @Jason_Gio meh. :P  thanks for the heads up, b...             0   
4  @RudhoeEnglish This is an ISIS account pretend...             0   

0  num_mentions  
0             0  
1             0  
2             1  
3             1  
4             1  


In [199]:
# finding all hashtags
if 'tweet_text' in df.columns:
  df['hashtags']=df['tweet_text'].apply(lambda x: len(re.findall(r'#\w+',str(x))))
  total_hashtags=(df['hashtags'].sum())
  print(total_hashtags)


11222


In [200]:
df.head(10)

Unnamed: 0,tweet_text,cyberbullying_type,num_hashtags,num_mentions,hashtags
0,"In other words #katandandre, your food was cra...",not_cyberbullying,2,0,2
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,10,0,10
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,0,1,0
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,0,1,0
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,0,1,0
5,"@Raja5aab @Quickieleaks Yes, the test of god i...",not_cyberbullying,0,2,0
6,Itu sekolah ya bukan tempat bully! Ga jauh kay...,not_cyberbullying,0,0,0
7,Karma. I hope it bites Kat on the butt. She is...,not_cyberbullying,1,0,1
8,@stockputout everything but mostly my priest,not_cyberbullying,0,1,0
9,Rebecca Black Drops Out of School Due to Bully...,not_cyberbullying,0,0,0


In [201]:
# most used hashtags
from collections import Counter

# Check if 'tweet_text' column exists
if 'tweet_text' in df.columns:
    # Extract all hashtags from the dataset
    all_hashtags = df['tweet_text'].apply(lambda x: re.findall(r'#\w+', str(x))).sum()

    # Count frequency of each hashtag
    hashtag_counts = Counter(all_hashtags)

    # Convert to a DataFrame for better readability
    hashtag_df = pd.DataFrame(hashtag_counts.items(), columns=['Hashtag', 'Count'])
    hashtag_df = hashtag_df.sort_values(by='Count', ascending=False).reset_index(drop=True)

    print("Top 10 most frequently used hashtags:")
    print(hashtag_df.head(10))
else:
    print("Column 'tweet_text' not found in the dataset. Please verify the column name.")


Top 10 most frequently used hashtags:
           Hashtag  Count
0             #MKR   1312
1             #mkr   1186
2           #Islam    145
3  #BlameOneNotAll    117
4       #notsexist    104
5            #ISIS     94
6         #MKR2015     85
7            #coon     75
8      #MileyCyrus     68
9         #mkr2015     65


In [202]:
df.head()

Unnamed: 0,tweet_text,cyberbullying_type,num_hashtags,num_mentions,hashtags
0,"In other words #katandandre, your food was cra...",not_cyberbullying,2,0,2
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,10,0,10
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,0,1,0
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,0,1,0
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,0,1,0


In [203]:
df.columns

Index(['tweet_text', 'cyberbullying_type', 'num_hashtags', 'num_mentions',
       'hashtags'],
      dtype='object', name=0)

In [204]:
# converting text to lowercase
df=df.applymap(lambda x:x.lower() if isinstance(x,str)else x)

  df=df.applymap(lambda x:x.lower() if isinstance(x,str)else x)


In [205]:


# Step 1: Extract hashtags from the 'tweet_text' column using a regular expression
def extract_hashtags(text):
    return re.findall(r'#\w+', text)

df['hashtags'] = df['tweet_text'].apply(extract_hashtags)

# Step 2: Explode the 'hashtags' column to have each hashtag in its own row
df_exploded = df.explode('hashtags')

# Step 3: Remove any rows where hashtags are missing (if any)
df_exploded = df_exploded[df_exploded['hashtags'].notna()]

# Step 4: Group by 'cyberbullying_type' and 'hashtags' to count occurrences
hashtag_counts = df_exploded.groupby(['cyberbullying_type', 'hashtags']).size().reset_index(name='count')

# Step 5: Find the most frequent hashtag for each 'cyberbullying_type'
most_used_hashtags = hashtag_counts.loc[hashtag_counts.groupby('cyberbullying_type')['count'].idxmax()]

# Step 6: Display the results
print(most_used_hashtags[['cyberbullying_type', 'hashtags', 'count']])


       cyberbullying_type hashtags  count
426                   age  #sffpit     18
1112            ethnicity  #racism     36
1976               gender     #mkr    628
2991    not_cyberbullying     #mkr   1594
3865  other_cyberbullying     #mkr    285
4607             religion   #islam    131


In [206]:
import pandas as pd
import re



# Step 1: Extract mentions (usernames starting with '@') from the 'tweet_text' column using a regular expression
def extract_mentions(text):
    return re.findall(r'@\w+', text)

df['mentions'] = df['tweet_text'].apply(extract_mentions)

# Step 2: Explode the 'mentions' column to have each mention in its own row
df_exploded_mentions = df.explode('mentions')

# Step 3: Remove any rows where mentions are missing (if any)
df_exploded_mentions = df_exploded_mentions[df_exploded_mentions['mentions'].notna()]

# Step 4: Group by 'cyberbullying_type' and 'mentions' to count occurrences
mention_counts = df_exploded_mentions.groupby(['cyberbullying_type', 'mentions']).size().reset_index(name='count')

# Step 5: Find the most frequent mention for each 'cyberbullying_type'
most_used_mentions = mention_counts.loc[mention_counts.groupby('cyberbullying_type')['count'].idxmax()]

# Step 6: Display the results
print(most_used_mentions[['cyberbullying_type', 'mentions', 'count']])


        cyberbullying_type          mentions  count
301                    age  @realdonaldtrump     15
3231             ethnicity        @tayyoung_    959
5646                gender            @mt8_9     89
8016     not_cyberbullying      @freebsdgirl     75
11597  other_cyberbullying      @freebsdgirl    151
14741             religion    @maxblumenthal    119


In [207]:
df.head(10)

Unnamed: 0,tweet_text,cyberbullying_type,num_hashtags,num_mentions,hashtags,mentions
0,"in other words #katandandre, your food was cra...",not_cyberbullying,2,0,"[#katandandre, #mkr]",[]
1,why is #aussietv so white? #mkr #theblock #ima...,not_cyberbullying,10,0,"[#aussietv, #mkr, #theblock, #imacelebrityau, ...",[]
2,@xochitlsuckkks a classy whore? or more red ve...,not_cyberbullying,0,1,[],[@xochitlsuckkks]
3,"@jason_gio meh. :p thanks for the heads up, b...",not_cyberbullying,0,1,[],[@jason_gio]
4,@rudhoeenglish this is an isis account pretend...,not_cyberbullying,0,1,[],[@rudhoeenglish]
5,"@raja5aab @quickieleaks yes, the test of god i...",not_cyberbullying,0,2,[],"[@raja5aab, @quickieleaks]"
6,itu sekolah ya bukan tempat bully! ga jauh kay...,not_cyberbullying,0,0,[],[]
7,karma. i hope it bites kat on the butt. she is...,not_cyberbullying,1,0,[#mkr],[]
8,@stockputout everything but mostly my priest,not_cyberbullying,0,1,[],[@stockputout]
9,rebecca black drops out of school due to bully...,not_cyberbullying,0,0,[],[]


## Data Preprocessing

In [209]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\anush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [210]:
# loading stopwords
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
df['sentences'] = df['tweet_text'].apply(lambda text: sent_tokenize(text) if isinstance(text, str) else [])
df

Unnamed: 0,tweet_text,cyberbullying_type,num_hashtags,num_mentions,hashtags,mentions,sentences
0,"in other words #katandandre, your food was cra...",not_cyberbullying,2,0,"[#katandandre, #mkr]",[],"[in other words #katandandre, your food was cr..."
1,why is #aussietv so white? #mkr #theblock #ima...,not_cyberbullying,10,0,"[#aussietv, #mkr, #theblock, #imacelebrityau, ...",[],"[why is #aussietv so white?, #mkr #theblock #i..."
2,@xochitlsuckkks a classy whore? or more red ve...,not_cyberbullying,0,1,[],[@xochitlsuckkks],"[@xochitlsuckkks a classy whore?, or more red ..."
3,"@jason_gio meh. :p thanks for the heads up, b...",not_cyberbullying,0,1,[],[@jason_gio],"[@jason_gio meh., :p thanks for the heads up,..."
4,@rudhoeenglish this is an isis account pretend...,not_cyberbullying,0,1,[],[@rudhoeenglish],[@rudhoeenglish this is an isis account preten...
...,...,...,...,...,...,...,...
47687,"black ppl aren't expected to do anything, depe...",ethnicity,0,0,[],[],"[black ppl aren't expected to do anything, dep..."
47688,turner did not withhold his disappointment. tu...,ethnicity,0,0,[],[],"[turner did not withhold his disappointment., ..."
47689,i swear to god. this dumb nigger bitch. i have...,ethnicity,0,0,[],[],"[i swear to god., this dumb nigger bitch., i h..."
47690,yea fuck you rt @therealexel: if youre a nigge...,ethnicity,0,1,[],[@therealexel],[yea fuck you rt @therealexel: if youre a nigg...


In [211]:
# replacing numbers with blank
def num_to_text(tweet):
  for i in df:
    if i.isdigit():
      tweet=tweet.replace(i,'')
  return tweet

num_to_text(df)

Unnamed: 0,tweet_text,cyberbullying_type,num_hashtags,num_mentions,hashtags,mentions,sentences
0,"in other words #katandandre, your food was cra...",not_cyberbullying,2,0,"[#katandandre, #mkr]",[],"[in other words #katandandre, your food was cr..."
1,why is #aussietv so white? #mkr #theblock #ima...,not_cyberbullying,10,0,"[#aussietv, #mkr, #theblock, #imacelebrityau, ...",[],"[why is #aussietv so white?, #mkr #theblock #i..."
2,@xochitlsuckkks a classy whore? or more red ve...,not_cyberbullying,0,1,[],[@xochitlsuckkks],"[@xochitlsuckkks a classy whore?, or more red ..."
3,"@jason_gio meh. :p thanks for the heads up, b...",not_cyberbullying,0,1,[],[@jason_gio],"[@jason_gio meh., :p thanks for the heads up,..."
4,@rudhoeenglish this is an isis account pretend...,not_cyberbullying,0,1,[],[@rudhoeenglish],[@rudhoeenglish this is an isis account preten...
...,...,...,...,...,...,...,...
47687,"black ppl aren't expected to do anything, depe...",ethnicity,0,0,[],[],"[black ppl aren't expected to do anything, dep..."
47688,turner did not withhold his disappointment. tu...,ethnicity,0,0,[],[],"[turner did not withhold his disappointment., ..."
47689,i swear to god. this dumb nigger bitch. i have...,ethnicity,0,0,[],[],"[i swear to god., this dumb nigger bitch., i h..."
47690,yea fuck you rt @therealexel: if youre a nigge...,ethnicity,0,1,[],[@therealexel],[yea fuck you rt @therealexel: if youre a nigg...


In [212]:
# tokenize
df['words']=df['tweet_text'].apply(lambda x:x.split())
df.head(3)

Unnamed: 0,tweet_text,cyberbullying_type,num_hashtags,num_mentions,hashtags,mentions,sentences,words
0,"in other words #katandandre, your food was cra...",not_cyberbullying,2,0,"[#katandandre, #mkr]",[],"[in other words #katandandre, your food was cr...","[in, other, words, #katandandre,, your, food, ..."
1,why is #aussietv so white? #mkr #theblock #ima...,not_cyberbullying,10,0,"[#aussietv, #mkr, #theblock, #imacelebrityau, ...",[],"[why is #aussietv so white?, #mkr #theblock #i...","[why, is, #aussietv, so, white?, #mkr, #theblo..."
2,@xochitlsuckkks a classy whore? or more red ve...,not_cyberbullying,0,1,[],[@xochitlsuckkks],"[@xochitlsuckkks a classy whore?, or more red ...","[@xochitlsuckkks, a, classy, whore?, or, more,..."


In [213]:
# removing special chars
import re
df['words'] = df['tweet_text'].apply(lambda x: re.findall(r'\b\w+\b', x.lower()))
df.head(3)

Unnamed: 0,tweet_text,cyberbullying_type,num_hashtags,num_mentions,hashtags,mentions,sentences,words
0,"in other words #katandandre, your food was cra...",not_cyberbullying,2,0,"[#katandandre, #mkr]",[],"[in other words #katandandre, your food was cr...","[in, other, words, katandandre, your, food, wa..."
1,why is #aussietv so white? #mkr #theblock #ima...,not_cyberbullying,10,0,"[#aussietv, #mkr, #theblock, #imacelebrityau, ...",[],"[why is #aussietv so white?, #mkr #theblock #i...","[why, is, aussietv, so, white, mkr, theblock, ..."
2,@xochitlsuckkks a classy whore? or more red ve...,not_cyberbullying,0,1,[],[@xochitlsuckkks],"[@xochitlsuckkks a classy whore?, or more red ...","[xochitlsuckkks, a, classy, whore, or, more, r..."


### dealing with stopwords

In [215]:

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [216]:

from nltk.tokenize import word_tokenize



In [217]:
# Define stopwords
stopwords_set = set(stopwords.words('english'))

# Remove stopwords from the 'words' column
df['words_without_stopwords'] = df['words'].apply(lambda tokens: [word for word in tokens if word not in stopwords_set])


In [218]:
df.head(3)

Unnamed: 0,tweet_text,cyberbullying_type,num_hashtags,num_mentions,hashtags,mentions,sentences,words,words_without_stopwords
0,"in other words #katandandre, your food was cra...",not_cyberbullying,2,0,"[#katandandre, #mkr]",[],"[in other words #katandandre, your food was cr...","[in, other, words, katandandre, your, food, wa...","[words, katandandre, food, crapilicious, mkr]"
1,why is #aussietv so white? #mkr #theblock #ima...,not_cyberbullying,10,0,"[#aussietv, #mkr, #theblock, #imacelebrityau, ...",[],"[why is #aussietv so white?, #mkr #theblock #i...","[why, is, aussietv, so, white, mkr, theblock, ...","[aussietv, white, mkr, theblock, imacelebritya..."
2,@xochitlsuckkks a classy whore? or more red ve...,not_cyberbullying,0,1,[],[@xochitlsuckkks],"[@xochitlsuckkks a classy whore?, or more red ...","[xochitlsuckkks, a, classy, whore, or, more, r...","[xochitlsuckkks, classy, whore, red, velvet, c..."


### lemmetization

In [220]:

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anush\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\anush\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [221]:
lemmatizer=WordNetLemmatizer()
df['words_new']=df['words_without_stopwords'].apply(lambda x:[lemmatizer.lemmatize(word)for word in x])

In [222]:
 nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\anush\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

### POS Tagging

In [224]:
from nltk import pos_tag

# Function to tokenize and tag POS
def pos_tagging(text):
    tokens = word_tokenize(text)  # Tokenize text
    tagged_tokens = pos_tag(tokens)  # Tag POS
    return tagged_tokens

# Apply POS tagging to the 'tweet_text' column
df['tweet_text_POS'] = df['tweet_text'].apply(pos_tagging)

In [225]:
df.head(3)

Unnamed: 0,tweet_text,cyberbullying_type,num_hashtags,num_mentions,hashtags,mentions,sentences,words,words_without_stopwords,words_new,tweet_text_POS
0,"in other words #katandandre, your food was cra...",not_cyberbullying,2,0,"[#katandandre, #mkr]",[],"[in other words #katandandre, your food was cr...","[in, other, words, katandandre, your, food, wa...","[words, katandandre, food, crapilicious, mkr]","[word, katandandre, food, crapilicious, mkr]","[(in, IN), (other, JJ), (words, NNS), (#, #), ..."
1,why is #aussietv so white? #mkr #theblock #ima...,not_cyberbullying,10,0,"[#aussietv, #mkr, #theblock, #imacelebrityau, ...",[],"[why is #aussietv so white?, #mkr #theblock #i...","[why, is, aussietv, so, white, mkr, theblock, ...","[aussietv, white, mkr, theblock, imacelebritya...","[aussietv, white, mkr, theblock, imacelebritya...","[(why, WRB), (is, VBZ), (#, #), (aussietv, RB)..."
2,@xochitlsuckkks a classy whore? or more red ve...,not_cyberbullying,0,1,[],[@xochitlsuckkks],"[@xochitlsuckkks a classy whore?, or more red ...","[xochitlsuckkks, a, classy, whore, or, more, r...","[xochitlsuckkks, classy, whore, red, velvet, c...","[xochitlsuckkks, classy, whore, red, velvet, c...","[(@, NN), (xochitlsuckkks, VBZ), (a, DT), (cla..."


In [226]:
df['tweet_length'] = df['tweet_text'].apply(len)
df.tail(3)

Unnamed: 0,tweet_text,cyberbullying_type,num_hashtags,num_mentions,hashtags,mentions,sentences,words,words_without_stopwords,words_new,tweet_text_POS,tweet_length
47689,i swear to god. this dumb nigger bitch. i have...,ethnicity,0,0,[],[],"[i swear to god., this dumb nigger bitch., i h...","[i, swear, to, god, this, dumb, nigger, bitch,...","[swear, god, dumb, nigger, bitch, got, bleach,...","[swear, god, dumb, nigger, bitch, got, bleach,...","[(i, NN), (swear, VBP), (to, TO), (god, VB), (...",104
47690,yea fuck you rt @therealexel: if youre a nigge...,ethnicity,0,1,[],[@therealexel],[yea fuck you rt @therealexel: if youre a nigg...,"[yea, fuck, you, rt, therealexel, if, youre, a...","[yea, fuck, rt, therealexel, youre, nigger, fu...","[yea, fuck, rt, therealexel, youre, nigger, fu...","[(yea, RB), (fuck, NN), (you, PRP), (rt, VBP),...",90
47691,bro. u gotta chill rt @chillshrammy: dog fuck ...,ethnicity,0,1,[],[@chillshrammy],"[bro., u gotta chill rt @chillshrammy: dog fuc...","[bro, u, gotta, chill, rt, chillshrammy, dog, ...","[bro, u, gotta, chill, rt, chillshrammy, dog, ...","[bro, u, gotta, chill, rt, chillshrammy, dog, ...","[(bro, NN), (., .), (u, JJ), (got, VBD), (ta, ...",76


# WORD2VEC

In [228]:
#WORD2VEC

!pip install gensim



In [229]:
from gensim.models import Word2Vec,KeyedVectors

In [230]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

vec_king = wv['king']

In [231]:


# Train Skip-gram Model
skipgram_model = Word2Vec(
    sentences=df['words_without_stopwords'],  # Use preprocessed words
    vector_size=100,  # Vector dimension
    window=5,
    sg=1,  # Skip-gram
    min_count=1,
    workers=4,
    epochs=10
)

# Function to get average word vectors for each tweet
def get_avg_word_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

# Apply average word vector transformation
df['tweet_vector'] = df['words_without_stopwords'].apply(lambda x: get_avg_word_vector(x, skipgram_model))

# Convert list of vectors into a DataFrame
tweet_vector_df = pd.DataFrame(df['tweet_vector'].tolist())


In [232]:
# Select numeric features
numeric_features = df[['num_hashtags', 'num_mentions', 'tweet_length']]

# Combine numeric and text vector features


X_combined = pd.concat([numeric_features.reset_index(drop=True), tweet_vector_df.reset_index(drop=True)], axis=1)
y = df['cyberbullying_type']  # Target variable


In [233]:
df.head(3)

Unnamed: 0,tweet_text,cyberbullying_type,num_hashtags,num_mentions,hashtags,mentions,sentences,words,words_without_stopwords,words_new,tweet_text_POS,tweet_length,tweet_vector
0,"in other words #katandandre, your food was cra...",not_cyberbullying,2,0,"[#katandandre, #mkr]",[],"[in other words #katandandre, your food was cr...","[in, other, words, katandandre, your, food, wa...","[words, katandandre, food, crapilicious, mkr]","[word, katandandre, food, crapilicious, mkr]","[(in, IN), (other, JJ), (words, NNS), (#, #), ...",61,"[-0.19821869, -0.16141829, 0.45322648, -0.4486..."
1,why is #aussietv so white? #mkr #theblock #ima...,not_cyberbullying,10,0,"[#aussietv, #mkr, #theblock, #imacelebrityau, ...",[],"[why is #aussietv so white?, #mkr #theblock #i...","[why, is, aussietv, so, white, mkr, theblock, ...","[aussietv, white, mkr, theblock, imacelebritya...","[aussietv, white, mkr, theblock, imacelebritya...","[(why, WRB), (is, VBZ), (#, #), (aussietv, RB)...",115,"[-0.14349976, -0.15604629, 0.2932265, -0.14223..."
2,@xochitlsuckkks a classy whore? or more red ve...,not_cyberbullying,0,1,[],[@xochitlsuckkks],"[@xochitlsuckkks a classy whore?, or more red ...","[xochitlsuckkks, a, classy, whore, or, more, r...","[xochitlsuckkks, classy, whore, red, velvet, c...","[xochitlsuckkks, classy, whore, red, velvet, c...","[(@, NN), (xochitlsuckkks, VBZ), (a, DT), (cla...",60,"[-0.11901531, -0.12100335, 0.3366897, 0.076935..."


## Model Training

In [235]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Extract features and target
X = np.array(df['tweet_vector'].tolist())  # Convert list of vectors to a NumPy array
y = df['cyberbullying_type']

# Encode target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [236]:
# Reshape input data to 3D: (samples, timesteps, features)
X_train_rnn = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_rnn = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))


In [237]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout, Input

# Build RNN Model
model = Sequential([
    Input(shape=(X_train_rnn.shape[1], 1)),  # Updated input shape
    SimpleRNN(64, activation='tanh', return_sequences=False),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model Summary
model.summary()


In [238]:
# Train the model
history = model.fit(
    X_train_rnn, y_train,
    validation_data=(X_test_rnn, y_test),
    epochs=10,
    batch_size=32,
    verbose=1
)


Epoch 1/10
[1m1192/1192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 14ms/step - accuracy: 0.4901 - loss: 1.2994 - val_accuracy: 0.7051 - val_loss: 0.7761
Epoch 2/10
[1m1192/1192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 13ms/step - accuracy: 0.6821 - loss: 0.8553 - val_accuracy: 0.7462 - val_loss: 0.6696
Epoch 3/10
[1m1192/1192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 13ms/step - accuracy: 0.7106 - loss: 0.7770 - val_accuracy: 0.7518 - val_loss: 0.6131
Epoch 4/10
[1m1192/1192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 13ms/step - accuracy: 0.7209 - loss: 0.7535 - val_accuracy: 0.7483 - val_loss: 0.6647
Epoch 5/10
[1m1192/1192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 13ms/step - accuracy: 0.7271 - loss: 0.7372 - val_accuracy: 0.7655 - val_loss: 0.6084
Epoch 6/10
[1m1192/1192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 13ms/step - accuracy: 0.7306 - loss: 0.7252 - val_accuracy: 0.7515 - val_loss: 0.6317
Epoc

In [239]:
print("Training Data Shape:", X_train_rnn.shape)  # Should be (samples, 128, 1)
print("Testing Data Shape:", X_test_rnn.shape)    # Should be (samples, 128, 1)
print("Labels Shape:", y_train.shape, y_test.shape)  # Should match number of samples

Training Data Shape: (38124, 100, 1)
Testing Data Shape: (9532, 100, 1)
Labels Shape: (38124,) (9532,)


In [240]:
# Predictions
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

# Classification Report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_))


[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step
                     precision    recall  f1-score   support

                age       0.88      0.97      0.93      1602
          ethnicity       0.94      0.94      0.94      1636
             gender       0.85      0.75      0.80      1514
  not_cyberbullying       0.51      0.52      0.52      1624
other_cyberbullying       0.57      0.49      0.53      1594
           religion       0.83      0.95      0.89      1562

           accuracy                           0.77      9532
          macro avg       0.77      0.77      0.77      9532
       weighted avg       0.77      0.77      0.77      9532



In [241]:
df.head(2)

Unnamed: 0,tweet_text,cyberbullying_type,num_hashtags,num_mentions,hashtags,mentions,sentences,words,words_without_stopwords,words_new,tweet_text_POS,tweet_length,tweet_vector
0,"in other words #katandandre, your food was cra...",not_cyberbullying,2,0,"[#katandandre, #mkr]",[],"[in other words #katandandre, your food was cr...","[in, other, words, katandandre, your, food, wa...","[words, katandandre, food, crapilicious, mkr]","[word, katandandre, food, crapilicious, mkr]","[(in, IN), (other, JJ), (words, NNS), (#, #), ...",61,"[-0.19821869, -0.16141829, 0.45322648, -0.4486..."
1,why is #aussietv so white? #mkr #theblock #ima...,not_cyberbullying,10,0,"[#aussietv, #mkr, #theblock, #imacelebrityau, ...",[],"[why is #aussietv so white?, #mkr #theblock #i...","[why, is, aussietv, so, white, mkr, theblock, ...","[aussietv, white, mkr, theblock, imacelebritya...","[aussietv, white, mkr, theblock, imacelebritya...","[(why, WRB), (is, VBZ), (#, #), (aussietv, RB)...",115,"[-0.14349976, -0.15604629, 0.2932265, -0.14223..."


In [294]:
y_pred = model.predict(X_test)


[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step


In [307]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)  # Convert probabilities to class labels
print(classification_report(y_test, y_pred_classes))
print(confusion_matrix(y_test, y_pred_classes))


[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
              precision    recall  f1-score   support

           0       0.88      0.97      0.93      1602
           1       0.94      0.94      0.94      1636
           2       0.85      0.75      0.80      1514
           3       0.51      0.52      0.52      1624
           4       0.57      0.49      0.53      1594
           5       0.83      0.95      0.89      1562

    accuracy                           0.77      9532
   macro avg       0.77      0.77      0.77      9532
weighted avg       0.77      0.77      0.77      9532

[[1556    8    2   13   17    6]
 [   3 1543   14    4   43   29]
 [   7   20 1128  237   92   30]
 [ 110   24   67  845  423  155]
 [  80   46   93  502  787   86]
 [   3    2   16   45    8 1488]]
