## Importing libraries and gathering basic information

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# Load the raw CSV. header=None makes pandas treat the file's first row as data
# rather than as column names, so the columns carry integer labels at this point.
df=pd.read_csv('cyberbullying_tweets.csv',header=None)
# Preview the first three rows.
df.head(3)

FileNotFoundError: [Errno 2] No such file or directory: 'cyberbullying_tweets.csv'

In [None]:
# Use the values of the first row as the column labels.
# NOTE: this cell is not idempotent - every extra run promotes (and then drops)
# whatever row is currently first.
df.columns = df.iloc[0]
# Drop that first row from the data and renumber the index from 0.
df = df[1:].reset_index(drop=True)

In [None]:
# (number of rows, number of columns) currently in df.
df.shape

In [None]:
# Number of rows carrying each distinct 'cyberbullying_type' value.
df['cyberbullying_type'].value_counts()

In [None]:
# Count of missing values in each column.
df.isnull().sum()

In [None]:
# Rows that repeat an earlier row in every column (the first occurrence is not flagged).
df[df.duplicated()]

In [None]:
# Keep only the first occurrence of each fully duplicated row.
# The index is not reset here, so it keeps gaps where rows were removed.
df=df.drop_duplicates()

In [None]:
# Number of duplicated rows still present in df.
df.duplicated().sum()

### Gathering info about hashtags and mentions

In [None]:
import re

# Per-tweet and dataset-wide counts of hashtags and @-mentions.
if 'tweet_text' not in df.columns:
    print("Column 'tweet_text' not found in the dataset. Please verify the column name.")
else:
    # Coerce every cell to str first so non-string values are scanned like any other text
    tweets_as_text = df['tweet_text'].map(str)

    # Number of '#word' and '@word' matches in each tweet
    df['num_hashtags'] = tweets_as_text.str.count(r'#\w+')
    df['num_mentions'] = tweets_as_text.str.count(r'@\w+')

    # Dataset-wide totals
    total_hashtags = df['num_hashtags'].sum()
    total_mentions = df['num_mentions'].sum()
    print(f"Total number of hashtags in dataset: {total_hashtags}")
    print(f"Total number of mentions in dataset: {total_mentions}")

    # First five tweets next to their counts
    print(df[['tweet_text', 'num_hashtags', 'num_mentions']].head(5))


In [None]:
# Store the per-tweet hashtag count in 'hashtags' and print the dataset total
if 'tweet_text' in df.columns:
  hashtags_per_tweet = df['tweet_text'].map(str).str.count(r'#\w+')
  df['hashtags'] = hashtags_per_tweet
  total_hashtags = hashtags_per_tweet.sum()
  print(total_hashtags)


In [None]:
# Preview the first ten rows, including the count columns added above.
df.head(10)

In [None]:
# most used hashtags
from collections import Counter

# Rank hashtags by how often they occur across all tweets.
if 'tweet_text' in df.columns:
    # Flatten the per-tweet matches in a single pass. Summing a Series of lists
    # concatenates them pairwise (quadratic in the number of hashtags) and yields
    # the integer 0 for an empty frame, which Counter() cannot consume.
    all_hashtags = [
        tag
        for text in df['tweet_text']
        for tag in re.findall(r'#\w+', str(text))
    ]

    # Frequency of each distinct hashtag
    hashtag_counts = Counter(all_hashtags)

    # Tabulate the counts, most frequent first
    hashtag_df = pd.DataFrame(list(hashtag_counts.items()), columns=['Hashtag', 'Count'])
    hashtag_df = hashtag_df.sort_values(by='Count', ascending=False).reset_index(drop=True)

    print("Top 10 most frequently used hashtags:")
    print(hashtag_df.head(10))
else:
    print("Column 'tweet_text' not found in the dataset. Please verify the column name.")


In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Column labels currently present in df.
df.columns

In [None]:
# Lowercase every string cell; non-string values pass through untouched.
# Done column by column with Series.map because DataFrame.applymap is deprecated
# (pandas 2.1 renamed it to DataFrame.map), while Series.map exists in every version.
df = df.apply(lambda column: column.map(lambda value: value.lower() if isinstance(value, str) else value))

In [None]:


# Step 1: Extract hashtags from the 'tweet_text' column using a regular expression
def extract_hashtags(text):
    """Return every ``#word`` token found in ``text``, in order of appearance.

    Non-string input (for example a missing value, which pandas represents as a
    float NaN) yields an empty list instead of raising ``TypeError`` from ``re``.
    """
    if not isinstance(text, str):
        return []
    return re.findall(r'#\w+', text)

# Per-tweet list of hashtags
df['hashtags'] = df['tweet_text'].map(extract_hashtags)

# One row per (tweet, hashtag); tweets with no hashtag explode to NaN and are dropped
df_exploded = df.explode('hashtags').dropna(subset=['hashtags'])

# Occurrences of each hashtag within each cyberbullying type
hashtag_counts = (
    df_exploded
    .groupby(['cyberbullying_type', 'hashtags'])
    .size()
    .reset_index(name='count')
)

# Row label of the highest count inside every cyberbullying type
top_hashtag_rows = hashtag_counts.groupby('cyberbullying_type')['count'].idxmax()
most_used_hashtags = hashtag_counts.loc[top_hashtag_rows]

# Show the most frequent hashtag per type
print(most_used_hashtags[['cyberbullying_type', 'hashtags', 'count']])


In [None]:
import pandas as pd
import re



# Step 1: Extract mentions (usernames starting with '@') from the 'tweet_text' column using a regular expression
def extract_mentions(text):
    """Return every ``@word`` token found in ``text``, in order of appearance.

    Non-string input (for example a missing value, which pandas represents as a
    float NaN) yields an empty list instead of raising ``TypeError`` from ``re``.
    """
    if not isinstance(text, str):
        return []
    return re.findall(r'@\w+', text)

# Per-tweet list of @-mentions
df['mentions'] = df['tweet_text'].map(extract_mentions)

# One row per (tweet, mention); tweets with no mention explode to NaN and are dropped
df_exploded_mentions = df.explode('mentions').dropna(subset=['mentions'])

# Occurrences of each mention within each cyberbullying type
mention_counts = (
    df_exploded_mentions
    .groupby(['cyberbullying_type', 'mentions'])
    .size()
    .reset_index(name='count')
)

# Row label of the highest count inside every cyberbullying type
top_mention_rows = mention_counts.groupby('cyberbullying_type')['count'].idxmax()
most_used_mentions = mention_counts.loc[top_mention_rows]

# Show the most frequent mention per type
print(most_used_mentions[['cyberbullying_type', 'mentions', 'count']])


In [None]:
# Preview the first ten rows, now including the 'hashtags' and 'mentions' list columns.
df.head(10)

## Data Preprocessing

In [None]:
import nltk
# Fetch NLTK's 'punkt_tab' resource.
# Presumably required by the sent_tokenize / word_tokenize calls in later cells - confirm for the installed NLTK version.
nltk.download('punkt_tab')

In [None]:
# Import the stopword corpus (used in a later cell) and split each tweet into sentences.
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
# Non-string cells get an empty sentence list instead of being passed to the tokenizer.
df['sentences'] = df['tweet_text'].apply(lambda text: sent_tokenize(text) if isinstance(text, str) else [])
# Display the frame with the new 'sentences' column.
df

In [None]:
# replacing numbers with blank
def num_to_text(tweet):
    """Return ``tweet`` with every digit character removed.

    The previous body looped over the global ``df`` (i.e. its column labels)
    instead of over the characters of ``tweet``, so no digit was ever removed.

    Args:
        tweet: Text to clean. Anything that is not a ``str`` is returned
            unchanged, so passing a non-text object does not raise.

    Returns:
        The text without characters for which ``str.isdigit`` is true, or the
        original object when ``tweet`` is not a string.
    """
    if not isinstance(tweet, str):
        return tweet
    return ''.join(char for char in tweet if not char.isdigit())

# NOTE(review): this passes the whole DataFrame rather than a single tweet, and the
# return value is only displayed, never assigned - no tweet text is changed by this cell.
num_to_text(df)

In [None]:
# Whitespace tokenisation: split every tweet on runs of whitespace
df['words'] = df['tweet_text'].map(str.split)
df.head(3)

In [None]:
# Word tokens only: lowercase each tweet, then keep the runs of word characters,
# which leaves punctuation and other symbols out of 'words'
import re
tweets_lowercase = df['tweet_text'].map(str.lower)
df['words'] = tweets_lowercase.map(lambda tweet: re.findall(r'\b\w+\b', tweet))
df.head(3)

### Dealing with stopwords

In [None]:

import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus.
nltk.download('stopwords')
# Print the full English stopword list for inspection.
print(stopwords.words('english'))

In [None]:

from nltk.tokenize import word_tokenize



In [None]:
# English stopwords as a set, for constant-time membership checks
stopwords_set = {*stopwords.words('english')}

# Keep, per tweet, only the tokens that are not stopwords
df['words_without_stopwords'] = df['words'].map(
    lambda tokens: list(filter(lambda token: token not in stopwords_set, tokens))
)


In [None]:
# Preview the first three rows, including 'words_without_stopwords'.
df.head(3)

### Lemmatization

In [None]:

from nltk.stem import WordNetLemmatizer
# Fetch the 'wordnet' and 'omw-1.4' NLTK resources.
# Presumably these back WordNetLemmatizer, used in the next cell - confirm for the installed NLTK version.
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
# Lemmatise every remaining token of every tweet with a single shared lemmatizer.
# NOTE(review): the later Word2Vec cells read 'words_without_stopwords', not 'words_new'.
lemmatizer = WordNetLemmatizer()
df['words_new'] = df['words_without_stopwords'].map(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

In [None]:
 # Fetch the 'averaged_perceptron_tagger_eng' NLTK resource.
 # Presumably the model behind pos_tag, used in the next section - confirm for the installed NLTK version.
 nltk.download('averaged_perceptron_tagger_eng')

### POS Tagging

In [None]:
from nltk import pos_tag

# Function to tokenize and tag POS
def pos_tagging(text):
    """Tokenise ``text`` with ``word_tokenize`` and return ``pos_tag``'s output for those tokens."""
    return pos_tag(word_tokenize(text))

# Apply POS tagging to the 'tweet_text' column and store the result per tweet.
# Note: 'tweet_text' was lowercased in an earlier cell, so the tagger sees lowercased text.
df['tweet_text_POS'] = df['tweet_text'].apply(pos_tagging)

In [None]:
# Preview the first three rows, including 'tweet_text_POS'.
df.head(3)

In [None]:
# Length of each tweet as returned by len() on the 'tweet_text' value
df['tweet_length'] = df['tweet_text'].map(len)
df.tail(3)

# WORD2VEC

In [None]:
#WORD2VEC

# %pip installs into the environment of the running kernel; "!pip" runs whichever
# pip is first on the shell PATH, which may belong to a different interpreter.
%pip install gensim

In [None]:
from gensim.models import Word2Vec,KeyedVectors

In [None]:
import gensim.downloader as api

# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader
# (presumably fetched over the network on first use - a large download; confirm it is needed).
wv = api.load('word2vec-google-news-300')

# Look up the vector of a single word.
# NOTE(review): neither wv nor vec_king is referenced by any later cell visible in this notebook.
vec_king = wv['king']

In [None]:


# Train a skip-gram Word2Vec model on the notebook's own tokenised tweets.
# NOTE(review): the model is fitted on every row of df, before the train/test split made
# later in the notebook, so tweets that end up in the test set contribute to the embeddings.
# NOTE(review): no seed is passed and workers=4, so vectors may differ between runs -
# confirm whether reproducibility matters here.
skipgram_model = Word2Vec(
    sentences=df['words_without_stopwords'],  # one token list per tweet, stopwords removed
    vector_size=100,  # dimensionality of each word vector
    window=5,  # context window size
    sg=1,  # 1 selects skip-gram
    min_count=1,  # keep every token, even those seen only once
    workers=4,  # number of training threads
    epochs=10  # passes over the corpus
)

# Function to get average word vectors for each tweet
def get_avg_word_vector(tokens, model):
    """Average the vectors of the tokens that ``model.wv`` knows.

    Args:
        tokens: Iterable of tokens for one tweet.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookup by token)
            and ``vector_size``.

    Returns:
        The element-wise mean of the found vectors, or a zero vector of length
        ``model.vector_size`` when none of the tokens is present in ``model.wv``.
    """
    embeddings = []
    for token in tokens:
        if token in model.wv:
            embeddings.append(model.wv[token])
    return np.mean(embeddings, axis=0) if embeddings else np.zeros(model.vector_size)

# One averaged word vector per tweet, computed from its stopword-free tokens
# with the skip-gram model trained above.
df['tweet_vector'] = df['words_without_stopwords'].apply(lambda x: get_avg_word_vector(x, skipgram_model))

# Spread the vectors into a DataFrame with one column per vector component.
# The new frame gets a fresh 0..n-1 index, whereas df still carries the gaps left by drop_duplicates().
tweet_vector_df = pd.DataFrame(df['tweet_vector'].tolist())


In [None]:
# Select the hand-crafted numeric features
numeric_features = df[['num_hashtags', 'num_mentions', 'tweet_length']]

# Combine numeric and text vector features side by side.
# Both indexes are reset first so the rows are aligned by position.


# NOTE(review): X_combined is not used by any later cell visible in this notebook; the
# training cell rebuilds X from 'tweet_vector' alone, so these numeric features never reach the model.
X_combined = pd.concat([numeric_features.reset_index(drop=True), tweet_vector_df.reset_index(drop=True)], axis=1)
y = df['cyberbullying_type']  # Target variable (string labels at this point)


In [None]:
# Preview the first three rows, including 'tweet_vector'.
df.head(3)

## Model Training

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Feature matrix: one averaged Word2Vec vector per tweet, stacked row-wise
X = np.array(df['tweet_vector'].tolist())

# Integer-encode the class labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['cyberbullying_type'])

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Append a trailing axis of length 1 so the 2-D (samples, features) arrays become
# 3-D (samples, timesteps, 1): every vector component is fed to the RNN as one timestep
X_train_rnn = np.expand_dims(X_train, axis=-1)
X_test_rnn = np.expand_dims(X_test, axis=-1)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout, Input

# Assemble the classifier layer by layer
model = Sequential()
model.add(Input(shape=(X_train_rnn.shape[1], 1)))  # (timesteps, 1 feature per step)
model.add(SimpleRNN(64, activation='tanh', return_sequences=False))  # only the final output is passed on
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))  # one probability per class

# Integer-encoded labels pair with the sparse categorical cross-entropy loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the layer and parameter overview
model.summary()


In [None]:
# Train the model and keep the per-epoch metrics in `history`.
# NOTE(review): the held-out test split is also used as validation data, so the
# validation metrics and the final test report are computed on the same rows.
history = model.fit(
    X_train_rnn, y_train,
    validation_data=(X_test_rnn, y_test),
    epochs=10,
    batch_size=32,
    verbose=1
)


In [None]:
# Sanity-check the array shapes fed to the model.
print("Training Data Shape:", X_train_rnn.shape)  # Should be (samples, 100, 1): Word2Vec vector_size is 100
print("Testing Data Shape:", X_test_rnn.shape)    # Should be (samples, 100, 1): Word2Vec vector_size is 100
print("Labels Shape:", y_train.shape, y_test.shape)  # Should match number of samples

In [None]:
# Predictions
# Feed the same 3-D (samples, timesteps, 1) layout the model was trained and validated on;
# the 2-D X_test does not match the model's declared input shape.
y_pred = model.predict(X_test_rnn)
# Most probable class index per test row
y_pred_classes = y_pred.argmax(axis=1)

# Classification Report (per-class metrics, labelled with the original class names)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_))


In [None]:
# Preview the first two rows of the fully processed frame.
df.head(2)