In [None]:
import pandas as pd

# Load all datasets. pd.read_csv is called with its default separator (a comma),
# so the three files are read as comma-separated CSVs.
df1 = pd.read_csv('dataset1.csv')
df2 = pd.read_csv('dataset2.csv')
df3 = pd.read_csv('dataset3.csv')  # newly created dataset3

# Process first dataset: every category other than 'not_cyberbullying' becomes
# the positive class (1). The vectorised comparison gives the same result as a
# row-wise apply, including for missing values (NaN != 'not_cyberbullying' -> 1).
df1['label'] = (df1['cyberbullying_type'] != 'not_cyberbullying').astype('int64')
df1 = df1.rename(columns={'tweet_text': 'text'})
df1 = df1[['text', 'label']]  # Keep only needed columns

# Process second dataset: the source label -1 is mapped to 1, anything else to 0.
df2['label'] = (df2['label'] == -1).astype('int64')
df2 = df2.rename(columns={'headline': 'text'})
df2 = df2[['text', 'label']]  # Keep only needed columns

# df3 is used as loaded; it is expected to already provide 'text' and 'label'.

# Merge all datasets into one frame with a fresh 0..n-1 index
merged_df = pd.concat([df1, df2, df3], ignore_index=True)

# Shuffle the rows with a fixed seed so the order is reproducible
merged_df = merged_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Optional: Save merged dataset
merged_df.to_csv('merged_dataset.csv', index=False)

# Show sample
print(merged_df.head())


                                                text  label
0  That moment you wear your natural hair out wit...      1
1  `  ``L`` ..THANKS... will never do anything bu...      0
2  RT @cornfedbeachbum Women comedians suck #nots...      1
3  Happy Pride Month!! Although I do identify as ...      1
4  nevermind  the  has assimilated those puppies ...      0


In [None]:
# merged_df = pd.read_csv('merged_dataset.csv')

In [None]:
import pandas as pd
import re
import string
import contractions
import demoji
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


In [None]:
import nltk

# Resources needed by the cleaning pipeline:
#   stopwords -> stopwords.words('english')
#   punkt / punkt_tab -> word_tokenize (recent NLTK releases load the tokenizer
#                        from 'punkt_tab'; 'punkt' is kept for older releases)
#   wordnet -> WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

# import nltk

# # Download necessary NLTK resources
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt_tab') # Download the missing resource

# # ... (rest of your code) ...

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# English stopwords from the NLTK corpus
stop_words_en = set(stopwords.words('english'))

# Extra stopwords read from a local file: one entry per line, surrounding
# whitespace trimmed, blank lines ignored.
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    hindi_stopwords = {entry for entry in (line.strip() for line in f) if entry}

# Single lookup set used by the cleaning functions (English + Hindi)
stop_words = stop_words_en | hindi_stopwords

# Shared lemmatizer instance
lemmatizer = WordNetLemmatizer()

In [None]:
def remove_emoji(text):
    """Return ``text`` with every emoji found by ``demoji`` replaced by an empty string."""
    replacement = ''
    return demoji.replace(text, replacement)
def remove_all_entities(text):
    """Lowercase ``text`` and strip mentions, links, non-ASCII, punctuation and stopwords.

    Steps, in order:
      1. lowercase and turn carriage returns / newlines into spaces;
      2. delete every '@' or 'http(s)://' together with the run of
         non-whitespace characters that follows it;
      3. delete every character outside the ASCII range;
      4. delete every character listed in ``string.punctuation``;
      5. split on whitespace and drop tokens found in the module-level
         ``stop_words`` set.

    Returns the surviving tokens joined by single spaces.
    """
    flattened = re.sub(r'\r|\n', ' ', text.lower())
    without_links = re.sub(r"(?:\@|https?\://)\S+", "", flattened)
    # NOTE(review): non-ASCII characters are removed before the stopword filter,
    # so any non-ASCII entry in stop_words can never match here; confirm that
    # the extra stopword list is written in ASCII.
    ascii_only = re.sub(r'[^\x00-\x7f]', '', without_links)
    punctuation_free = ascii_only.translate(str.maketrans('', '', string.punctuation))
    kept_tokens = [token for token in punctuation_free.split() if token not in stop_words]
    return ' '.join(kept_tokens)


In [None]:

def clean_hashtags(tweet):
    """Drop hashtags that end the tweet and un-hash the ones inside it.

    A run of whitespace-separated ``#tags`` at the very end of ``tweet`` is
    removed entirely; any other ``#tag`` keeps its word but loses the '#'.
    The result is stripped of leading/trailing whitespace.
    """
    # First pass: cut the trailing block of hashtags (each must follow whitespace).
    body = re.sub(r'(\s+#[\w-]+)+\s*$', '', tweet).strip()
    # Second pass: remaining hashtags become plain words.
    return re.sub(r'#([\w-]+)', r'\1', body).strip()

In [None]:

def remove_chars(text):
    """Blank out every whitespace-separated word that contains '$' or '&'.

    Offending words are replaced by an empty string before re-joining with
    single spaces, so a removed word in the middle leaves two adjacent spaces.
    """
    pieces = []
    for token in text.split():
        has_symbol = '$' in token or '&' in token
        pieces.append('' if has_symbol else token)
    return ' '.join(pieces)

def remove_mult_spaces(text):
    """Replace every run of two or more whitespace characters with one space."""
    whitespace_run = re.compile(r"\s\s+")
    return whitespace_run.sub(" ", text)

In [None]:
def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

def lemmatize(text):
    """Tokenize ``text`` with ``word_tokenize`` and lemmatize each token.

    Every token goes through the module-level ``lemmatizer`` with its default
    arguments; the results are joined back together with single spaces.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(text))

def remove_numbers(text):
    """Delete every run of digits from ``text``; surrounding characters are kept."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

In [None]:

def remove_short_words(text, min_len=2):
    """Keep only the whitespace-separated words of at least ``min_len`` characters.

    Args:
        text: input string.
        min_len: minimum length a word needs to be kept (default 2).

    Returns:
        The kept words joined by single spaces.
    """
    kept = filter(lambda token: len(token) >= min_len, text.split())
    return ' '.join(kept)

def correct_elongated_words(text):
    """Collapse letters repeated three or more times in a row down to two.

    Examples: "cooool" -> "cool", "sooooooo" -> "soo", "aaah" -> "aah".

    The previous pattern put a greedy ``(\\w+)`` prefix in front of the repeated
    run. That prefix swallowed part of the run, so a run of n letters only
    shrank to n-2 (long elongations stayed elongated), a run at the start of a
    word was never matched, and digits were affected too ("1000" -> "10").

    Runs are reduced to two letters rather than one because doubled letters are
    common in real words ("cool", "off") while tripled letters are not, so
    ordinary words are never altered. Only letters are collapsed; digits and
    underscores are left alone.

    Args:
        text: input string.

    Returns:
        ``text`` with every run of 3+ identical letters shortened to 2.
    """
    # [^\W\d_] == "a word character that is neither a digit nor '_'", i.e. a letter.
    return re.sub(r'([^\W\d_])\1{2,}', r'\1\1', text)

def remove_repeated_punctuation(text):
    """Shrink each run of '?', '.' and '!' to its final character.

    The pattern removes every character of such a run that is still followed
    by another '?', '.' or '!', so only the last mark of the run survives
    (e.g. "what???" -> "what?", "wow!?." -> "wow.").
    """
    all_but_last_mark = re.compile(r'[\?\.\!]+(?=[\?\.\!])')
    return all_but_last_mark.sub('', text)

In [None]:
def remove_extra_whitespace(text):
    """Normalise whitespace: split on any whitespace and re-join with single spaces.

    Leading and trailing whitespace disappears as a side effect of the split.
    """
    tokens = text.split()
    return ' '.join(tokens)

def remove_url_shorteners(text):
    """Remove links that point at well-known URL-shortening services.

    A link is an optional ``http://`` / ``https://`` scheme, an optional
    ``www.``, one of the listed shortener domains, a '/', and the rest of the
    non-whitespace run. Matching is case-insensitive.

    Compared with the previous pattern, two guards prevent damage to ordinary
    text:
      * the match may not start directly after a word character, '.' or '-',
        so a domain is not recognised in the middle of a longer word or host
        name (previously "internet.com" was cut down to "interne" because it
        contains "t.co" followed by more characters);
      * a '/' is required after the domain, so only real short links
        (domain + path) are removed.

    Args:
        text: input string, with its punctuation still intact.

    Returns:
        ``text`` with the short links deleted; surrounding whitespace is untouched.
    """
    shortener_link = (
        r'(?<![\w.-])'                 # not in the middle of a word / host name
        r'(?:https?://)?(?:www\.)?'    # optional scheme and www prefix
        r'(?:bit\.ly|goo\.gl|t\.co|tinyurl\.com|tr\.im|is\.gd|cli\.gs|u\.nu'
        r'|url\.ie|tiny\.cc|alturl\.com|ow\.ly|bit\.do|adoro\.to)'
        r'/\S+'                        # mandatory path
    )
    return re.sub(shortener_link, '', text, flags=re.IGNORECASE)

def remove_spaces_tweets(tweet):
    """Return ``tweet`` without leading or trailing whitespace."""
    trimmed = tweet.strip()
    return trimmed

def remove_short_tweets(tweet, min_words=3):
    """Return ``tweet`` unchanged if it has at least ``min_words`` words, else "".

    Args:
        tweet: input string; words are counted by splitting on whitespace.
        min_words: minimum number of words required (default 3).
    """
    word_count = len(tweet.split())
    if word_count < min_words:
        return ""
    return tweet

In [None]:
def clean_tweet(tweet):
    """Run the full cleaning pipeline on one tweet and return the cleaned string.

    The pipeline has two phases:

    1. Punctuation-dependent steps, run while '#', '$', '&', '.', '?' and '!'
       are still in the text: emoji removal, contraction expansion,
       short-link removal, hashtag handling, removal of words containing
       '$' / '&', and collapsing of repeated '?', '.', '!'.
    2. ``remove_all_entities`` (lowercasing, mention/URL/non-ASCII/punctuation/
       stopword removal), followed by the purely word-level steps.

    Previously the hashtag, '$'/'&', repeated-punctuation and short-link steps
    ran after ``remove_all_entities``. Since that function deletes all
    punctuation, those four steps could never match and had no effect.

    Args:
        tweet: raw tweet text. Any non-string value (e.g. NaN for a missing
            cell) is treated as an empty tweet.

    Returns:
        The cleaned tweet, or "" when the input is not a string or fewer than
        three words remain after cleaning.
    """
    # Missing values reach this function as float NaN when applied to a column.
    if not isinstance(tweet, str):
        return ""

    # Phase 1: steps that rely on punctuation still being present.
    tweet = remove_emoji(tweet)
    tweet = expand_contractions(tweet)
    tweet = remove_url_shorteners(tweet)
    tweet = clean_hashtags(tweet)
    tweet = remove_chars(tweet)
    tweet = remove_repeated_punctuation(tweet)

    # Phase 2: strip entities/punctuation/stopwords, then word-level cleanup.
    tweet = remove_all_entities(tweet)
    tweet = remove_mult_spaces(tweet)
    tweet = remove_numbers(tweet)
    tweet = lemmatize(tweet)
    tweet = remove_short_words(tweet)
    tweet = correct_elongated_words(tweet)
    tweet = remove_extra_whitespace(tweet)
    tweet = remove_spaces_tweets(tweet)
    tweet = remove_short_tweets(tweet)
    return tweet

In [None]:
# Run the cleaning pipeline over every raw text and store the result alongside it
cleaned_text = merged_df['text'].map(clean_tweet)
merged_df['cleaned_text'] = cleaned_text


In [None]:
# Keep only the cleaned text and its label, then write that frame to disk.
# NOTE: merged_df is rebound here, so the raw 'text' column is no longer
# available in later cells.
output_columns = ['cleaned_text', 'label']
merged_df = merged_df[output_columns]
merged_df.to_csv('merged_cleaned_dataset.csv', index=False)

In [None]:
# Gather the summary figures first, then report them in one place
label_column = merged_df['label']
label_counts = label_column.value_counts()             # rows per label value
unique_labels = label_column.unique()                  # distinct label values
unique_text_count = merged_df['cleaned_text'].nunique()  # distinct cleaned texts

print("Label Counts:\n", label_counts)
print("\nUnique Labels:", unique_labels)
print("Unique text count:", unique_text_count)


Label Counts:
 label
0    54432
1    51408
Name: count, dtype: int64

Unique Labels: [1 0]
Unique text count: 93553


In [None]:
# One chained pass:
#   1. drop rows whose cleaned text is empty or whitespace-only,
#   2. keep the first row for every distinct cleaned text,
#   3. renumber the index from 0.
has_text = merged_df['cleaned_text'].str.strip().ne('')
merged_df = (
    merged_df[has_text]
    .drop_duplicates(subset='cleaned_text')
    .reset_index(drop=True)
)

# Class distribution after the filtering above
label_counts = merged_df['label'].value_counts()
print("Label counts after cleaning:\n", label_counts)


Label counts after cleaning:
 label
0    48330
1    45222
Name: count, dtype: int64


In [None]:
# Persist the filtered, de-duplicated frame.
# NOTE(review): the file name says "balanced", but no cell visible in this
# notebook resamples the classes; confirm whether balancing is expected.
final_output_path = 'merged_cleaned_dataset_balanced.csv'
merged_df.to_csv(final_output_path, index=False)