In [48]:
import pandas as pd

# Load both sources. read_csv is called with its default delimiter (comma);
# pass sep='\t' explicitly if the inputs are ever replaced by tab-separated files.
df1 = pd.read_csv('dataset1.csv')
df2 = pd.read_csv('dataset2.csv')

# Process first dataset: 'not_cyberbullying' -> 0, every other category -> 1
df1['label'] = df1['cyberbullying_type'].ne('not_cyberbullying').astype('int64')
df1 = df1.rename(columns={'tweet_text': 'text'})
df1 = df1[['text', 'label']]  # Keep only needed columns

# Process second dataset: source label -1 -> 1, every other value -> 0
df2['label'] = df2['label'].eq(-1).astype('int64')
df2 = df2.rename(columns={'headline': 'text'})
df2 = df2[['text', 'label']]  # Keep only needed columns

# Merge datasets into one frame with a fresh 0..n-1 index
merged_df = pd.concat([df1, df2], ignore_index=True)

# Optional: Save merged dataset
merged_df.to_csv('merged_dataset.csv', index=False)

# Show sample
print(merged_df.head())

                                                text  label
0  In other words #katandandre, your food was cra...      0
1  Why is #aussietv so white? #MKR #theblock #ImA...      0
2  @XochitlSuckkks a classy whore? Or more red ve...      0
3  @Jason_Gio meh. :P  thanks for the heads up, b...      0
4  @RudhoeEnglish This is an ISIS account pretend...      0


In [49]:
import pandas as pd
import re
import string
import demoji
import contractions
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


In [50]:
import nltk
# Fetch the NLTK data packages this notebook relies on. When a package is
# already present, nltk.download only reports it as up-to-date.
nltk.download('stopwords')  # word lists read through stopwords.words('english')
nltk.download('punkt')  # presumably the tokenizer data behind word_tokenize - TODO confirm
nltk.download('wordnet')  # presumably the corpus used by WordNetLemmatizer - TODO confirm
nltk.download('punkt_tab')  # presumably the tokenizer tables newer NLTK releases need for word_tokenize - TODO confirm

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/adimundada/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/adimundada/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/adimundada/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/adimundada/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [51]:
# English stopwords provided by NLTK
stop_words_en = set(stopwords.words('english'))

# Additional stopwords read from the local stopwords.txt, one entry per line;
# surrounding whitespace is trimmed and blank lines are skipped
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    hindi_stopwords = {entry for entry in (raw.strip() for raw in f) if entry}

# Single lookup set combining both lists
stop_words = stop_words_en | hindi_stopwords

# One lemmatizer instance shared by the cleaning helpers
lemmatizer = WordNetLemmatizer()

In [52]:
def remove_emoji(text):
    """Return `text` after passing it to demoji.replace with an empty replacement string."""
    without_emoji = demoji.replace(text, '')
    return without_emoji
def remove_all_entities(text):
    """Lower-case `text` and strip mentions, links, non-ASCII characters,
    punctuation and stopwords.

    Steps, in order:
      1. lower-case and turn carriage returns / newlines into spaces;
      2. drop every token that starts with '@', 'http://' or 'https://';
      3. delete all characters outside the ASCII range;
      4. delete every character listed in string.punctuation;
      5. drop whitespace-separated words found in the module-level `stop_words`.

    Returns the surviving words joined by single spaces.
    """
    lowered = text.lower()
    flattened = re.sub(r'\r|\n', ' ', lowered)
    no_links = re.sub(r"(?:\@|https?\://)\S+", "", flattened)
    ascii_only = re.sub(r'[^\x00-\x7f]', '', no_links)
    no_punct = ascii_only.translate(str.maketrans('', '', string.punctuation))
    kept = [token for token in no_punct.split() if token not in stop_words]
    return ' '.join(kept)


In [53]:

def clean_hashtags(tweet):
    """Drop the run of hashtags that closes a tweet and un-hash the others.

    Hashtags at the very end of the tweet (each preceded by whitespace) are
    removed entirely; any remaining '#word' keeps the word and loses the '#'.
    The result is stripped of leading/trailing whitespace.
    """
    trailing_tags = re.compile(r'(\s+#[\w-]+)+\s*$')
    inline_tag = re.compile(r'#([\w-]+)')
    body = trailing_tags.sub('', tweet).strip()
    return inline_tag.sub(r'\1', body).strip()

In [54]:

def remove_chars(text):
    """Blank out every whitespace-separated word containing '$' or '&'.

    A blanked word still occupies a slot in the join, so its neighbours end
    up separated by two spaces.
    """
    slots = []
    for word in text.split():
        if '$' in word or '&' in word:
            slots.append('')
        else:
            slots.append(word)
    return ' '.join(slots)

def remove_mult_spaces(text):
    """Replace each run of two or more whitespace characters with one space."""
    whitespace_run = re.compile(r"\s\s+")
    return whitespace_run.sub(" ", text)

In [55]:
def expand_contractions(text):
    """Return the result of running `text` through contractions.fix."""
    expanded = contractions.fix(text)
    return expanded

def lemmatize(text):
    """Tokenize `text` with word_tokenize, lemmatize each token with the
    shared `lemmatizer`, and return the tokens joined by single spaces."""
    tokens = word_tokenize(text)
    return ' '.join(map(lemmatizer.lemmatize, tokens))

def remove_numbers(text):
    """Delete every run of digits from `text`; all other characters are kept."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

In [56]:

def remove_short_words(text, min_len=2):
    """Keep only the whitespace-separated words with at least `min_len`
    characters and return them joined by single spaces."""
    kept = filter(lambda token: len(token) >= min_len, text.split())
    return ' '.join(kept)

def correct_elongated_words(text):
    """Collapse every run of three or more identical word characters to one.

    The previous pattern used a greedy word prefix, so the result depended on
    the run length ("coool" -> "col" but "cooool" -> "cool"), only one run per
    word was fixed, and a run at the start of a word was missed. Matching the
    run directly makes the outcome uniform: "coool" and "cooool" both become
    "col", "yeeesss" becomes "yes", "aaah" becomes "ah".

    Runs of exactly two characters ("cool", "too") are left untouched.
    """
    # (\w)\1{2,}: one word character followed by at least two more copies of it
    return re.sub(r'(\w)\1{2,}', r'\1', text)

def remove_repeated_punctuation(text):
    """Reduce each run of '?', '.' and '!' characters to its final character.

    Everything in the run that is still followed by another of these three
    characters is deleted, so "what???" becomes "what?" and "hi!?." becomes "hi.".
    """
    leading_marks = re.compile(r'[\?\.\!]+(?=[\?\.\!])')
    return leading_marks.sub('', text)

In [57]:
def remove_extra_whitespace(text):
    """Split `text` on whitespace and re-join the pieces with single spaces,
    which also drops leading and trailing whitespace."""
    pieces = text.split()
    return ' '.join(pieces)

def remove_url_shorteners(text):
    """Delete links that point at known URL-shortener hosts.

    A link is an optional 'http://' / 'https://' scheme, an optional 'www.',
    one of the shortener hosts, then '/' and the non-whitespace path that
    follows. Only the link is removed; surrounding whitespace is kept.

    The previous pattern had no left boundary and accepted any characters
    after the host, so it also cut into ordinary domains: 't.co' matched
    inside 'reddit.com' and turned it into 'reddi'. The host must now start
    at a token boundary and be followed directly by '/'.
    """
    shortener_hosts = (
        'bit.ly', 'goo.gl', 't.co', 'tinyurl.com', 'tr.im', 'is.gd', 'cli.gs',
        'u.nu', 'url.ie', 'tiny.cc', 'alturl.com', 'ow.ly', 'bit.do', 'adoro.to',
    )
    hosts = '|'.join(re.escape(host) for host in shortener_hosts)
    # (?<![\w.-]) keeps the match from starting in the middle of a longer host name
    pattern = r'(?<![\w.-])(?:https?://)?(?:www\.)?(?:' + hosts + r')/\S+'
    return re.sub(pattern, '', text)

def remove_spaces_tweets(tweet):
    """Return `tweet` without leading or trailing whitespace."""
    trimmed = tweet.strip()
    return trimmed

def remove_short_tweets(tweet, min_words=3):
    """Return `tweet` unchanged when it has at least `min_words`
    whitespace-separated words; otherwise return an empty string."""
    if len(tweet.split()) < min_words:
        return ""
    return tweet

In [58]:
def clean_tweet(tweet):
    """Run the full cleaning pipeline on one tweet and return the cleaned text.

    Returns an empty string when `tweet` is not a string (for example a NaN
    from a missing CSV field) or when fewer than three words survive cleaning
    (the default threshold of remove_short_tweets).

    Order matters: remove_all_entities deletes every punctuation character,
    so the steps that look for '#', '$', '&', '.', '/', '?' or '!' have to run
    before it. They previously ran after it and could never match.
    """
    # Missing values have nothing to clean and would fail in the string helpers.
    if not isinstance(tweet, str):
        return ""

    tweet = remove_emoji(tweet)
    tweet = expand_contractions(tweet)

    # Symbol-dependent steps: must see the text before punctuation is stripped.
    tweet = remove_url_shorteners(tweet)
    tweet = clean_hashtags(tweet)
    tweet = remove_chars(tweet)
    tweet = remove_repeated_punctuation(tweet)

    # Lower-cases, drops mentions/links/non-ASCII/punctuation/stopwords.
    tweet = remove_all_entities(tweet)

    tweet = remove_mult_spaces(tweet)
    tweet = remove_numbers(tweet)
    tweet = lemmatize(tweet)
    tweet = remove_short_words(tweet)
    tweet = correct_elongated_words(tweet)
    tweet = remove_extra_whitespace(tweet)
    tweet = remove_spaces_tweets(tweet)
    tweet = remove_short_tweets(tweet)
    return tweet

In [None]:
# Run clean_tweet on every entry of 'text' and store the result in a new
# 'cleaned_text' column next to the original
merged_df['cleaned_text'] = merged_df['text'].map(clean_tweet)



In [60]:
# Keep only the cleaned text and its label, then write them to disk
output_columns = ['cleaned_text', 'label']
merged_df = merged_df[output_columns]
merged_df.to_csv('merged_cleaned_dataset.csv', index=False)