# Peer Project: Team Akatsuki

In [6]:
import os
import re

import pandas as pd

In [7]:
# --- Configuration ---
# Location of the raw tweets CSV. The default is the original project path;
# set the CYBERBULLYING_DATA_PATH environment variable to point at a copy
# stored elsewhere without having to edit the notebook.
file_path = os.environ.get(
    "CYBERBULLYING_DATA_PATH",
    r"D:\Buildables Internship\CyberBullying\data\cyberbullying_tweets.csv",
)

## Load Dataset
    @peers: update `file_path` above if your CSV is stored in a different folder.

In [8]:
# --- Load the dataset ---
# Read the CSV at `file_path` into `df` and preview its first rows.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please ensure it's in the correct directory.")
    # Re-raise instead of calling exit(): exit() is meant for the interactive
    # shell and would end the whole session, whereas re-raising fails only
    # this cell and keeps the traceback visible.
    raise
else:
    # Only reached when the read succeeded, so `df` is guaranteed to exist.
    print(f"Dataset loaded successfully from: {file_path}")
    print("Original DataFrame head:")
    print(df.head())

Dataset loaded successfully from: D:\Buildables Internship\CyberBullying\data\cyberbullying_tweets.csv
Original DataFrame head:
                                          tweet_text cyberbullying_type
0  In other words #katandandre, your food was cra...  not_cyberbullying
1  Why is #aussietv so white? #MKR #theblock #ImA...  not_cyberbullying
2  @XochitlSuckkks a classy whore? Or more red ve...  not_cyberbullying
3  @Jason_Gio meh. :P  thanks for the heads up, b...  not_cyberbullying
4  @RudhoeEnglish This is an ISIS account pretend...  not_cyberbullying


## Data Cleaning

In [9]:
# --- Data Cleaning Steps ---
# Work on a separate 'cleaned_tweet_text' column, seeded from the raw text,
# so 'tweet_text' stays available for comparison after cleaning.
df['cleaned_tweet_text'] = df['tweet_text'].copy()
print("\nApplying cleaning steps...")


Applying cleaning steps...


In [10]:
# 1. Remove URLs
# Drop every token that starts with "http" (this already covers "https") and
# every standalone "www." link. The www branch is anchored on a word boundary
# and requires the dot, so elongated words such as "awwww" are no longer
# truncated by the URL pattern.
df['cleaned_tweet_text'] = df['cleaned_tweet_text'].apply(
    lambda tweet: re.sub(r'http\S+|\bwww\.\S+', '', tweet)
)
print("Step 1: URLs removed.")

Step 1: URLs removed.


In [11]:
# 2. Remove mentions (@username)
# An '@' followed by one or more word characters is deleted outright.
df['cleaned_tweet_text'] = df['cleaned_tweet_text'].map(
    lambda tweet: re.sub(r'@\w+', '', tweet)
)
print("Step 2: Mentions removed.")

Step 2: Mentions removed.


In [12]:
# 3. Remove hashtag symbols (#) but keep the text
# Only the '#' character itself is dropped; the tag word stays in the tweet.
df['cleaned_tweet_text'] = df['cleaned_tweet_text'].map(
    lambda tweet: re.sub(r'#', '', tweet)
)
print("Step 3: Hashtag symbols removed.")

Step 3: Hashtag symbols removed.


In [13]:
# 4. Remove special characters and numbers, keep only letters and spaces
# Every character that is neither an ASCII letter nor whitespace is deleted
# (not replaced by a space), so neighbouring fragments are joined together.
df['cleaned_tweet_text'] = df['cleaned_tweet_text'].map(
    lambda tweet: re.sub(r'[^a-zA-Z\s]', '', tweet)
)
print("Step 4: Special characters and numbers removed.")

Step 4: Special characters and numbers removed.


In [14]:
# 5. Remove extra whitespace (multiple spaces, leading/trailing spaces)
# First collapse each run of whitespace into a single space, then trim
# whatever is left at either end of the tweet.
df['cleaned_tweet_text'] = (
    df['cleaned_tweet_text']
    .map(lambda tweet: re.sub(r'\s+', ' ', tweet))
    .map(str.strip)
)
print("Step 5: Extra whitespace removed.")

Step 5: Extra whitespace removed.


In [15]:
# 6. Convert text to lowercase
# Every value is a str by this point, so str.lower can be mapped directly.
df['cleaned_tweet_text'] = df['cleaned_tweet_text'].map(str.lower)
print("Step 6: Text converted to lowercase.")

Step 6: Text converted to lowercase.


In [16]:
# --- Display the cleaned data ---
# Show the raw text, the cleaned text and the label side by side for the
# first rows so the effect of the cleaning steps can be checked by eye.
print("\nData cleaning complete. Here are the first few rows of the cleaned data:")
print(df.head()[['tweet_text', 'cleaned_tweet_text', 'cyberbullying_type']])


Data cleaning complete. Here are the first few rows of the cleaned data:
                                          tweet_text  \
0  In other words #katandandre, your food was cra...   
1  Why is #aussietv so white? #MKR #theblock #ImA...   
2  @XochitlSuckkks a classy whore? Or more red ve...   
3  @Jason_Gio meh. :P  thanks for the heads up, b...   
4  @RudhoeEnglish This is an ISIS account pretend...   

                                  cleaned_tweet_text cyberbullying_type  
0  in other words katandandre your food was crapi...  not_cyberbullying  
1  why is aussietv so white mkr theblock imaceleb...  not_cyberbullying  
2         a classy whore or more red velvet cupcakes  not_cyberbullying  
3  meh p thanks for the heads up but not too conc...  not_cyberbullying  
4  this is an isis account pretending to be a kur...  not_cyberbullying  


In [17]:
# --- Save the cleaned DataFrame ---
# The destination defaults to the original project location; set the
# CYBERBULLYING_CLEANED_PATH environment variable to write somewhere else
# without having to edit the notebook.
output_file_path = os.environ.get(
    "CYBERBULLYING_CLEANED_PATH",
    r"D:\Buildables Internship\CyberBullying\data\cyberbullying_tweets_cleaned.csv",
)
# index=False keeps the pandas row index out of the written CSV.
df.to_csv(output_file_path, index=False)
print(f"\nCleaned data saved to '{output_file_path}'")


Cleaned data saved to 'D:\Buildables Internship\CyberBullying\data\cyberbullying_tweets_cleaned.csv'
