In [None]:
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from src.data_cleaning import clean_tweet

In [None]:
# Read the raw tweet dataset from disk into a DataFrame and preview its first rows
df = pd.read_csv(filepath_or_buffer='data/cyberbullying_tweets.csv')
df.head(n=5)

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes,
# memory usage) to check for missing values and unexpected column types
df.info()

In [None]:
# Give the feature and label columns shorter names, then discard every row that
# is an exact duplicate of an earlier row (the first occurrence is kept)
df = (
    df
    .rename(columns={'tweet_text': 'tweet', 'cyberbullying_type': 'category'})
    .drop_duplicates()
)

In [None]:
# Check the distribution of tweets across cyberbullying categories
category_counts = df['category'].value_counts()

# Use one distinct colour per category. A fixed four-colour list is cycled by
# pie() as soon as there are more than four categories, which makes different
# wedges share a colour; sizing the palette from the data avoids that.
wedge_colors = sns.color_palette('husl', n_colors=len(category_counts))

# Create a pie chart (explicit figure/axes interface instead of pyplot state)
fig, ax = plt.subplots(figsize=(4, 4))
ax.pie(
    category_counts,
    labels=category_counts.index,
    autopct='%1.1f%%',  # annotate each wedge with its share, one decimal place
    startangle=90,
    colors=wedge_colors,
)
ax.set_title('Number of tweets for each category')
plt.show()

In [None]:
# Clean tweets using the helper defined in src, then drop tweets whose cleaned
# text duplicates an earlier one.

# Define stop words for text cleaning.
# NOTE: this needs the NLTK 'stopwords' corpus to be available locally
# (e.g. via nltk.download('stopwords')); otherwise the lookup raises LookupError.
stop_words = set(stopwords.words('english'))
# Initialize lemmatizer for text cleaning
lemmatizer = WordNetLemmatizer()

# clean_tweet is applied to every raw tweet; presumably it normalizes the text
# with the given lemmatizer and stop words - see src.data_cleaning to confirm.
# The result is built as a new frame (assign + drop_duplicates) instead of
# adding the column and de-duplicating in place, because df was itself produced
# by filtering an earlier frame and in-place edits on such frames are fragile.
# The first occurrence of each cleaned text is kept.
df = df.assign(
    clean_tweet=[clean_tweet(tweet, lemmatizer, stop_words) for tweet in df['tweet']]
).drop_duplicates(subset='clean_tweet')

# Recompute the category distribution on the cleaned, de-duplicated data
category_counts = df['category'].value_counts()

# Use one distinct colour per category. A fixed four-colour list is cycled by
# pie() as soon as there are more than four categories, which makes different
# wedges share a colour; sizing the palette from the data avoids that.
wedge_colors = sns.color_palette('husl', n_colors=len(category_counts))

# Create a pie chart (explicit figure/axes interface instead of pyplot state)
fig, ax = plt.subplots(figsize=(4, 4))
ax.pie(
    category_counts,
    labels=category_counts.index,
    autopct='%1.1f%%',  # annotate each wedge with its share, one decimal place
    startangle=90,
    colors=wedge_colors,
)
ax.set_title('Number of tweets for each category')
plt.show()

# Observation from the author's run: after cleaning, the "other_cyberbullying"
# category shrinks to around 12% of the tweets, which the author attributes to
# the category being too generic.

In [None]:
# Drop the "other_cyberbullying" category: it is inconsistent and too generic
df = df.loc[df['category'].ne('other_cyberbullying')]