# Preprocessing 

In this section, the dataset is preprocessed so that it is ready for feature extraction.
### Import library

In [None]:
import os
import re
import string
from functools import lru_cache

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download NLTK data files.
# Each resource is checked on its own, so only the packages that are actually
# missing get downloaded. 'punkt_tab' is included because recent NLTK releases
# load the word_tokenize tables from it instead of from 'punkt'; 'punkt' is
# kept for older releases.
NLTK_RESOURCES = {
    'punkt': 'tokenizers/punkt',
    'punkt_tab': 'tokenizers/punkt_tab',
    'stopwords': 'corpora/stopwords',
    'wordnet': 'corpora/wordnet',
}

for package_id, resource_path in NLTK_RESOURCES.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # nltk.data.find signals a missing resource with LookupError
        nltk.download(package_id)

### Import Dataset into DataFrame

In [None]:
# Resolve the "dataset" folder located in the parent of the current working directory
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATADIR = f"{_parent_dir}/dataset"

# Read the raw tweets CSV into the main DataFrame
_tweets_csv = f"{DATADIR}/trump_insults_tweets.csv"
main_df = pd.read_csv(_tweets_csv)

### Initial Data Cleaning

In [None]:
# Drop the first column of the loaded frame.
# NOTE: this reassigns main_df from itself, so re-running the cell drops one
# more column each time. Run it once per fresh load of the CSV.
main_df = main_df.drop(columns=main_df.columns[0])

# Convert 'date' column to datetime
main_df['date'] = pd.to_datetime(main_df['date'])

# Combine all tweets into a single string.
# Missing tweets (NaN) are skipped and every value is coerced to str, because
# str.join raises TypeError as soon as it meets a non-string item.
all_tweets = ' '.join(main_df['tweet'].dropna().astype(str))

main_df.head()

### Define Text Cleaning Function

In [None]:
# Translation table that deletes every character in string.punctuation;
# built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset (built on first use)."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _shared_lemmatizer():
    """Return a single WordNetLemmatizer instance reused by every call."""
    return WordNetLemmatizer()


# function to clean text
def clean_text(text):
    """Normalise one tweet into a space-separated string of lemmatised tokens.

    Steps, in order: lowercase, delete ASCII punctuation (string.punctuation),
    delete digit runs, tokenise with word_tokenize, drop English stop words,
    lemmatise each remaining token with WordNetLemmatizer.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (for example None, or the float
        NaN pandas uses for a missing cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces, or '' when nothing is left.
    """
    # Missing / non-text values have no .lower(); treat them as empty
    if not isinstance(text, str):
        return ''

    # convert text to lowercase
    text = text.lower()

    # remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Nothing but whitespace left: tokenising would yield no tokens anyway
    if not text.strip():
        return ''

    # tokenize the text
    tokens = word_tokenize(text)

    # remove stopwords (set is cached, so the corpus file is read only once)
    # NOTE(review): punctuation is stripped before this filter, so a contraction
    # such as "don't" reaches it as "dont" - presumably not matched by the NLTK
    # list; confirm whether that is intended.
    stop_words = _english_stop_words()
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize the text
    lemmatizer = _shared_lemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # join tokens back into a single string
    return ' '.join(tokens)

### Apply Cleaning Function to Tweets

In [None]:
# Run every raw tweet through clean_text and store the result in a new column
main_df['cleaned_tweet'] = main_df['tweet'].map(clean_text)

# Preview the raw tweets side by side with their cleaned versions
main_df.loc[:, ['tweet', 'cleaned_tweet']].head()

### Save Cleaned Data for Top2Vec
Once the text is cleaned, save it to a new CSV file for use with Top2Vec.

In [None]:
# Write only the cleaned_tweet column to a new CSV file, without the index
cleaned_csv_path = f"{DATADIR}/cleaned_tweets.csv"
main_df.to_csv(cleaned_csv_path, columns=['cleaned_tweet'], index=False)

# Feature Extraction
In this section, feature extraction will be done using Bag of Words (BoW) and Term Frequency-Inverse Document Frequency (TF-IDF).
### Bag of words (BoW)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the vectorizer, then learn the vocabulary and count terms per tweet in one pass
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(main_df['cleaned_tweet'])

# Dense view of the count matrix: one row per tweet, one column per vocabulary term
bow_terms = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(data=X_bow.toarray(), columns=bow_terms)
bow_df.head()

### Checking the BoW DataFrame 
To further check the BoW DataFrame, you can examine specific tweets and their corresponding word counts.

In [None]:
# Display the first tweet and its BoW representation.
# Jupyter renders only the last expression of a cell, so the tweet text is
# printed explicitly; the BoW row is shown as the cell output.
print(main_df['cleaned_tweet'].iloc[0])
bow_df.iloc[0]

In [None]:
# Display the second tweet and its BoW representation.
# Jupyter renders only the last expression of a cell, so the tweet text is
# printed explicitly; the BoW row is shown as the cell output.
print(main_df['cleaned_tweet'].iloc[1])
bow_df.iloc[1]

### Summarising Word Frequencies 

In [None]:
# Total count of each word across all tweets (column-wise sum of the BoW frame)
word_frequencies = bow_df.sum()

# Show the 10 words with the highest totals
word_frequencies.nlargest(n=10)

### Visualising Word Frequencies 

In [None]:
# plot the top 10 most frequent words 
import matplotlib.pyplot as plt
import seaborn as sns

# Two-column frame (word, frequency) holding the 10 largest totals
top_words = (
    word_frequencies
    .nlargest(10)
    .rename_axis('word')
    .reset_index(name='frequency')
)

# Horizontal bar chart drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_words, x='frequency', y='word', hue='frequency', palette='viridis', ax=ax)
ax.set_title('Top 10 Most Frequent Words')
ax.set_xlabel('Frequency')
ax.set_ylabel('Word')
plt.show()

### Term Frequency-Inverse Document Frequency (TF-IDF)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF vectorizer, then fit it and transform the cleaned tweets in one pass
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(main_df['cleaned_tweet'])

# Dense view of the TF-IDF matrix: one row per tweet, one column per term
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=X_tfidf.toarray(), columns=tfidf_terms)
tfidf_df.head()

### Summarising TF-IDF Scores

In [None]:
# Total TF-IDF weight of each term over all tweets (column-wise sum)
tfidf_scores = tfidf_df.sum()

# Two-column frame (term, score) holding the 10 highest totals
top_tfidf = (
    tfidf_scores
    .nlargest(10)
    .rename_axis('term')
    .reset_index(name='score')
)

### Visualise the top TF-IDF scores

In [None]:
# Horizontal bar chart of the 10 highest-scoring terms, drawn on an explicit axes
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_tfidf, x='score', y='term', hue='score', palette='viridis', ax=ax)
ax.set_title('Top 10 Terms by TF-IDF Score')
ax.set_xlabel('TF-IDF Score')
ax.set_ylabel('Terms')
plt.show()