# Preprocessing 

### Import libraries

In [None]:
import pandas as pd 
import re
import string
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Make sure the NLTK tokenizer, stopword and WordNet data are installed.
# Each resource is looked up on its own so that only the packages that are
# actually missing are downloaded, instead of requesting all three whenever
# any single lookup fails.
for resource_path, package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

In [None]:
# Read the tweets CSV from the "dataset" folder located in the parent of the
# current working directory
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATADIR = parent_dir + "/dataset"
main_df = pd.read_csv(DATADIR + "/trump_insults_tweets.csv")

In [None]:
# Remove the first column of the frame
first_column = main_df.columns[0]
main_df = main_df.drop(first_column, axis=1)

# Parse the 'date' column into datetime values
parsed_dates = pd.to_datetime(main_df['date'])
main_df['date'] = parsed_dates

# Concatenate every tweet into one space-separated string
all_tweets = ' '.join(main_df['tweet'].tolist())

main_df.head()

In [None]:
# Helpers shared by every clean_text call. They are built lazily, once, so the
# stopword corpus is not re-read and the lemmatizer is not re-created per tweet.
_CLEAN_TEXT_CACHE = {}

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _clean_text_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _CLEAN_TEXT_CACHE:
        # Build both objects before touching the cache so a failure part-way
        # through never leaves a half-filled cache behind.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _CLEAN_TEXT_CACHE.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _CLEAN_TEXT_CACHE['stop_words'], _CLEAN_TEXT_CACHE['lemmatizer']


def clean_text(text):
    """Normalise one tweet for topic modelling.

    The text is lowercased, stripped of ASCII punctuation and digits,
    tokenized with ``word_tokenize``, filtered against NLTK's English
    stopword list and lemmatized with ``WordNetLemmatizer``.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (for example the
            NaN pandas uses for a missing value) is treated as empty.

    Returns:
        The remaining lemmatized tokens joined by single spaces; ``''`` when
        nothing is left or when ``text`` is not a string.
    """
    # Missing values arrive as float NaN / None when used with Series.apply;
    # there is nothing to clean, so return an empty document instead of
    # failing on ``.lower()``.
    if not isinstance(text, str):
        return ''

    stop_words, lemmatizer = _clean_text_resources()

    # convert text to lowercase
    text = text.lower()

    # Drop stopwords while apostrophes are still present. Once punctuation is
    # removed, a contraction such as "don't" becomes "dont", which no longer
    # equals the stopword entry and would slip through the later filter.
    # Surrounding punctuation is trimmed only for the comparison.
    kept_words = [
        word for word in text.split()
        if word.strip(string.punctuation) not in stop_words
    ]
    text = ' '.join(kept_words)

    # remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # remove numbers
    text = re.sub(r'\d+', '', text)

    # tokenize, drop the stopwords that only become visible after the
    # punctuation/digit removal and tokenization, then lemmatize
    tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]

    # join tokens back into a single string
    return ' '.join(tokens)

In [None]:
# Run every tweet through clean_text and store the result in a new column
cleaned_tweets = main_df['tweet'].apply(clean_text)
main_df['cleaned_tweet'] = cleaned_tweets

# Preview the raw and the cleaned tweets side by side
main_df.loc[:, ['tweet', 'cleaned_tweet']].head()

### Save Cleaned Data for Top2Vec
Once the text is cleaned, save it to a new CSV file for use with Top2Vec.

In [None]:
# Write only the cleaned_tweet column to a new CSV file, without the index
cleaned_csv_path = f"{DATADIR}/cleaned_tweets.csv"
main_df.to_csv(cleaned_csv_path, columns=['cleaned_tweet'], index=False)