# Import libraries

In [6]:
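# Run this cell once when setting up a fresh environment (uncomment the lines you need).
# !pip install -r requirements.txt
# Only run the following if the NLTK resources have not been downloaded yet.
# Note: `import nltk` must run first, because the import cell comes after this one.
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')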

In [7]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import nltk
from nltk.corpus import stopwords
from tqdm.notebook import tqdm

# Read in the dataframes

In [8]:
# Posts to be analysed; later cells read its 'post' and 'political_leaning' columns.
df_political_leaning = pd.read_csv("data/political_leaning.csv")
# Swear-word lexicon; later cells read its 'Word' column.
df_swearwords = pd.read_csv("data/output.csv")

# Data Cleaning

In [9]:
def preprocess(text):
    """Normalise a raw string and split it into tokens.

    Steps, in order:
      1. lowercase the text;
      2. delete every run of digits;
      3. delete every character that is neither a word character nor
         whitespace (underscores count as word characters and are kept);
      4. tokenize the remainder with ``nltk.word_tokenize``.

    Args:
        text: The raw string to normalise.

    Returns:
        The tokens produced by ``nltk.word_tokenize``.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    without_punctuation = re.sub(r'[^\w\s]', "", without_digits)
    return nltk.word_tokenize(without_punctuation)

def remove_stopwords(tokens, stop_words=None):
    """Drop stop words from a token list, preserving the original order.

    Args:
        tokens: Iterable of token strings.
        stop_words: Optional collection of words to remove. When omitted,
            NLTK's English stop-word list is used.

    Returns:
        A list with every token that is not in ``stop_words``.
    """
    if stop_words is None:
        # Load the English list only once and keep it on the function object:
        # this function is called once per post, and re-reading the corpus and
        # rebuilding the set on every call is pure overhead.
        stop_words = getattr(remove_stopwords, "_english_stop_words", None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            remove_stopwords._english_stop_words = stop_words
    return [word for word in tokens if word not in stop_words]

def lemmatization(tokens):
    """Lemmatize each token with NLTK's WordNet lemmatizer.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list with the lemma of each token, in the same order. Each lemma
        comes from ``WordNetLemmatizer.lemmatize`` called with its default
        arguments.
    """
    wordnet = nltk.WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        lemmas.append(wordnet.lemmatize(token))
    return lemmas

def clean_text(text):
    """Run the full cleaning pipeline on one string.

    The text is passed through ``preprocess`` (normalise and tokenize),
    ``remove_stopwords`` and ``lemmatization``; the resulting lemmas are
    joined back together with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned text as one space-separated string.
    """
    lemmas = lemmatization(remove_stopwords(preprocess(text)))
    return " ".join(lemmas)

In [10]:
# Apply the cleaning function to every post, with a live progress bar.
# Wrapping the finished result in tqdm(...) only decorates the already-computed
# Series, so the bar never reflects the cleaning work. Registering tqdm with
# pandas and using progress_apply updates the bar as each row is processed
# and stores a plain Series in the new column.
tqdm.pandas(desc="Cleaning posts")
df_political_leaning['new'] = df_political_leaning['post'].progress_apply(clean_text)

  0%|          | 0/57231 [00:00<?, ?it/s]

In [11]:
# Count, for each cleaned post, how many of its whitespace-separated tokens
# appear in the 'Word' column of the swear-word lexicon.
# The lookup set is built once: testing membership against the raw array
# would rescan the whole lexicon for every token of every post.
swearword_set = set(df_swearwords['Word'].values)
df_political_leaning['new_column'] = df_political_leaning['new'].apply(
    lambda x: sum(1 for word in x.split() if word in swearword_set)
)
# Give the working columns their final names. 'auhtor_ID' is spelled this way
# on purpose: it presumably matches the header in the source CSV (verify
# against the file before "fixing" it).
df_political_leaning = df_political_leaning.rename(
    columns={
        'new': 'cleaned_post',
        'new_column': 'amount_of_cursewords',
        'auhtor_ID': 'username',
    }
)
df_political_leaning.head()

In [None]:
# Encode political leaning as a number in 'political_leaning_id':
# 'left' -> -1, 'center' -> 0, and every other value -> 1 (intended for 'right').
# NOTE(review): missing or misspelled labels also fall through to 1.
leaning_codes = {'left': -1, 'center': 0}
df_political_leaning['political_leaning_id'] = df_political_leaning['political_leaning'].apply(
    lambda x: leaning_codes.get(x, 1)
)

In [None]:
# Persist the cleaned frame to the working directory. index=True is the pandas
# default, spelled out here because it writes the row index as the first
# (unnamed) column of the CSV.
df_political_leaning.to_csv("cleaned_data.csv", index=True)