In [183]:
import pandas as pd
import os
import re
import ast
from textblob import TextBlob
import contractions
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet, stopwords, words
from collections import Counter
# from spellchecker import SpellChecker

In [127]:
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
#nltk.download('omw-1.4')
# nltk.download('stopwords')
#nltk.download('words')

### Important Word Set

In [143]:
STOPWORDS = set(stopwords.words('english'))
english_words = set(words.words())


def preprocess_text(text):
    text = contractions.fix(text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())  
    words = text.split()  
    words = [word for word in words if word not in STOPWORDS] 
    return words

def filter_non_standard_words(top_words):
    non_standard_words = [word for word, _ in top_words if word not in english_words]
    return non_standard_words

def word_count_in_folder(folder_path, top_k=100):
    all_words = []
    
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.csv'):
            file_path = os.path.join(folder_path, file_name)
            df = pd.read_csv(file_path)
            
            if 'title' in df.columns and 'sentence' in df.columns:
                for _, row in df.iterrows():
                    text = row['title'] + " " + row['sentence']
                    words = preprocess_text(text)
                    all_words.extend(words)  

    word_counts = Counter(all_words)
    top_words = word_counts.most_common(top_k)

    non_standard_words = filter_non_standard_words(top_words)
    
    return non_standard_words

folder_path = 'data' 
file_top_words = word_count_in_folder(folder_path, top_k=100) 
print(file_top_words)

['nt', 'ipod', 'software', 'zen', 'dvd', 'mp3', 'problems', 'features', 'players', '3', 'xtra', 'g3', 'pictures', 'christmas', '2', 'dvds', 'flaws', 'songs', 'using']


The reason we need to look at the important wordset is because this would let us assess if we should use TextBlob's `correct` function to fix the spellings within the text data. However, we have skipped correcting the spellings because these important words would also get translated and might introduce emotion into the data that did not exist before correction.

### Lowercase text data

In [347]:
def data_preprocessing(df):
    
    def selective_lowercase(text):
        return ' '.join([word if word.isupper() else word.lower() for word in text.split()])
    
    df['title'] = df['title'].apply(selective_lowercase)
    df['sentence'] = df['sentence'].apply(selective_lowercase)

    def clean_text(text):
        text = contractions.fix(text)
        # blob = TextBlob(text)
        # expanded_text = str(blob.correct())
        text = re.sub(r'\s+', ' ', text).strip()
        text = re.sub(r'[^a-zA-Z0-9\s!?-]', '', text)
        
        return text
    
    df['title'] = df['title'].apply(clean_text)
    df['sentence'] = df['sentence'].apply(clean_text)

    lemmatizer = WordNetLemmatizer()
    
    def get_wordnet_pos(nltk_pos):
        if nltk_pos.startswith('J'):
            return wordnet.ADJ
        elif nltk_pos.startswith('V'):
            return wordnet.VERB
        elif nltk_pos.startswith('N'):
            return wordnet.NOUN
        elif nltk_pos.startswith('R'):
            return wordnet.ADV
        else:
            return None

    def conditional_lemmatize(text):
        tokens = word_tokenize(text)
        pos_tags = pos_tag(tokens)
        lemmatized_text = []
        
        for token, pos in pos_tags:
            wordnet_pos = get_wordnet_pos(pos)
            if wordnet_pos in [wordnet.VERB, wordnet.NOUN]: 
                lemmatized_text.append(lemmatizer.lemmatize(token, pos=wordnet_pos))
            else:
                lemmatized_text.append(token)
        return ' '.join(lemmatized_text)
    
    df['title'] = df['title'].apply(conditional_lemmatize)
    df['sentence'] = df['sentence'].apply(conditional_lemmatize)
    
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

    # NLTK Tokenizer
    #df['title'] = df['title'].apply(word_tokenize)
    #df['sentence'] = df['sentence'].apply(word_tokenize)
    
    def tokenize_text(title, sentence):
        title_encoding = tokenizer(title, padding='max_length', truncation=True, return_tensors="pt", max_length=128)
        sentence_encoding = tokenizer(sentence, padding='max_length', truncation=True, return_tensors="pt", max_length=128)
        
        return title_encoding['input_ids'].flatten().tolist(), title_encoding['attention_mask'].flatten().tolist(), \
               sentence_encoding['input_ids'].flatten().tolist(), sentence_encoding['attention_mask'].flatten().tolist()
    
    
    df[['title_input_ids', 'title_attention_mask', 'sentence_input_ids', 'sentence_attention_mask']] = df.apply(
        lambda row: tokenize_text(row['title'], row['sentence']), axis=1, result_type='expand'
    )

    return df

In [349]:
data = {}
for _ in os.listdir():
    if _.endswith('.csv'):
        df = pd.read_csv('data/'+_)
        data[_] = data_preprocessing(df)
        data[_].to_csv('processed_data/' + _.replace('.csv', '_updated.csv'))