In [119]:
def load_data(path, test=False):
    """Read a UTF-8 text file and return its lines, whitespace-stripped.

    Args:
        path: Location of the text file to read.
        test: When true, additionally return the text preceding the first
            comma of every line.

    Returns:
        The list of stripped lines, or the tuple ``(ids, lines)`` when
        ``test`` is true. In both cases the lines are returned whole, so
        with ``test=True`` each line still begins with its id field.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        stripped = [raw.strip() for raw in handle]
    if not test:
        return stripped
    # A line without any comma yields the entire line as its id.
    ids = [row.partition(',')[0] for row in stripped]
    return ids, stripped

In [163]:
# Raw training tweets, one list per sentiment class (positive first).
pos, neg = load_data('data/train_pos.txt'), load_data('data/train_neg.txt')

In [164]:
import string

def clean(tweet):
    """Normalise a raw tweet and split it into word tokens.

    The text is lower-cased, the ``<user>`` and ``<url>`` placeholders are
    removed, every character in ``string.punctuation`` is deleted, and the
    remainder is split on whitespace. Tokens made up solely of digits are
    discarded.

    Args:
        tweet: The tweet text.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    text = tweet.lower()
    # Placeholders must go before punctuation stripping, which would
    # otherwise turn them into the plain words "user" / "url".
    for placeholder in ('<user>', '<url>'):
        text = text.replace(placeholder, '')
    strip_punctuation = str.maketrans('', '', string.punctuation)
    tokens = text.translate(strip_punctuation).split()
    return [token for token in tokens if not token.isdigit()]

In [165]:
# Tokenise every tweet of each class with clean().
clean_pos = list(map(clean, pos))
clean_neg = list(map(clean, neg))

In [173]:
from nltk.stem.wordnet import WordNetLemmatizer

def lemmatize(words):
    """Lemmatize each word first as a verb, then as a noun.

    Args:
        words: Iterable of word strings.

    Returns:
        A list with one lemmatized word per input word, in the same order.
    """
    wnl = WordNetLemmatizer()
    # The noun pass runs on the output of the verb pass.
    return [wnl.lemmatize(wnl.lemmatize(word, 'v'), 'n') for word in words]

In [174]:
# Lemmatize the token list of every tweet, class by class.
lemmatized_pos = list(map(lemmatize, clean_pos))
lemmatized_neg = list(map(lemmatize, clean_neg))

In [43]:
def save_cleaned(path, data):
    """Write tokenised tweets to a UTF-8 text file, one tweet per line.

    Args:
        path: Destination file; it is overwritten if it already exists.
        data: Iterable of word lists. Each list is written as a single
            line with its words separated by one space.
    """
    with open(path, 'w', encoding='utf-8') as out:
        out.writelines(' '.join(tokens) + '\n' for tokens in data)

In [46]:
# Persist the lemmatized tweets of each class to its own file.
save_cleaned(path='data/clean_train_pos.txt', data=lemmatized_pos)
save_cleaned(path='data/clean_train_neg.txt', data=lemmatized_neg)

In [175]:
from collections import Counter
# Per-class word frequencies over the lemmatized tweets.
counter_lem_pos = Counter(word for tweet in lemmatized_pos for word in tweet)
counter_lem_neg = Counter(word for tweet in lemmatized_neg for word in tweet)
# Combined frequencies. Every count is positive, so Counter addition keeps
# all entries; keys come out as positive-class words first, followed by
# words seen only in the negative class.
counter_lem = counter_lem_pos + counter_lem_neg

In [186]:
# Words counted more than 50 times across both classes, with their counts.
clean_lem_counter = {word: count for word, count in counter_lem.items() if count > 50}
len(clean_lem_counter)

3330

In [187]:
# Collect the `thresh` most frequent words of each class.
# most_common(n) picks the top n entries directly rather than sorting the
# entire vocabulary and slicing it afterwards; the resulting order
# (including ties) is the same as that of the sliced full ranking.
thresh = 100
most_common_pos = [word for word, _count in counter_lem_pos.most_common(thresh)]
most_common_neg = [word for word, _count in counter_lem_neg.most_common(thresh)]

In [188]:
# Words that rank among the most frequent for both classes, kept in the
# order of the positive-class ranking.
most_common_in_both = [word for word in most_common_pos if word in most_common_neg]

In [189]:
# Work on a copy so counter_lem itself stays intact, then drop every word
# that is among the most frequent in both classes.
clean_lem_counter_2 = counter_lem.copy()
for shared_word in most_common_in_both:
    clean_lem_counter_2.pop(shared_word)

In [192]:
# Discard rare words: keep only those counted more than thresh_2 times.
thresh_2 = 50
chosen_words = [word for word, count in clean_lem_counter_2.items() if count > thresh_2]
len(chosen_words)

3255

In [194]:
# Persist the selected vocabulary, one word per line.
with open('data/chosen_words.txt', 'w', encoding='utf-8') as vocab_file:
    vocab_file.writelines(word + '\n' for word in chosen_words)

* * *