In [43]:
import pandas as pd
import re
import os

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from tensorflow.keras.utils import pad_sequences
from collections import Counter
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True, nb_workers = 8)

nltk.download('stopwords')
nltk.download('wordnet')


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/argen7um/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/argen7um/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [44]:
def get_text_and_score(pathes):
    text_score = []
    for path in pathes:
        file_names = os.listdir(path)
        for file_name in file_names:
            with open(f'{path}/{file_name}', 'r') as file:
                text = file.read()
                score = int(file_name.split('.')[0].split('_')[-1])
            text_score.append([text, score])
    return text_score

In [45]:
path = r'../data/aclImdb'
corpus = get_text_and_score([f'{path}/train/pos/', f'{path}/train/neg/', 
                             f'{path}/test/pos/', f'{path}/test/neg/'])

In [46]:
dataset = pd.DataFrame(corpus, columns=('data', 'target'))

texts_count = 50000
dataset = dataset.iloc[:texts_count]

In [47]:
def clean_regex(text):
    text = re.sub('\"{2,3}', '', text)
    text = re.sub(r'[\"\#\$\%\&\'\(\)\*\+\/\:\;\<\=\>\@\[\\\]\^\_\`\{\|\}\~]', ' ', text)
    text = re.sub(r'[.,!?-]', '', text)
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'[^\x00-\x7f]', r'', text)
    
    return text

In [48]:
stopwords = [word for word in set(stopwords.words('english')) if word not in ('no', 'not')]
lemmatizer = WordNetLemmatizer()

In [49]:
max_lenth = 1_000

In [50]:
def clean_regex(text):
    text = re.sub('\"{2,3}', '', text)
    text = re.sub(r'[\"\#\$\%\&\'\(\)\*\+\/\:\;\<\=\>\@\[\\\]\^\_\`\{\|\}\~]', ' ', text)
    text = re.sub(r'[.,!?-]', '', text)
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'[^\x00-\x7f]', r'', text)
    
    return text

def tokenize_text(text):
    return word_tokenize(text)

def split_words_by_space(text):
    return text.split()

def make_lower(word_list):
    return list(map(lambda t: t.lower(), word_list))

def lemmatize(word_list):
    return list(map(lemmatizer.lemmatize, word_list))

def remove_stop_words1(word_list):
    text = join_word_list(word_list)
    text = re.sub(rex_to_rem_stopwords, ' ', text)
    return split_words_by_space(text)


def add_bos_tag(word_list):
    word_list.insert(0, '<bos>')
    return word_list

def limit_sequence(word_list, max_lenth):
    return word_list[:max_lenth]

def join_word_list(word_list):
    return ' '.join(word_list)

def preprocess_text(text):
    word_list = tokenize_text(text)
    lower_word_list = make_lower(word_list)
    lemmatized_word_list = make_lower(lower_word_list)
    word_list_with_bos = add_bos_tag(remove_stop_words1(lemmatized_word_list))
    preprocessed_text = join_word_list(limit_sequence(word_list_with_bos, 1000))
    
    return preprocessed_text

In [51]:
count_words = Counter(nltk.flatten(list(dataset['data'][:].parallel_apply(lambda a: lemmatize(make_lower(tokenize_text(clean_regex(a))))))))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

In [52]:
threshold_word = 5

In [53]:
stopwords = list(set(stopwords).union(set(word for word in count_words if count_words[word] >= threshold_word)))

In [54]:
rex_to_rem_stopwords = '|'.join(f'\s{word}\s' for word in stopwords)

In [58]:
dataset['cleaned text'] = dataset['data'].parallel_apply(clean_regex)
dataset['preprocessed text'] = dataset['cleaned text'].parallel_apply(preprocess_text)
dataset['preprocessed text']

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

0        <bos> this is great story s everything slavery...
1        <bos> a years a got one his friends video the ...
2        <bos> i this a but s fact you not it you from ...
3        <bos> a movie family sacrifice and macy such g...
4        <bos> one the merrie melodies cartoons enterta...
                               ...                        
49995    <bos> if are bit and to some you try one waste...
49996    <bos> r is of only films ve where can say simp...
49997    <bos> ruggero is credited inventing cannibal w...
49998    <bos> stick is of brief of films paired ray ol...
49999    <bos> i t i tried watch filth an bhorror fan w...
Name: preprocessed text, Length: 50000, dtype: object

In [55]:
from nltk.util import ngrams

In [56]:
ngrams(dataset["processed text"])

KeyError: 'processed text'

In [57]:
dataset

Unnamed: 0,data,target
0,This show is a great history story. It's has e...,10
1,"A few years ago, a friend got from one of his ...",10
2,"I like this movie a lot, but it's a fact, that...",10
3,"A brilliant movie about family, guilt, sacrifi...",9
4,One of the many Merrie Melodies cartoons that ...,7
...,...,...
49995,If you are a bit masochistic and like to waste...,1
49996,'R Xmas is one of the only films I've seen whe...,4
49997,Ruggero Deodato is often credited for inventin...,4
49998,"""Stick Around"" is one of the brief series of f...",4
