# Imports

In [8]:
import pandas as pd

from tqdm import tqdm
tqdm.pandas()
from os.path import join as path_join

from config import DATA_DIR

# Load data

In [9]:
data = pd.read_parquet(path_join(DATA_DIR, 'movie_reviews.parquet'))
print(data.shape)
data.sample(5, random_state=124)

(2000, 2)


Unnamed: 0,text,target
1769,men in black is an explosive mix of science fi...,1
1473,"a sci fi / comedy starring jack nicholson , pi...",1
1840,the question isn ' t why has grease been reiss...,1
1537,before even seeing a single frame of the film ...,1
1866,""" through a spyglass , i could see everything ...",1


In [10]:
data.target.value_counts()

1    1000
0    1000
Name: target, dtype: int64

# If all the revies are on english?

In [12]:
from langdetect import detect_langs as lang_detector
from langdetect.lang_detect_exception import LangDetectException

In [13]:
def detect_langs(text: str, default='en', top_rated: int=3, threshold: float=0.1) -> list:
    """
    Function returns 3 the most probable languages the text is written on.
    Probability of each languages have to beat threshold: 0.5 to be put in the returned list
    :param text: text to detect language it was written on
    :param default: default language to return if error was caught
    :param top_rated: how many languages to return
    :param threshold: threshold to filter the languages with smaller probability.
                      Have to be between values: [0.1, 1.0)
                      PS: If threshold is bigger than 0.5 it is logically that the function returns a list of one item.
    :return: list of languages. first is the most possible
    """
    try:
        preds = lang_detector(text.lower())
        if preds == 'unknown':
            return [default]
        preds = sorted(filter(lambda x: x.prob >= threshold, preds), reverse=True, key=lambda x: x.prob)[:top_rated]
        return [pred.lang for pred in preds]
    except LangDetectException:
        logging.debug('LangDetectException', exc_info=1)
        return [default]

In [18]:
langs = data.text.progress_apply(lambda x: detect_langs(x, default='unknown', top_rated=1)[0])
is_english = all(map(lambda x: x == 'en', langs))
print(f'Is all text on english language: {is_english}')
print(pd.Series(langs).value_counts())

Is all text on english language: True
en    2000
Name: text, dtype: int64


# Set of symbols

In [22]:
import re
from itertools import chain

In [28]:
symbols = set(chain(*data.text))
chars  = set()
spec   = set()
digits = set()
for sym in symbols:
    if sym.isalpha():
        chars.update(sym)
    elif sym.isdigit():
        digits.update(sym)
    else:
        spec.update(sym)
print(f'Chars  : {sorted(chars)}')
print(f'Digits : {sorted(digits)}')
print(f'Spec   : {sorted(spec)}')

Chars  : ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Digits : ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Spec   : ['\x05', '\x12', '\x13', '\x14', '\x16', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


# What phrases are there

In [29]:
import re

In [56]:
print(re.search(r"(https?\://)\S+", plain_text))
print(re.search(r"(www\://)\S+", plain_text))
print(re.search(r"@[^:| ]+:? ?", plain_text))
print(re.search(r"#+ ?", plain_text))
print(re.search(r'[-+]?\d+(\.[0-9]*)?', plain_text))

None
None
<re.Match object; span=(645863, 645868), match='@$&% '>
<re.Match object; span=(60007, 60009), match='# '>
<re.Match object; span=(2309, 2311), match='20'>


# Create positive/negative

In [25]:
import os
import re
from config import DATA_DIR

from typing import Tuple, List

In [19]:
def get_dictionaries() -> Tuple[List[str], List[str]]:
    with open(os.path.join(DATA_DIR, 'word_dictionaries/negative.txt'), 'r') as file:
        negative = [s.strip().lower() for s in file.readlines()]
    with open(os.path.join(DATA_DIR, 'word_dictionaries/positive.txt'), 'r') as file:
        positive = [s.strip().lower() for s in file.readlines()]
    return negative, positive


NEGATIVE, POSITIVE = get_dictionaries()
plain_text = ' '.join(data['text'])

In [20]:
def update_dictionary(dictionary, text):
    new_dict = {word for word in dictionary if word in text}
    len_new_dict = len(new_dict)
    print(f'New dict include {len_new_dict} words ({len_new_dict / len(dictionary)*100:.2f}%)')
    return new_dict

NEGATIVE = update_dictionary(NEGATIVE, plain_text)
POSITIVE = update_dictionary(POSITIVE, plain_text)

New dict include 3387 words (70.81%)
New dict include 1527 words (76.12%)


In [24]:
# Count of negative positive words
d = {f'not {word}' for word in POSITIVE if f'not {word}' in plain_text}
print(f'Negative postive words: {len(d)} ({len(d) / len(POSITIVE)*100:.2f}%)')

Negative postive words: 140 (9.17%)


In [31]:
NEGATIVE.update(d)

In [37]:
# Do not like pattern
len(re.findall(r"(do not)|(don't)|(does not)|(doesn't)|(didn't)|(did not)|(has not)|(hasn't)|(have not)|(haven't) like", plain_text))

711

In [33]:
def write_dictionaries(positive, negative):
    with open(os.path.join(DATA_DIR, 'word_dictionaries/updated_negative.txt'), 'w') as file:
        file.write('\n'.join(negative))
    with open(os.path.join(DATA_DIR, 'word_dictionaries/updated_positive.txt'), 'w') as file:
        file.write('\n'.join(positive))
write_dictionaries(positive=POSITIVE, negative=NEGATIVE)

# Get biger dataset

In [13]:
import os
from tqdm import tqdm
import pandas as pd

from config import DATA_DIR

def get_texts(path, label, verbose=False):
    texts = []
    for file in tqdm(os.listdir(path), desc='Read data'):
        with open(os.path.join(path, file), 'r') as file:
            texts.append(file.read())
    if verbose:
        print(f'Data shape: {len(texts)}')
    return pd.DataFrame({'text': texts, 'target': label})

positive = get_texts(os.path.join(DATA_DIR, 'aclImdb/train/pos'), label=1, verbose=False)
negative = get_texts(os.path.join(DATA_DIR, 'aclImdb/train/neg'), label=0, verbose=False)
pos_test = get_texts(os.path.join(DATA_DIR, 'aclImdb/test/pos'), label=1, verbose=False)
neg_test = get_texts(os.path.join(DATA_DIR, 'aclImdb/test/neg'), label=0, verbose=False)

big_data = pd.concat([positive, negative, pos_test, neg_test], axis=0)
print(big_data.shape)
big_data.sample(4)

Read data: 100%|██████████| 12500/12500 [00:02<00:00, 4550.08it/s]
Read data: 100%|██████████| 12500/12500 [00:00<00:00, 15801.96it/s]
Read data: 100%|██████████| 12500/12500 [00:00<00:00, 39623.39it/s]
Read data: 100%|██████████| 12500/12500 [00:00<00:00, 42607.41it/s]


(50000, 2)


Unnamed: 0,text,target
2659,"Comparable to Fight Club, The Matrix, A.I., Si...",1
5056,Bloodsuckers has the potential to be a somewha...,0
3337,This is another fantasy favorite from Ralph Ba...,1
1569,The story deals about Jet Li who has to fight ...,1


### We need remove from this dataset data that are on the test data

In [14]:
from preprocessing import Tokenizer
tqdm.pandas()

In [29]:
tokenizer = Tokenizer(stem = 'lem', splitter = None, remove_spec = True)

data['text'] = data.text.str.join(' ')
data['text'] = data.text.progress_apply(tokenizer.tokenize)
big_data['text'] = big_data.text.progress_apply(tokenizer.tokenize)



  0%|          | 0/2000 [00:00<?, ?it/s][A[A

  1%|▏         | 28/2000 [00:00<00:07, 275.52it/s][A[A

  3%|▎         | 67/2000 [00:00<00:06, 301.55it/s][A[A

  5%|▍         | 99/2000 [00:00<00:06, 306.40it/s][A[A

  7%|▋         | 135/2000 [00:00<00:05, 320.47it/s][A[A

  9%|▊         | 172/2000 [00:00<00:05, 332.16it/s][A[A

 10%|█         | 209/2000 [00:00<00:05, 340.31it/s][A[A

 12%|█▏        | 247/2000 [00:00<00:05, 349.37it/s][A[A

 14%|█▍        | 281/2000 [00:00<00:04, 345.76it/s][A[A

 16%|█▌        | 318/2000 [00:00<00:04, 351.28it/s][A[A

 18%|█▊        | 358/2000 [00:01<00:04, 361.81it/s][A[A

 20%|█▉        | 399/2000 [00:01<00:04, 374.49it/s][A[A

 22%|██▏       | 437/2000 [00:01<00:04, 355.35it/s][A[A

 24%|██▍       | 476/2000 [00:01<00:04, 364.65it/s][A[A

 26%|██▌       | 513/2000 [00:01<00:04, 356.87it/s][A[A

 27%|██▋       | 549/2000 [00:01<00:04, 353.48it/s][A[A

 29%|██▉       | 585/2000 [00:01<00:03, 354.50it/s][A[A

 31%|███ 

In [54]:
from sklearn.externals.joblib import Parallel, delayed
from multiprocessing import cpu_count
import numpy as np

BIG_DATA_TEXTS = big_data.text
    
def find_similar(target_text, threshold=0.8):
    similar = []
    target_units = set(target_text)
    for ix, sample in enumerate(BIG_DATA_TEXTS):
        units = set(sample)
        if len(units & target_units) / len(units | target_units) > threshold:
            similar.append(ix)
    return similar

def apply_func_to_batch(batch,func, axis=0,**kwds):
    if axis:
        return batch.apply(func,axis=axis, **kwds)
    return batch.apply(func,**kwds)

def progress_apply_multiprocessing_series(self, func, n_jobs=None ,batch_on_core=1 ,verbose = False ,
                                          leave=True , **kwds):
    if not n_jobs:
        n_jobs = cpu_count()
    
    values = np.array_split(self, n_jobs*batch_on_core)
    if verbose:
        values = tqdm(values,leave=leave)
        
    parallel = Parallel(n_jobs, backend='multiprocessing')
    
    tqdm.pandas(leave = False)
    result = parallel(delayed(apply_func_to_batch)(batch,func,**kwds) for batch in values)
    tqdm.pandas(leave = True)
    
    return pd.concat(result)

pd.Series.apply_multiproces = progress_apply_multiprocessing_series
similars = data.text.apply_multiproces(find_similar, verbose=True)









  0%|          | 0/4 [00:00<?, ?it/s][A[A[A[A[A[A[A[A







 25%|██▌       | 1/4 [00:00<00:01,  2.89it/s][A[A[A[A[A[A[A[A







100%|██████████| 4/4 [00:00<00:00,  9.19it/s][A[A[A[A[A[A[A[A

In [56]:
similars.apply(bool).sum()

0

In [59]:
big_data['text'] = big_data.text.str.join(' ')
big_data.to_parquet(os.path.join(DATA_DIR, 'big_data.parquet'))