In [24]:
import os
import glob
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
from joblib import dump
from scipy.sparse import save_npz
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk import WordNetLemmatizer

In [5]:
PATH_DATA = './data/GPComments'

In [9]:
search_path = os.path.join(PATH_DATA, '*.json')
data_files_list = glob.glob(search_path)
data_frame_list = []
for path in tqdm(data_files_list):
    df = pd.read_json(path, orient='records')
    data_frame_list.append(df['text', 'score'])

df = pd.concat(data_frame_list, axis=0)
df.head()

100%|██████████| 250/250 [01:01<00:00,  4.03it/s]


Unnamed: 0,title,text,score
0,,Tik tok is the best app ever,5
1,,I love it,5
2,,Nice,5
3,,GREAT,5
4,,Good,3


In [17]:
print(df['text'].isna().sum())

0
24993888


In [16]:
df.dropna(axis=0, how='all', subset='text', inplace=True)
print(df['text'].isna().sum())

In [19]:
df['labels'] = (df['score'] >= 3).astype(np.int32).values
df.tail()

Unnamed: 0,text,score,labels
99995,Does not load at all,1,0
99996,Its fun,5,1
99997,I love I love it,5,1
99998,It won't let me play it,1,0
99999,Its good,5,1


In [20]:
data_version = 0

In [23]:
class LemmaTokenizer:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        pattern_token = r'(?u)(\b[a-z]{2,}\b|[\u263a-\U0001f645]|!|\?)'
        self.__regex_token = re.compile(pattern_token)

    def __call__(self, sentence):
        tokens = []
        for match in self.__regex_token.finditer(sentence):
            start, end = match.start(), match.end()
            token_text = sentence[start: end]
            token_text_lemmatized = self.lemmatizer.lemmatize(token_text)
            tokens.append(token_text_lemmatized)

        return tokens

In [25]:
tokenizer = LemmaTokenizer()
stop_words_lematized = [tokenizer.lemmatizer.lemmatize(word) for word in ENGLISH_STOP_WORDS]
stop_words_lematized.append('shall')
vectorizer = CountVectorizer(
                            tokenizer=tokenizer,
                            strip_accents='unicode',
                            lowercase=True,
                            stop_words=stop_words_lematized,
                            ngram_range=(1, 2),
                            min_df=1000,
                            binary=True
)

In [26]:
text_vectorized = vectorizer.fit_transform(df['text'])

In [27]:
text_vectorized.shape

(24993888, 19961)

In [28]:
type(text_vectorized)

scipy.sparse.csr.csr_matrix

In [29]:
vectorizer_params = {
    'vocabulary_': vectorizer.vocabulary_,
}
dump(vectorizer_params, f'./data/gp_vectorized/data_headers_{data_version}.sav')

['./data/gp_vectorized/data_headers_0.sav']

In [30]:
save_npz(f'./data/gp_vectorized/data_vectorized_{data_version}.npz', text_vectorized)

In [31]:
df.to_csv('./data/gp_vectorized/labels.csv', columns=['labels'])