In [1]:
import os
import glob

from tqdm import tqdm
import numpy as np
import pandas as pd
from joblib import dump
from scipy.sparse import save_npz
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from utils.lemma_tokenizer import LemmaTokenizer

In [3]:
PATH_DATA = './data/GPCommentsRaw'

In [4]:
search_path = os.path.join(PATH_DATA, '*.json')
data_files_list = glob.glob(search_path)
data_frame_list = []
for path in tqdm(data_files_list):
    df = pd.read_json(path, orient='records')
    data_frame_list.append(df[['text', 'score']])

df = pd.concat(data_frame_list, axis=0)
df.head()

100%|██████████| 250/250 [01:13<00:00,  3.39it/s]


Unnamed: 0,text,score
0,Tik tok is the best app ever,5
1,I love it,5
2,Nice,5
3,GREAT,5
4,Good,3


In [6]:
print(df['text'].isna().sum())

6112
6112


In [8]:
df.dropna(axis=0, how='all', subset='text', inplace=True)
print(df['text'].isna().sum())

0


In [6]:
df['labels'] = (df['score'] >= 3).astype(np.int32).values
df.tail()

Unnamed: 0,text,score,labels
99995,Does not load at all,1,0
99996,Its fun,5,1
99997,I love I love it,5,1
99998,It won't let me play it,1,0
99999,Its good,5,1


In [7]:
data_version = 0

In [8]:
pattern_token = r'(?u)(\b[a-z]{2,}\b|[\u263a-\U0001f645]|!|\?)'
tokenizer = LemmaTokenizer(pattern_token)
stop_words_lematized = [tokenizer.lemmatizer.lemmatize(word) for word in ENGLISH_STOP_WORDS]
stop_words_lematized.append('shall')
vectorizer = CountVectorizer(
                            tokenizer=tokenizer,
                            strip_accents='unicode',
                            lowercase=True,
                            stop_words=stop_words_lematized,
                            ngram_range=(1, 2),
                            min_df=10_000,
                            binary=True
)

In [9]:
text_vectorized = vectorizer.fit_transform(df['text'])

In [11]:
mask = text_vectorized.getnnz(axis=1) > 1
text_vectorized_cleared = text_vectorized[mask]
df_cleared = df[mask]
df_cleared.shape

(18344243, 3)

(18344243, 3)

In [12]:
text_vectorized.shape, text_vectorized_cleared.shape

((24993888, 2511), (18344243, 2511))

In [13]:
df_cleared.to_csv('./data/gp_comments/targets.csv', columns=['score', 'labels'])

In [14]:
vectorizer_params = {
    'vocabulary_': vectorizer.vocabulary_,
}
dump(vectorizer_params, f'./data/gp_comments/data_headers_{data_version}.sav')

['./data/gp_comments/data_headers_0.sav']

In [15]:
save_npz(f'./data/gp_comments/data_vectorized_{data_version}.npz', text_vectorized_cleared)