In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sys import getsizeof
from time import time

def mb_size_str(obj: object, name: str):
    return f"{name} size is {round(getsizeof(obj) / pow(1024, 2), 2)}MB"

import gensim
import logging

logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger()
logger.setLevel(logging.INFO)

train_file = "train.csv"
train_df = pd.read_csv(train_file)
INPUT_DATASET_SIZE = train_df.shape[0]
b_time = time()

## Preproc

In [10]:
cfg = {
    'TF_IDF': True,
    'LEMMATIZE': False,
    'DO_STEMMING': False,
    'STOPLIST': True,
    'LOWFREQ_FILTER': True,
    'LOWFREQ_TRESHOLD': 50,
    'TEST_RATIO': 0.2, # ratio of train samples that will go as test ones
    'MODEL': "6 Gensim Continuous Skipgram",
}

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    train_df['text'].to_numpy(),
    train_df['answer'].to_numpy(),
    test_size=cfg['TEST_RATIO']
)

In [11]:
from tqdm import tqdm

# # # # # # # # # # # # # # # # # # # 
#           Preprocessing           #
# # # # # # # # # # # # # # # # # # # 

import src.preprocessing as prep

freq_dict = {}
token_amount_counter_1 = [0]
token_amount_counter_2 = [0]

print("Preprocessing train...")
for i in tqdm(range(len(X_train_raw))):
    X_train_raw[i] = prep.preprocess(
        X_train_raw[i], cfg, freq_dict, token_amount_counter_1
    )

print("Preprocessing test...")
for i in tqdm(range(len(X_test_raw))):
    X_test_raw[i] = prep.preprocess(
        X_test_raw[i], cfg, freq_dict, token_amount_counter_2
    )

Preprocessing train...


100%|██████████| 20000/20000 [00:09<00:00, 2083.91it/s]


Preprocessing test...


100%|██████████| 5000/5000 [00:02<00:00, 1967.21it/s]


In [12]:
if cfg['LOWFREQ_FILTER']:
    forms_to_remove = set(w for (w, freq) in freq_dict.items() if freq <= cfg['LOWFREQ_TRESHOLD'])
    new_token_amount_counter = [0]
    print(f"Removing low freq tokens (freq <= {cfg['LOWFREQ_TRESHOLD']})...")
    print(f"Total forms: {len(freq_dict.items())}")
    print(f"Total forms to be removed {len(forms_to_remove)}")
    for i in tqdm(range(len(X_train_raw))):
        X_train_raw[i] = prep.remove_lowfreq(X_train_raw[i], forms_to_remove, new_token_amount_counter)
    shrink_percent = round(100 * (token_amount_counter_1[0] - new_token_amount_counter[0]) / token_amount_counter_1[0], 2)
    print(f"Total token amount:\nold {token_amount_counter_1[0]}\nnew {new_token_amount_counter[0]}\n{shrink_percent}% total less")

    print(f"\n\nRemoving train's low freq tokens from test")
    new_token_amount_counter = [0]
    for i in tqdm(range(len(X_test_raw))):
        X_test_raw[i] = prep.remove_lowfreq(X_test_raw[i], forms_to_remove, new_token_amount_counter)
    print(f"Total token amount:\nold {token_amount_counter_2[0]}\nnew {new_token_amount_counter[0]}\n{shrink_percent}% total less")

Removing low freq tokens (freq <= 50)...
Total forms: 144612
Total forms to be removed 137870


100%|██████████| 20000/20000 [00:01<00:00, 18889.19it/s]


Total token amount:
old 2410691
new 1992046
17.37% total less


Removing train's low freq tokens from test


100%|██████████| 5000/5000 [00:00<00:00, 17701.47it/s]

Total token amount:
old 602770
new 497574
17.37% total less





### TfIdf

In [15]:
from src.tfidf import get_matrix
tfidf, matrix = get_matrix(X_train_raw) if cfg['TF_IDF'] else (None, None)

INFO : TF-IDF matrix...
INFO : Матрица на 20000 документов и 6542 термов


### Word Vectors

In [16]:
# LOAD THE MODEL
from nltk.tokenize import word_tokenize
w2v_file = f"{cfg['MODEL']}/model.bin"
model: gensim.models.keyedvectors.KeyedVectors = gensim.models.KeyedVectors.load_word2vec_format(w2v_file, binary=True)
VECTOR_SIZE = model.vector_size

INFO : loading projection weights from 6 Gensim Continuous Skipgram/model.bin
INFO : KeyedVectors lifecycle event {'msg': 'loaded (302866, 300) matrix of type float32 from 6 Gensim Continuous Skipgram/model.bin', 'binary': True, 'encoding': 'utf8', 'datetime': '2023-02-14T23:02:48.130037', 'gensim': '4.3.0', 'python': '3.10.9 (main, Dec 19 2022, 17:35:49) [GCC 12.2.0]', 'platform': 'Linux-6.1.11-1-MANJARO-x86_64-with-glibc2.37', 'event': 'load_word2vec_format'}


In [17]:
from src.vectorizers import get_tfidf_vector, mean_w2v_vectors

print("Calculating mean review w2v vectors")
print("For train...")
X_train_w2v = mean_w2v_vectors(tqdm(X_train_raw), model)
print("For test...")
X_test_w2v = mean_w2v_vectors(tqdm(X_test_raw), model)

Calculating mean review w2v vectors
For train...


100%|██████████| 20000/20000 [00:26<00:00, 744.12it/s]


For test...


100%|██████████| 5000/5000 [00:06<00:00, 781.09it/s]


In [78]:
import src.vectorizers as vecs
print("Concatenating train's tfidf and w2v vectors for each text")
X_train_fullvec = vecs.concat_w2v(X_train_w2v, matrix)

print("Forming test's tfidf vectors")
tfidf_vecs = tfidf.transform(X_test_raw)
print("Concatenating test's tfidf and w2v vectors for each text")
X_test_fullvec = vecs.concat_w2v(X_test_w2v, tfidf_vecs)

Concatenating train's tfidf and w2v vectors for each text


100%|██████████| 20000/20000 [00:41<00:00, 476.96it/s]


Forming test's tfidf vectors
Concatenating test's tfidf and w2v vectors for each text


100%|██████████| 5000/5000 [00:09<00:00, 501.75it/s]


## Train

In [79]:
from sklearn.linear_model import LogisticRegression
from pprint import pprint

reg : LogisticRegression = LogisticRegression(max_iter=10_000).fit(X_train_fullvec, y_train)

diff = time() - b_time

results = {}
results.update(cfg)
print(f"""
    Score on
      - train  data : {reg.score(X_train_fullvec, y_train)}
      - test   data : {reg.score(X_test_fullvec, y_test)}
    Test/train size : {cfg['TEST_RATIO']}
    Total time      : {"{:.0f}m {:.0f}s".format(*divmod(diff, 60))}
""")


    Score on
      - train  data : 0.91635
      - test   data : 0.8786
    Test/train size : 0.2
    Total time      : 32m 13s



#  Testing

In [89]:
test_file = "test.csv"
test_df = pd.read_csv(test_file)
X_result_raw = train_df['text'].to_numpy()

token_amount_counter_3 = [0]

print("Preprocessing result...")
for i in tqdm(range(len(X_result_raw))):
    X_result_raw[i] = prep.preprocess(
        X_result_raw[i], cfg, freq_dict, token_amount_counter_1
    )

Preprocessing result...


100%|██████████| 25000/25000 [00:06<00:00, 3932.65it/s]


In [90]:
print(f"\n\nRemoving train's low freq tokens from result")
new_token_amount_counter = [0]
for i in tqdm(range(len(X_result_raw))):
    X_result_raw[i] = prep.remove_lowfreq(X_result_raw[i], forms_to_remove, new_token_amount_counter)
print(f"Total token amount:\nold {token_amount_counter_3[0]}\nnew {new_token_amount_counter[0]}\n{shrink_percent}% total less")



Removing train's low freq tokens from result


100%|██████████| 25000/25000 [00:01<00:00, 20756.76it/s]

Total token amount:
old 0
new 2489620
17.37% total less





In [91]:
print("Calculating mean review w2v vectors")
print("For result...")
X_result_w2v = mean_w2v_vectors(tqdm(X_result_raw), model)

Calculating mean review w2v vectors
For result...


100%|██████████| 25000/25000 [00:34<00:00, 715.84it/s]


In [92]:
print("Forming test's tfidf vectors")
tfidf_vecs = tfidf.transform(X_result_raw)
print("Concatenating test's tfidf and w2v vectors for each text")
X_test_fullvec = vecs.concat_w2v(X_result_w2v, tfidf_vecs)

Forming test's tfidf vectors
Concatenating test's tfidf and w2v vectors for each text


100%|██████████| 25000/25000 [00:53<00:00, 471.32it/s]
