In [1]:
import pandas as pd
import numpy as np
from random import random
from sys import getsizeof
from time import time

def mb_size_str(obj: object, name: str):
    return f"{name} size is {round(getsizeof(obj) / pow(1024, 2), 2)}MB"

import gensim
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords as StopWords

stop_words = set(StopWords.words('english'))
lemmatizer = WordNetLemmatizer()

import logging

logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger()
logger.setLevel(logging.INFO)

train_file = "train.csv"
train_df = pd.read_csv(train_file)
TRAIN_DATASET_SIZE = train_df.shape[0]
b_time = time()

## Preproc

In [2]:
cfg = {
    'DO_TFIDF_WEIGHTING': False,
    'DO_LEMMATIZING': False,
    'DO_STEMMING': False,
    'DO_FILTER_STOPWORDS': True,
    'DO_FILTER_LOW_FREQ': True,
    'LOWFREQ_TRESHOLD': 3,
    'MODEL': "6 Gensim Continuous Skipgram",
}

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse._csr import csr_matrix
import re
from tqdm import tqdm

word_freq = {}
total_tokens = 0

def token_form_handlers(tokens: list[str]) -> None:
    """These are done in place"""
    if not (cfg['DO_LEMMATIZING'] or cfg['DO_STEMMING']): return

    for i in range(len(tokens)):
        if cfg['DO_LEMMATIZING']:
            tokens[i] = lemmatizer.lemmatize(tokens[i])
        if cfg['DO_STEMMING']: 
            result = wn.morphy(tokens[i])
            if result:
                tokens[i] = result

def stopword_filtering(tokens: list[str]) -> list[str]:
    if not cfg['DO_FILTER_STOPWORDS']: return tokens
    return [tkn for tkn in tokens if not tkn in stop_words]

def count_words(tokens: list[str]) -> None:
    global total_tokens
    for t in tokens:
        total_tokens += 1
        if t in word_freq: word_freq[t] += 1
        else: word_freq[t] = 1

def preproc(text: str) -> str:
    # regex removes all enclosed in <> (html tags) and any non letter non space characters
    # then collapses all space characters to a single space
    text = re.sub(f'\s+', ' ', re.sub(r'(<[^>]*>)|[^\w\s]', '', text.lower()))
    tokens = text.split(' ')
    token_form_handlers(tokens)
    tokens = stopword_filtering(tokens)
    count_words(tokens)
    return ' '.join(tokens)

# # # # # # # # # # # # # # # # # # # 
#           Preprocessing           #
# # # # # # # # # # # # # # # # # # # 
print("Preprocessing...")
for row_id in tqdm(range(TRAIN_DATASET_SIZE)):
    train_df.loc[row_id, "text"] = preproc(train_df.loc[row_id, "text"])

Preprocessing...


100%|██████████| 25000/25000 [00:27<00:00, 894.39it/s] 


In [4]:
forms_to_remove = set(w for (w, freq) in word_freq.items() if freq <= cfg['LOWFREQ_TRESHOLD'])

new_total_tokens = 0
def remove_tokens(text: str) -> str:
    global new_total_tokens
    new_tokens = [tkn for tkn in text.split() if not tkn in forms_to_remove]
    new_total_tokens += len(new_tokens)
    return ' '.join(new_tokens)

print(f"Removing low freq tokens (freq <= {cfg['LOWFREQ_TRESHOLD']})...")
print(f"Total forms: {len(word_freq.items())}")
print(f"Total forms to be removed {len(forms_to_remove)}")
texts = []
for row_id in tqdm(range(TRAIN_DATASET_SIZE)):
    train_df.loc[row_id, "text"] = remove_tokens(train_df.loc[row_id, "text"])
    texts.append(train_df.loc[row_id, "text"])
shrink_percent = round(100 * (total_tokens - new_total_tokens) / total_tokens, 2)
print(f"Total token amount:\n\told {total_tokens}\n\tnew {new_total_tokens}\n{shrink_percent}% total less")

Removing low freq tokens (freq <= 3)...
Total forms: 144613
Total forms to be removed 108752


100%|██████████| 25000/25000 [00:12<00:00, 2076.65it/s]

Total token amount:
	old 3014192
	new 2875149
4.61% total less





In [5]:
train_df.to_csv(f"train_cleaned.csv")

### TfIdf

In [6]:
if cfg['DO_TFIDF_WEIGHTING']:
    tfidf = TfidfVectorizer(stop_words="english")
    logger.info("TF-IDF matrix...")
    matrix: csr_matrix = tfidf.fit_transform(texts)
    print(f"Матрица на {matrix.shape[0]} документов и {matrix.shape[1]} термов")
else:
    print(f"DO_TFIDF_WEIGHTING = {cfg['DO_TFIDF_WEIGHTING']}")

DO_TFIDF_WEIGHTING = False


### Word Vectors

In [7]:
# LOAD THE MODEL
from nltk.tokenize import word_tokenize
w2v_file = f"{cfg['MODEL']}/model.bin"
model: gensim.models.keyedvectors.KeyedVectors = gensim.models.KeyedVectors.load_word2vec_format(w2v_file, binary=True)
VECTOR_SIZE = model.vector_size

INFO : loading projection weights from 6 Gensim Continuous Skipgram/model.bin
INFO : KeyedVectors lifecycle event {'msg': 'loaded (302866, 300) matrix of type float32 from 6 Gensim Continuous Skipgram/model.bin', 'binary': True, 'encoding': 'utf8', 'datetime': '2023-02-12T18:15:02.210760', 'gensim': '4.3.0', 'python': '3.10.9 (main, Dec 19 2022, 17:35:49) [GCC 12.2.0]', 'platform': 'Linux-6.1.11-1-MANJARO-x86_64-with-glibc2.37', 'event': 'load_word2vec_format'}


In [8]:
def get_tfidf_vector(tokens: list[str], text_id: int) -> np.ndarray:
    tfidf_vector = np.ndarray(len(tokens))
    # a vector with all the terms
    text_raw_vector = matrix[text_id, :]
    # leaving only terms that are present in the text
    for i in range(len(tokens)):
        idx = tfidf.vocabulary_.get(tokens[i])
        val = matrix[text_id, idx] if idx else 0
        tfidf_vector[i] = val
    # vector with tfidf values of each word in the text
    return tfidf_vector

def text_vector(text: str, text_id: int) -> np.ndarray:
    """Compute the normalized weighted mean w2v vector for a given text"""

    tokens = [token for token in word_tokenize(text) if token in model]
    # Compute the word2vec vectors for each word in the text
    vectors = [model.get_vector(tkn) for tkn in tokens]
    if cfg['DO_TFIDF_WEIGHTING']:
        # Compute the tf-idf values for each word in the text.
        tfidf_vals = get_tfidf_vector(tokens, text_id)
        # Compute the weighted vectors by multiplying the word2vec vectors by the tf-idf values
        weighted_vecs = np.array([vec * factor for vec, factor in zip(vectors, tfidf_vals)])
    else:
        weighted_vecs = np.array(vectors)

    text_sum_vec = np.sum(weighted_vecs, axis=0)
    norm_vec = text_sum_vec / np.linalg.norm(text_sum_vec)
    
    return norm_vec

review_vectors = np.empty((TRAIN_DATASET_SIZE, VECTOR_SIZE), dtype=float)
print(review_vectors.shape)
print("Calculating mean review vectors...")
for row_id in tqdm(range(TRAIN_DATASET_SIZE)):
    review_vectors[row_id] = text_vector(train_df.loc[row_id, "text"], row_id)

(25000, 300)
Calculating mean review vectors...


100%|██████████| 25000/25000 [00:51<00:00, 488.15it/s]


In [9]:
# trying not to run out of memory on my potatoe (╥﹏╥)
print(getsizeof(review_vectors) / pow(1024, 2), "MB")
#model = None

57.2205810546875 MB


## Train

In [10]:
answers = train_df['answer'].to_numpy()
print(review_vectors.shape)
print(answers.shape)

(25000, 300)
(25000,)


In [11]:
from sklearn.linear_model import LogisticRegression
from pprint import pprint
from datetime import timedelta as td
from datetime import datetime as dt

reg = LogisticRegression().fit(review_vectors, answers)

diff = time() - b_time

results = {}
results.update(cfg)
results["score"] = reg.score(review_vectors, answers)
results["time"] = "{:.0f}m {:.0f}s".format(*divmod(diff, 60))
pprint(results)
# 0.8396 - stopwords only

{'DO_FILTER_LOW_FREQ': True,
 'DO_FILTER_STOPWORDS': True,
 'DO_LEMMATIZING': False,
 'DO_STEMMING': False,
 'DO_TFIDF_WEIGHTING': False,
 'LOWFREQ_TRESHOLD': 3,
 'MODEL': '6 Gensim Continuous Skipgram',
 'score': 0.83968,
 'time': '1m 43s'}


#  Testing

In [12]:
def get_vector(text: str) -> np.ndarray:
    tokens = [token for token in word_tokenize(text) if token in model]
    # Compute the word2vec vectors for each word in the text
    vectors = np.array([model.get_vector(tkn) for tkn in tokens])
    sum_vector = np.sum(vectors, axis=0)
    norm_vector = sum_vector / np.linalg.norm(sum_vector)
    return norm_vector

test_file = "test.csv"
test_df = pd.read_csv(test_file)
TEST_DATASET_SIZE = test_df.shape[0]

# # # # # # # # # # # # # # # # # # # 
#             Processing            #
# # # # # # # # # # # # # # # # # # # 
print("Processing...")
result = test_df[["id"]].copy()
for row_id in tqdm(range(TEST_DATASET_SIZE)):
    text = test_df.loc[row_id, "text"]
    text = preproc(text)
    text = remove_tokens(text)
    test_df.loc[row_id, "text"] = text
    result.loc[row_id, "answer"] = int(reg.predict([get_vector(text)])[0])
test_df.to_csv(f"test_cleaned.csv", index=False)
result["answer"] = result["answer"].astype(int)
result.to_csv(f"result.csv", index=False)

Processing...


100%|██████████| 25000/25000 [01:54<00:00, 219.02it/s]
