# Trying with pre-trained Word2Vec models

In [80]:
import os
os.chdir("/Users/billydodds/Documents/Projects/Transactions_ML")

import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold # type: ignore
from sklearn.preprocessing import StandardScaler # type: ignore



from components.scripts.load_data import load_data
from components.scripts.process_data import get_corpora, run_fold, scale
from components.scripts.process_data import NLP_distances



import multiprocess as multiprocessing # normal multiprocessing doesn't work in Jupyter for some reason

# from gensim.models import Word2Vec

# import random

# import itertools

In [65]:
path = "./components/private_files/"

# Load in data
data = load_data(path)
# data = data.drop(["description", "date", "desc_corpus"], axis=1)
data



Unnamed: 0,amount,description,category,date,weekday,desc_corpus,desc_features
0,50.0,direct credit 002962 citigroup ptyltd paddy walsh,wages,2020-07-27,0,direct direc dire dir credit credi cred cre ci...,direct credit citigroup ptyltd paddy walsh
1,-4.8,mcdonalds gladesville gladesville nswau,food,2020-07-27,0,mcdonalds mcdonald mcdonal mcdona mcdon mcdo m...,mcdonalds gladesville
2,-4.0,mcdonalds gladesville gladesville nswau,food,2020-07-27,0,mcdonalds mcdonald mcdonal mcdona mcdon mcdo m...,mcdonalds gladesville
3,50.0,transfer from margaret a jorgensen izzy tutoring,wages,2020-07-26,6,transfer transfe transf trans tran tra margare...,transfer margaret jorgensen izzy tutoring
4,-14.5,chargrill mosman mosman au,food,2020-07-25,5,chargrill chargril chargri chargr charg char c...,chargrill mosman
...,...,...,...,...,...,...,...
595,-36.0,liquorland 3638 glebe au aus card xx6725 value...,beers,2019-09-11,2,liquorland liquorlan liquorla liquorl liquor l...,liquorland glebe
596,-42.2,uberuae_eats sydney au aus card xx6725 value d...,food,2019-09-09,0,uberuae uberua uberu uber ube eats eat sydney ...,uberuae eats sydney
597,-13.0,eastern suburbs dist bellevue hill ns aus card...,beers,2019-09-06,4,eastern easter easte east eas suburbs suburb s...,eastern suburbs dist bellevue hill
598,19.0,return uberuae_eats sydney au aus card xx6725 ...,food,2019-09-09,0,return retur retu ret uberuae uberua uberu ube...,return uberuae eats sydney


In [10]:
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('wiki-news-300d-1M.vec') # , limit=999999

# model = FastText.load_fasttext_format('wiki-news-300d-1M.vec')

print(model.most_similar('teacher'))
# Output = [('headteacher', 0.8075869083404541), ('schoolteacher', 0.7955552339553833), ('teachers', 0.733420729637146), ('teaches', 0.6839243173599243), ('meacher', 0.6825737357139587), ('teach', 0.6285147070884705), ('taught', 0.6244685649871826), ('teaching', 0.6199781894683838), ('schoolmaster', 0.6037642955780029), ('lessons', 0.5812176465988159)]

print(model.similarity('teacher', 'teaches'))
# Output = 0.683924396754

[('teachers', 0.7878414988517761), ('educator', 0.7609370946884155), ('Teacher', 0.7341679930686951), ('student', 0.7071880102157593), ('pupil', 0.7035911083221436), ('schoolteacher', 0.7021132111549377), ('school', 0.6894515752792358), ('classroom', 0.6704409122467041), ('instructor', 0.6594052910804749), ('professor', 0.6570422649383545)]
0.3971077


In [19]:
model.similarity('transfer', 'food')

0.30142558

In [81]:

def get_similarity(corpus, desc):
    if len(corpus) == 0:
        return 100.0
    min_distances = []
    for feature in desc.split(" "):
        min_dist = np.inf
        for corp in corpus:                 # Find the minimum distance that each feature has to any corpus item (best possible match)
            try:
                dist = model.similarity(feature, corp)
            except:
                print(f"Something happened: feature={feature}, corpus word={corp}")
                pass
            if dist == 0.0:
                min_dist = dist
                break
            if dist < min_dist:
                min_dist = dist
        min_distances.append(min_dist)
    return np.mean(min_distances)


In [23]:
model.similarity('life', 'transport')

0.40054637

In [33]:
for cat in data.category.unique():
    data[cat+"_word2vec_dist"] = [get_similarity(words, cat) for words in data.desc_features]
    
data



Unnamed: 0,amount,category,weekday,desc_features,wages_word2vec_dist,food_word2vec_dist,transfer_word2vec_dist,life/wellbeing_word2vec_dist,shopping_word2vec_dist,transport_word2vec_dist,beers_word2vec_dist
0,50.0,wages,0,direct credit citigroup ptyltd paddy walsh,0.287817,0.291161,0.274149,0.191568,0.269021,0.273310,0.240262
1,-4.8,food,0,mcdonalds gladesville,0.383430,0.415638,0.106511,0.242880,0.300836,0.167045,0.339243
2,-4.0,food,0,mcdonalds gladesville,0.383430,0.415638,0.106511,0.242880,0.300836,0.167045,0.339243
3,50.0,wages,6,transfer margaret jorgensen izzy tutoring,0.282394,0.284726,0.390946,0.225050,0.298715,0.290110,0.236897
4,-14.5,food,5,chargrill mosman,0.229762,0.337490,0.232264,0.198551,0.210881,0.195119,0.339621
...,...,...,...,...,...,...,...,...,...,...,...
595,-36.0,beers,2,liquorland glebe,0.250473,0.191607,0.183644,0.223171,0.141857,0.127702,0.165788
596,-42.2,food,0,uberuae eats sydney,0.262487,0.361457,0.124382,0.264266,0.276971,0.231161,0.279078
597,-13.0,beers,4,eastern suburbs dist bellevue hill,0.316791,0.299100,0.234319,0.278599,0.313198,0.264674,0.295740
598,19.0,food,0,return uberuae eats sydney,0.291466,0.345420,0.267226,0.276531,0.287215,0.300569,0.264575


In [82]:
def run_fold_word2vec(X:pd.DataFrame, split, model, web_scrape=False, lookup=False, verbose=True, min_freq=0) -> float:
    
    
    train_index, test_index = split
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = X_train.category, X_test.category
    

    corpus = get_corpora(X_train, min_freq=min_freq)
    X_train = NLP_distances(X_train.drop('category', axis=1), corpus, desc_dist=get_similarity)
    X_test = NLP_distances(X_test.drop('category', axis=1), corpus, get_similarity)

    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scale(X_train, scaler)
    X_test = scale(X_test, scaler)
    
    
    model.fit(X_train, y_train)
    model_predictions = model.predict(X_test)
    predictions = model_predictions

    print("predictions: ", predictions) if verbose else None
    print("actual: ", np.array(y_test)) if verbose else None
    print("matches: ", predictions == np.array(y_test)) if verbose else None

    acc = sum(predictions == y_test)/len(predictions)

    if verbose and (web_scrape or lookup):
        print(f"corrections: {sum(corrections)} of {len(corrections)}")
        print(f"Baseline (just lookup without ML model): {sum(baseline)/len(baseline)}")
    
    print(acc) if verbose else None
    return acc

In [83]:
splitter = StratifiedKFold(n_splits=10, shuffle=True)
folds = list(splitter.split(data.drop("category", axis=1), data.category))
accuracies = []
def record(acc):
    accuracies.append(acc)

# pool = multiprocessing.Pool(min(multiprocessing.cpu_count(), 10))
# for fold in folds:
#     pool.apply_async(run_fold_word2vec, args=(data, fold, DecisionTreeClassifier, False, False, True), callback = record)
# pool.close()
# pool.join()

run_fold_word2vec(data.drop("desc_features", axis=1), folds[1], DecisionTreeClassifier(), False, False, True)



TypeError: NLP_distances() got an unexpected keyword argument 'desc_dist'