In [4]:
import pandas as pd
import gensim
from gensim.models import Word2Vec, KeyedVectors
import nltk
from sklearn.neural_network import MLPClassifier
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import gensim.downloader as api
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from sklearn import preprocessing as pre
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report

# Fetch the NLTK 'punkt' tokenizer data (reported as already up-to-date when it
# is installed); presumably required by the nltk.word_tokenize calls made later
# in this notebook.
nltk.download('punkt')


[nltk_data] Downloading package punkt to C:\Users\Todd
[nltk_data]     Howard\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
from IPython.display import clear_output

def _tfidf_weighted_doc_embedding(words, bow, dct, model_tfidf, model):
    """Embed one tokenised document as the mean of its TF-IDF-scaled word vectors.

    Parameters
    ----------
    words : list of str
        Tokens of the document in order (a repeated token appears repeatedly).
    bow : list of (int, int)
        Bag-of-words form of the same document, as returned by ``dct.doc2bow``.
    dct : gensim.corpora.Dictionary
        Dictionary mapping every token in ``words`` to its integer id.
    model_tfidf : gensim.models.TfidfModel
        Fitted TF-IDF model used to weight the tokens.
    model : gensim.models.KeyedVectors
        Word embeddings; tokens missing from it contribute a zero vector.

    Returns
    -------
    numpy.ndarray
        One vector of length ``model.vector_size``.
    """
    zero_vector = np.zeros(model.vector_size)

    if not words:
        # MinMaxScaler refuses an empty array and the mean of nothing is NaN,
        # so a document without tokens is represented by the zero vector.
        return zero_vector

    weights_by_id = dict(model_tfidf[bow])

    # TfidfModel leaves out terms whose weight is (almost) zero, e.g. a token
    # that occurs in every document, so a plain lookup could raise KeyError.
    words_weights = np.array(
        [weights_by_id.get(dct.token2id[word], 0.0) for word in words]
    )

    # Rescale the weights to [0, 1] within this document. The lowest-weighted
    # token therefore ends up with weight 0 and contributes nothing.
    words_weights = pre.MinMaxScaler().fit_transform(words_weights.reshape(-1, 1))

    words_embeds = np.array(
        [model[word] if word in model else zero_vector for word in words]
    )

    # (n_tokens, dim) * (n_tokens, 1): scale each token vector by its weight,
    # then take the plain mean over tokens.
    return np.average(words_embeds * words_weights, axis=0)


def _average_fold_reports(reports):
    """Average a list of per-fold ``classification_report`` dicts entry by entry."""
    averaged = dict()
    for label in reports[0].keys():
        fold_rows = [report[label] for report in reports]
        averaged[label] = {
            metric: sum(row[metric] for row in fold_rows) / len(fold_rows)
            for metric in fold_rows[0].keys()
        }
    return averaged


def do_stuff(embeds_path, df_path):
    """Cross-validate an MLP on TF-IDF-weighted averaged word embeddings.

    Reads a ';'-separated CSV with ``text_content`` and ``category`` columns,
    lower-cases and tokenises each text, turns every document into the mean of
    its TF-IDF-scaled word vectors, and evaluates an ``MLPClassifier`` with
    5-fold cross-validation. Clears the current cell output before returning.

    Parameters
    ----------
    embeds_path : str
        Path to a saved gensim ``KeyedVectors`` file.
    df_path : str
        Path to the CSV file with the documents and their categories.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the classification report, one column per metric,
        each value averaged over the five folds.
    """
    df = pd.read_csv(df_path, sep=';')
    sentences = [nltk.word_tokenize(text.lower()) for text in df['text_content'].values]
    model = KeyedVectors.load(embeds_path)

    # One-hot encode the categories.
    # NOTE(review): y is a 2-D indicator matrix, so scikit-learn presumably
    # trains and scores this as a multilabel problem - confirm that is intended.
    enc = OneHotEncoder(handle_unknown='ignore')
    y = enc.fit_transform([[category] for category in df['category']]).toarray()

    dct = Dictionary(sentences)  # fit dictionary
    corpus = [dct.doc2bow(line) for line in sentences]  # convert corpus to BoW format
    model_tfidf = TfidfModel(corpus, dictionary=dct)  # fit model

    X = np.array([
        _tfidf_weighted_doc_embedding(words, bow, dct, model_tfidf, model)
        for words, bow in zip(sentences, corpus)
    ])

    # KFold is used without shuffling, so the folds follow the CSV row order.
    reports = []
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(X):
        clf_kfold = MLPClassifier(random_state=1, max_iter=300).fit(X[train_index], y[train_index])
        reports.append(classification_report(
            y[test_index], clf_kfold.predict(X[test_index]),
            target_names=enc.categories_[0], output_dict=True))

    total_result = _average_fold_reports(reports)

    # Drop any warnings/progress printed during training from the cell output.
    clear_output(wait=True)

    return pd.DataFrame(total_result).transpose()

In [6]:
# Evaluate the vectors stored in 'data/vectors_cbow_all.kv' on the recipes CSV,
# save the fold-averaged report to 'outs/tfidf_cbow_all.csv', then show it as
# the cell output (the bare `out` on the last line).
out = do_stuff('data/vectors_cbow_all.kv',  'data/final_recipes.csv')
out.to_csv('outs/tfidf_cbow_all.csv')
out

Unnamed: 0,precision,recall,f1-score,support
ciasta i torty,0.702012,0.761103,0.71808,771.4
ciasteczka,0.709997,0.610316,0.644631,456.0
desery,0.644876,0.528062,0.558148,788.8
grill,0.614171,0.433705,0.502153,147.8
inne,0.221053,0.010519,0.018713,97.8
napoje i koktajle,0.774143,0.670706,0.715064,137.4
obiady,0.74129,0.678706,0.707633,1137.8
pieczywo,0.272222,0.098804,0.142857,34.0
przekąski,0.516865,0.262162,0.345961,480.4
przetwory,0.747466,0.691239,0.715751,234.4


In [7]:
# Evaluate the vectors stored in 'data/vectors_cbow_subject.kv' on the recipes
# CSV, save the fold-averaged report to 'outs/tfidf_cbow_subject.csv', then show
# it as the cell output (the bare `out` on the last line).
out = do_stuff('data/vectors_cbow_subject.kv',  'data/final_recipes.csv')
out.to_csv('outs/tfidf_cbow_subject.csv')
out

Unnamed: 0,precision,recall,f1-score,support
ciasta i torty,0.713704,0.763193,0.726659,771.4
ciasteczka,0.707732,0.613223,0.640547,456.0
desery,0.652005,0.535649,0.568646,788.8
grill,0.562499,0.441074,0.488682,147.8
inne,0.15,0.002632,0.005172,97.8
napoje i koktajle,0.773775,0.680443,0.716333,137.4
obiady,0.73006,0.693238,0.710524,1137.8
pieczywo,0.254054,0.127432,0.15574,34.0
przekąski,0.478586,0.273598,0.346006,480.4
przetwory,0.727491,0.650664,0.686029,234.4


In [8]:
# Evaluate the vectors stored in 'data/vectors_skipgram_all.kv' on the recipes
# CSV, save the fold-averaged report to 'outs/tfidf_skipgram_all.csv', then show
# it as the cell output (the bare `out` on the last line).
out = do_stuff('data/vectors_skipgram_all.kv',  'data/final_recipes.csv')
out.to_csv('outs/tfidf_skipgram_all.csv')
out

Unnamed: 0,precision,recall,f1-score,support
ciasta i torty,0.707271,0.745329,0.707246,771.4
ciasteczka,0.735225,0.596335,0.646636,456.0
desery,0.65483,0.538705,0.564571,788.8
grill,0.603853,0.437081,0.501566,147.8
inne,0.0,0.0,0.0,97.8
napoje i koktajle,0.809149,0.647023,0.715837,137.4
obiady,0.742122,0.679648,0.708595,1137.8
pieczywo,0.256322,0.098724,0.128894,34.0
przekąski,0.546502,0.194489,0.283119,480.4
przetwory,0.7644,0.687046,0.722803,234.4


In [9]:
# Evaluate the vectors stored in 'data/vectors_skipgram_subject.kv' on the
# recipes CSV, save the fold-averaged report to
# 'outs/tfidf_skipgram_subject.csv', then show it as the cell output (the bare
# `out` on the last line).
out = do_stuff('data/vectors_skipgram_subject.kv',  'data/final_recipes.csv')
out.to_csv('outs/tfidf_skipgram_subject.csv')
out

Unnamed: 0,precision,recall,f1-score,support
ciasta i torty,0.717638,0.752302,0.718465,771.4
ciasteczka,0.727536,0.619434,0.657492,456.0
desery,0.665136,0.546897,0.575354,788.8
grill,0.596506,0.447148,0.505705,147.8
inne,0.0,0.0,0.0,97.8
napoje i koktajle,0.804449,0.652364,0.713547,137.4
obiady,0.751945,0.677054,0.71038,1137.8
pieczywo,0.25697,0.114514,0.141096,34.0
przekąski,0.583133,0.189621,0.281786,480.4
przetwory,0.764822,0.686346,0.722392,234.4
