In [1]:
import pandas as pd
import gensim
from gensim.models import Word2Vec, KeyedVectors
import nltk
from sklearn.neural_network import MLPClassifier
import numpy as np
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Fetch the Punkt tokenizer models that nltk.word_tokenize (used in do_stuff) relies on.
# NLTK reports the package as up-to-date and skips the download when it is already installed.
nltk.download('punkt')


[nltk_data] Downloading package punkt to C:\Users\Todd
[nltk_data]     Howard\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from IPython.display import clear_output

def _embed_documents(sentences, model):
    """Return one mean-pooled word-vector embedding per tokenised document.

    Tokens that are not in ``model`` contribute a zero vector, so they still
    count towards the mean. A document with no tokens at all is mapped to the
    zero vector.
    """
    # Built once and reused for every out-of-vocabulary token and empty document.
    zero_vec = np.zeros(model.vector_size, dtype=model.vectors.dtype)

    doc_vectors = []
    for tokens in sentences:
        if not tokens:
            # np.average over an empty list returns a NaN scalar, which would
            # make the feature matrix ragged; use the zero vector instead.
            doc_vectors.append(zero_vec)
            continue
        token_vectors = [model[tok] if tok in model else zero_vec for tok in tokens]
        doc_vectors.append(np.average(np.array(token_vectors), axis=0))

    return np.array(doc_vectors)


def _average_reports(reports):
    """Average each metric of the per-fold classification-report dicts across folds."""
    averaged = {}
    for label in reports[0]:
        per_fold = [report[label] for report in reports]
        averaged[label] = {
            metric: sum(fold[metric] for fold in per_fold) / len(per_fold)
            for metric in per_fold[0]
        }
    return averaged


def do_stuff(embeds_path, df_path, n_splits=5, shuffle=False, random_state=None):
    """Cross-validate an MLP category classifier on averaged word embeddings.

    Reads a ``;``-separated CSV, lower-cases and tokenises its ``text_content``
    column, represents each row as the mean of its word vectors, and trains a
    fresh ``MLPClassifier`` per K-fold split to predict the one-hot encoded
    ``category`` column. The per-fold classification reports are averaged.

    Parameters
    ----------
    embeds_path : str
        Path passed to ``KeyedVectors.load``.
    df_path : str
        Path to the CSV file; it must contain ``text_content`` and
        ``category`` columns.
    n_splits : int, default 5
        Number of cross-validation folds.
    shuffle : bool, default False
        Whether ``KFold`` shuffles the rows before splitting. With the default,
        folds are contiguous slices in file order.
    random_state : int or None, default None
        Seed for the shuffle; ignored when ``shuffle`` is False.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the classification report, one column per metric,
        each value being the mean over all folds.

    Side effects
    ------------
    Clears the current cell output via ``IPython.display.clear_output``.
    """
    df = pd.read_csv(df_path, sep=';')

    # Missing text is treated as an empty document instead of crashing on .lower().
    texts = df['text_content'].fillna('').astype(str).str.lower()
    sentences = [nltk.word_tokenize(text) for text in texts]
    model = KeyedVectors.load(embeds_path)

    enc = OneHotEncoder(handle_unknown='ignore')
    labels = [[category] for category in df['category']]
    y = enc.fit_transform(labels).toarray()

    X = _embed_documents(sentences, model)

    # KFold rejects a random_state when shuffle is off, so only forward it when it is used.
    kf = KFold(n_splits=n_splits, shuffle=shuffle,
               random_state=random_state if shuffle else None)

    # NOTE(review): y is a one-hot indicator matrix, so MLPClassifier is fitted as a
    # multilabel model and may predict zero or several categories for a row --
    # confirm this is intended for a single-category task.
    reports = []
    for train_index, test_index in kf.split(X):
        clf_kfold = MLPClassifier(random_state=1, max_iter=300).fit(X[train_index], y[train_index])
        reports.append(classification_report(
            y[test_index], clf_kfold.predict(X[test_index]),
            target_names=enc.categories_[0], output_dict=True))

    # Drop whatever was printed while the folds were training so only the result remains.
    clear_output(wait=True)

    return pd.DataFrame(_average_reports(reports)).transpose()


In [3]:
# Evaluate the vectors stored in data/vectors_cbow_all.kv, save the fold-averaged
# report to outs/cbow_all.csv, and display it as the cell result.
out = do_stuff(
    embeds_path='data/vectors_cbow_all.kv',
    df_path='data/final_recipes.csv',
)
out.to_csv('outs/cbow_all.csv')
out

Unnamed: 0,precision,recall,f1-score,support
ciasta i torty,0.687935,0.762193,0.716358,771.4
ciasteczka,0.689758,0.646608,0.649338,456.0
desery,0.668948,0.576279,0.593203,788.8
grill,0.612119,0.43413,0.501428,147.8
inne,0.021765,0.0062,0.009544,97.8
napoje i koktajle,0.810221,0.678403,0.73666,137.4
obiady,0.738713,0.70145,0.717657,1137.8
pieczywo,0.330952,0.104944,0.159356,34.0
przekąski,0.493066,0.328861,0.388433,480.4
przetwory,0.756796,0.712045,0.729823,234.4


In [4]:
# Evaluate the vectors stored in data/vectors_cbow_subject.kv, save the fold-averaged
# report to outs/cbow_subject.csv, and display it as the cell result.
out = do_stuff(
    embeds_path='data/vectors_cbow_subject.kv',
    df_path='data/final_recipes.csv',
)
out.to_csv('outs/cbow_subject.csv')
out

Unnamed: 0,precision,recall,f1-score,support
ciasta i torty,0.695972,0.787411,0.727873,771.4
ciasteczka,0.717968,0.653569,0.669768,456.0
desery,0.652248,0.574387,0.593041,788.8
grill,0.596016,0.512476,0.547712,147.8
inne,0.069874,0.024191,0.029514,97.8
napoje i koktajle,0.769512,0.676761,0.714166,137.4
obiady,0.72871,0.689979,0.708099,1137.8
pieczywo,0.261538,0.063397,0.101165,34.0
przekąski,0.490658,0.33936,0.399653,480.4
przetwory,0.737825,0.727732,0.731199,234.4


In [5]:
# Evaluate the vectors stored in data/vectors_skipgram_all.kv, save the fold-averaged
# report to outs/skipgram_all.csv, and display it as the cell result.
out = do_stuff(
    embeds_path='data/vectors_skipgram_all.kv',
    df_path='data/final_recipes.csv',
)
out.to_csv('outs/skipgram_all.csv')
out

Unnamed: 0,precision,recall,f1-score,support
ciasta i torty,0.7087,0.802133,0.736639,771.4
ciasteczka,0.750285,0.627318,0.668296,456.0
desery,0.684128,0.575743,0.599522,788.8
grill,0.635343,0.480448,0.539541,147.8
inne,0.0,0.0,0.0,97.8
napoje i koktajle,0.830079,0.686393,0.747089,137.4
obiady,0.756321,0.709635,0.731542,1137.8
pieczywo,0.29,0.114673,0.161828,34.0
przekąski,0.582919,0.24602,0.341196,480.4
przetwory,0.778066,0.721716,0.748192,234.4


In [6]:
# Evaluate the vectors stored in data/vectors_skipgram_subject.kv, save the fold-averaged
# report to outs/skipgram_subject.csv, and display it as the cell result.
out = do_stuff(
    embeds_path='data/vectors_skipgram_subject.kv',
    df_path='data/final_recipes.csv',
)
out.to_csv('outs/skipgram_subject.csv')
out

Unnamed: 0,precision,recall,f1-score,support
ciasta i torty,0.719686,0.793263,0.739459,771.4
ciasteczka,0.740549,0.640115,0.670301,456.0
desery,0.685484,0.559041,0.592038,788.8
grill,0.667276,0.516634,0.57868,147.8
inne,0.0,0.0,0.0,97.8
napoje i koktajle,0.815301,0.698432,0.748171,137.4
obiady,0.778628,0.714743,0.744923,1137.8
pieczywo,0.296904,0.107097,0.15409,34.0
przekąski,0.586911,0.292254,0.387434,480.4
przetwory,0.775538,0.722395,0.747767,234.4
