In [1]:
import os
import pickle
import re
from string import punctuation
from stop_words import get_stop_words
from collections import defaultdict, Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets

import nltk

from gensim.models import Word2Vec, Doc2Vec, FastText
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR, SVR
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')



In [2]:
display(os.listdir('saved_models/'))

exp_features = {
    'w2v_d2v_kwords_lemmatize': ['Kód projektu', 'Klíčová slova'],
    'd2v_w2v_kwords_nolemma': ['Kód projektu', 'Klíčová slova'],
    'ud_pipe_czech_text': ['Kód projektu', 'Název česky', 'Anotace česky'],
    'ud_pipe_nazev_anotace_stopwords': ['Kód projektu', 'Název česky', 'Anotace česky'],
    'tf_w2v_d2v_fast_eng_n_a_kwords_2': ['Kód projektu', 'Název anglicky', 'Anotace anglicky', 'Klíčová slova'],
    'tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_res_pos_2': ['Kód projektu' ,'Název česky', 'Anotace česky', 'Hlavní CEP obor', 'Podrobné informace o účastnících', 'Hlavní řešitelé', 'Poskytovatel']
}

['w2v_d2v_kwords_lemmatize',
 'tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_res_pos_2',
 'ud_pipe_czech_text',
 'tf_w2v_d2v_fast_eng_n_a_kwords_2',
 'tf_w2v_d2v_fast_eng_n_a_kwords',
 'tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_res_pos',
 'ud_pipe_nazev_anotace_stopwords',
 'd2v_w2v_kwords_nolemma']

In [3]:
def load_data_for_exp(exp_name):
    orig_df = pd.read_csv('../data/TACR_Starfos_isvav_project.csv')
    df = orig_df[exp_features[exp_name]]
    df = df[~df.isna().any(axis=1)]
    orig_df = orig_df.iloc[df.index]
    df = df.reset_index(drop=True)
    df = df.reset_index().set_index('Kód projektu')
    orig_df = orig_df.reset_index(drop=True)

    results = pd.read_csv('../data/VaVaI_Projekty_s_vysledky.csv')
    results = results.set_index('Kód projektu')
    results = results[results.index.isin(df.index)]
    df = df.loc[results.index]
    df = df.set_index('index')
    orig_df = orig_df.loc[df.index]
    return orig_df, df.index, results

def load_vectors_for_exp(exp_name):
    root_path = os.path.join('saved_models', exp_name)
    with open(os.path.join(root_path, 'vectors.pickle'), 'rb') as handle:
        vectors = pickle.load(handle)
    return vectors

weights = {
    'Jx': 2,
    'Jimp': 9,
    'Jost': 7,
    'JSC': 7,
    'D': 4,
    'C': 3,
    'O': 1,
    'X': 1,
    'B': 6,
    'Vsouhrn': 2,
    'Vx': 2,
    'Vutaj': 2,
    'Gfunk': 3,
    'Gprot': 3,
    'A': 4,
    'Nmap': 4,
    'NmetC': 2,
    'NmetS': 2,
    'NmetA': 2,
    'Npam': 2,
    'Nlec': 2,
    'W': 2,
    'M': 2,
    'R': 4,
    'Fuzit': 3,
    'Fprum': 3,
    'Ztech': 4,
    'Zpolop': 4,
    'Zodru': 4,
    'Zx': 1,
    'Zplem': 2,
    'P': 8,
    'Enekrit': 2,
    'Ekrit': 3,
    'Hleg': 2,
    'Hneleg': 2,
    'Hkonc': 2,
    'Sdb': 4
}

def assign_weights(res):
    if len(res) == 0: 
        return 0
    sres = res.split(';')
    return sum([weights[r.strip()] for r in sres])

In [4]:
res = defaultdict(list)
for exp_name in exp_features.keys():
    orig_data, idxs, results = load_data_for_exp(exp_name)
    vectors = load_vectors_for_exp(exp_name)
    results['VycetVysledku'] = results['VycetVysledku'].fillna('') 
    results['VahaEx'] = results['VycetVysledku'].apply(assign_weights)
    results['Vaha'] = results['VycetVysledku'].apply(lambda x: len(x.split(';')))
    
    for model_name, vector in vectors.items():
        X = vector[idxs]
        y = results['VahaEx'].to_numpy()

        if 'tf-idf' in model_name:
            scaler = StandardScaler(with_mean=False)
        else:
            scaler = StandardScaler()
        X = scaler.fit_transform(X)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        model = LinearRegression()
        model = model.fit(X_train, y_train)

        res['experiment_name'].append(exp_name)
        res['model_name'].append(model_name)
        res['score'].append(model.score(X_test, y_test))
        
results_df = pd.DataFrame(res)
results_df.to_csv('linear_regression_res_weighted.csv')
results_df

Unnamed: 0,experiment_name,model_name,score
0,w2v_d2v_kwords_lemmatize,"tf-idf_(1,1)",0.018325
1,w2v_d2v_kwords_lemmatize,"tf-idf_(1,2)",0.018325
2,w2v_d2v_kwords_lemmatize,"tf-idf_(1,3)",0.018325
3,w2v_d2v_kwords_lemmatize,"tf-idf_(1,4)",0.018325
4,w2v_d2v_kwords_lemmatize,"tf-idf_(1,5)",0.018325
...,...,...,...
330,tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_re...,doc2vec_100_50,0.058644
331,tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_re...,doc2vec_200_5,0.055208
332,tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_re...,doc2vec_200_10,0.059472
333,tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_re...,doc2vec_200_25,0.059508


In [5]:
results_df.nlargest(25, 'score')

Unnamed: 0,experiment_name,model_name,score
316,tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_re...,fasttext_200_10,0.105001
315,tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_re...,fasttext_200_5,0.10196
317,tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_re...,fasttext_200_25,0.099298
318,tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_re...,fasttext_200_50,0.09437
300,tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_re...,word2vec_200_10,0.089465
301,tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_re...,word2vec_200_25,0.089239
314,tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_re...,fasttext_100_50,0.086098
311,tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_re...,fasttext_100_5,0.084292
299,tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_re...,word2vec_200_5,0.084077
302,tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_re...,word2vec_200_50,0.083465


In [8]:
results_df['score'].max()

0.10500050618608348

In [None]:
res = defaultdict(list)
for exp_name in exp_features.keys():
    orig_data, idxs, results = load_data_for_exp(exp_name)
    vectors = load_vectors_for_exp(exp_name)
    results['VycetVysledku'] = results['VycetVysledku'].fillna('') 
    results['VahaEx'] = results['VycetVysledku'].apply(assign_weights)
    results['Vaha'] = results['VycetVysledku'].apply(lambda x: len(x.split(';')))
    
    for model_name, vector in vectors.items():
        X = vector[idxs]
        y = results['Vaha'].to_numpy()

        if 'tf-idf' in model_name:
            scaler = StandardScaler(with_mean=False)
        else:
            scaler = StandardScaler()
        X = scaler.fit_transform(X)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        model = MLPRegressor(hidden_layer_sizes=(128, 64), activation='tanh', random_state=42)
        model = model.fit(X_train, y_train)

        res['experiment_name'].append(exp_name)
        res['model_name'].append(model_name)
        res['score'].append(model.score(X_test, y_test))
        
        
results_df = pd.DataFrame(res)
results_df

In [6]:
results_df.nlargest(25, 'score')

Unnamed: 0,experiment_name,model_name,score
0,w2v_d2v_kwords_lemmatize,"tf-idf_(1,1)",0.033645
1,w2v_d2v_kwords_lemmatize,"tf-idf_(1,2)",0.033645
2,w2v_d2v_kwords_lemmatize,"tf-idf_(1,3)",0.033645
3,w2v_d2v_kwords_lemmatize,"tf-idf_(1,4)",0.033645
4,w2v_d2v_kwords_lemmatize,"tf-idf_(1,5)",0.033645
5,w2v_d2v_kwords_lemmatize,"tf-idf_(2,2)",0.033645
6,w2v_d2v_kwords_lemmatize,"tf-idf_(2,3)",0.033645
7,w2v_d2v_kwords_lemmatize,"tf-idf_(2,4)",0.033645
8,w2v_d2v_kwords_lemmatize,"tf-idf_(2,5)",0.033645
9,w2v_d2v_kwords_lemmatize,"tf-idf_(3,3)",0.033645


In [12]:
exp_name = 'tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_res_pos_2'
orig_data, idxs, results = load_data_for_exp(exp_name)
vectors = load_vectors_for_exp(exp_name)
results['VycetVysledku'] = results['VycetVysledku'].fillna('') 
results['VahaEx'] = results['VycetVysledku'].apply(assign_weights)
results['Vaha'] = results['VycetVysledku'].apply(lambda x: len(x.split(';')))

model_name = 'fasttext_200_10'
vector = vectors[model_name]
X = vector[idxs]
y = results['VahaEx'].to_numpy()

if 'tf-idf' in model_name:
    scaler = StandardScaler(with_mean=False)
else:
    scaler = StandardScaler()
X = scaler.fit_transform(X)

parmas = {'n_estimators': [100, 500, 1000], 
          'min_samples_split': [5, 25, 50], 
          'min_samples_leaf': [50, 100, 250],
          'max_depth': [3, 10, 25]}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = GradientBoostingRegressor(random_state=42, n_iter_no_change=10)
gs = GridSearchCV(model, parmas, verbose=3, cv=4)

gs = gs.fit(X_train, y_train)

gs.score(X_test, y_test)

Fitting 4 folds for each of 81 candidates, totalling 324 fits
[CV 1/4] END max_depth=3, min_samples_leaf=50, min_samples_split=5, n_estimators=100; total time=  17.6s
[CV 2/4] END max_depth=3, min_samples_leaf=50, min_samples_split=5, n_estimators=100; total time= 1.0min
[CV 3/4] END max_depth=3, min_samples_leaf=50, min_samples_split=5, n_estimators=100; total time=  32.6s
[CV 4/4] END max_depth=3, min_samples_leaf=50, min_samples_split=5, n_estimators=100; total time=  34.8s
[CV 1/4] END max_depth=3, min_samples_leaf=50, min_samples_split=5, n_estimators=500; total time=  17.3s
[CV 2/4] END max_depth=3, min_samples_leaf=50, min_samples_split=5, n_estimators=500; total time= 1.4min
[CV 3/4] END max_depth=3, min_samples_leaf=50, min_samples_split=5, n_estimators=500; total time=  31.8s
[CV 4/4] END max_depth=3, min_samples_leaf=50, min_samples_split=5, n_estimators=500; total time=  34.2s
[CV 1/4] END max_depth=3, min_samples_leaf=50, min_samples_split=5, n_estimators=1000; total time=

[CV 2/4] END max_depth=3, min_samples_leaf=250, min_samples_split=5, n_estimators=500; total time= 1.9min
[CV 3/4] END max_depth=3, min_samples_leaf=250, min_samples_split=5, n_estimators=500; total time=  41.0s
[CV 4/4] END max_depth=3, min_samples_leaf=250, min_samples_split=5, n_estimators=500; total time=  38.4s
[CV 1/4] END max_depth=3, min_samples_leaf=250, min_samples_split=5, n_estimators=1000; total time=  30.7s
[CV 2/4] END max_depth=3, min_samples_leaf=250, min_samples_split=5, n_estimators=1000; total time= 1.9min
[CV 3/4] END max_depth=3, min_samples_leaf=250, min_samples_split=5, n_estimators=1000; total time=  40.9s
[CV 4/4] END max_depth=3, min_samples_leaf=250, min_samples_split=5, n_estimators=1000; total time=  38.1s
[CV 1/4] END max_depth=3, min_samples_leaf=250, min_samples_split=25, n_estimators=100; total time=  30.4s
[CV 2/4] END max_depth=3, min_samples_leaf=250, min_samples_split=25, n_estimators=100; total time=  59.7s
[CV 3/4] END max_depth=3, min_samples_le

[CV 3/4] END max_depth=10, min_samples_leaf=100, min_samples_split=5, n_estimators=1000; total time= 2.6min
[CV 4/4] END max_depth=10, min_samples_leaf=100, min_samples_split=5, n_estimators=1000; total time= 1.3min
[CV 1/4] END max_depth=10, min_samples_leaf=100, min_samples_split=25, n_estimators=100; total time= 1.0min
[CV 2/4] END max_depth=10, min_samples_leaf=100, min_samples_split=25, n_estimators=100; total time= 2.3min
[CV 3/4] END max_depth=10, min_samples_leaf=100, min_samples_split=25, n_estimators=100; total time= 2.5min
[CV 4/4] END max_depth=10, min_samples_leaf=100, min_samples_split=25, n_estimators=100; total time= 1.3min
[CV 1/4] END max_depth=10, min_samples_leaf=100, min_samples_split=25, n_estimators=500; total time=  59.2s
[CV 2/4] END max_depth=10, min_samples_leaf=100, min_samples_split=25, n_estimators=500; total time= 2.3min
[CV 3/4] END max_depth=10, min_samples_leaf=100, min_samples_split=25, n_estimators=500; total time= 2.5min
[CV 4/4] END max_depth=10, m

[CV 3/4] END max_depth=25, min_samples_leaf=50, min_samples_split=25, n_estimators=100; total time= 1.7min
[CV 4/4] END max_depth=25, min_samples_leaf=50, min_samples_split=25, n_estimators=100; total time= 5.5min
[CV 1/4] END max_depth=25, min_samples_leaf=50, min_samples_split=25, n_estimators=500; total time= 1.2min
[CV 2/4] END max_depth=25, min_samples_leaf=50, min_samples_split=25, n_estimators=500; total time= 7.3min
[CV 3/4] END max_depth=25, min_samples_leaf=50, min_samples_split=25, n_estimators=500; total time= 1.8min
[CV 4/4] END max_depth=25, min_samples_leaf=50, min_samples_split=25, n_estimators=500; total time= 5.7min
[CV 1/4] END max_depth=25, min_samples_leaf=50, min_samples_split=25, n_estimators=1000; total time= 1.3min
[CV 2/4] END max_depth=25, min_samples_leaf=50, min_samples_split=25, n_estimators=1000; total time= 7.4min
[CV 3/4] END max_depth=25, min_samples_leaf=50, min_samples_split=25, n_estimators=1000; total time= 1.9min
[CV 4/4] END max_depth=25, min_sam

[CV 4/4] END max_depth=25, min_samples_leaf=250, min_samples_split=25, n_estimators=500; total time= 3.4min
[CV 1/4] END max_depth=25, min_samples_leaf=250, min_samples_split=25, n_estimators=1000; total time= 1.4min
[CV 2/4] END max_depth=25, min_samples_leaf=250, min_samples_split=25, n_estimators=1000; total time= 4.8min
[CV 3/4] END max_depth=25, min_samples_leaf=250, min_samples_split=25, n_estimators=1000; total time= 6.9min
[CV 4/4] END max_depth=25, min_samples_leaf=250, min_samples_split=25, n_estimators=1000; total time= 3.4min
[CV 1/4] END max_depth=25, min_samples_leaf=250, min_samples_split=50, n_estimators=100; total time= 1.4min
[CV 2/4] END max_depth=25, min_samples_leaf=250, min_samples_split=50, n_estimators=100; total time= 3.7min
[CV 3/4] END max_depth=25, min_samples_leaf=250, min_samples_split=50, n_estimators=100; total time= 3.9min
[CV 4/4] END max_depth=25, min_samples_leaf=250, min_samples_split=50, n_estimators=100; total time= 3.4min
[CV 1/4] END max_depth=2

0.11070697772300153

In [20]:
exp_name = 'tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_res_pos_2'
orig_data, idxs, results = load_data_for_exp(exp_name)
vectors = load_vectors_for_exp(exp_name)
results['VycetVysledku'] = results['VycetVysledku'].fillna('') 
results['VahaEx'] = results['VycetVysledku'].apply(assign_weights)
results['Vaha'] = results['VycetVysledku'].apply(lambda x: len(x.split(';')))

model_name = 'fasttext_200_10'
vector = vectors[model_name]
X = vector[idxs]
y = results['Vaha'].to_numpy()

if 'tf-idf' in model_name:
    scaler = StandardScaler(with_mean=False)
else:
    scaler = StandardScaler()
X = scaler.fit_transform(X)

parmas = {'n_estimators': [100, 500, 1000], 
          'min_samples_split': [5, 25, 50], 
          'min_samples_leaf': [50, 100, 250],
          'max_depth': [3, 10, 25]}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = GradientBoostingRegressor(random_state=42, n_iter_no_change=50, n_estimators=100, min_samples_split=5, min_samples_leaf=100, max_depth=50)

model = model.fit(X_train, y_train)

model.score(X_test, y_test)

0.14400058907411029

In [27]:
exp_name = 'tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_res_pos_2'
orig_data, idxs, results = load_data_for_exp(exp_name)
vectors = load_vectors_for_exp(exp_name)
results['VycetVysledku'] = results['VycetVysledku'].fillna('') 
results['VahaEx'] = results['VycetVysledku'].apply(assign_weights)
results['Vaha'] = results['VycetVysledku'].apply(lambda x: len(x.split(';')))

model_name = 'fasttext_200_10'
vector = vectors[model_name]
X = vector[idxs]
y = results['Vaha'].to_numpy()

if 'tf-idf' in model_name:
    scaler = StandardScaler(with_mean=False)
else:
    scaler = StandardScaler()
X = scaler.fit_transform(X)

parmas = {'n_estimators': [100, 500, 1000], 
          'min_samples_split': [5, 25, 50], 
          'min_samples_leaf': [50, 100, 250],
          'max_depth': [3, 10, 25]}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = SVR(kernel='rbf', cache_size=1000, epsilon=.01, shrinking=False, C=.99)

model = model.fit(X_train, y_train)

model.score(X_test, y_test)

0.015797000879581802

In [28]:
exp_name = 'tf_w2v_d2v_fast_cz_nolemma_naz_anot_obor_uc_res_pos_2'
orig_data, idxs, results = load_data_for_exp(exp_name)
vectors = load_vectors_for_exp(exp_name)
results['VycetVysledku'] = results['VycetVysledku'].fillna('') 
results['VahaEx'] = results['VycetVysledku'].apply(assign_weights)
results['Vaha'] = results['VycetVysledku'].apply(lambda x: len(x.split(';')))

model_name = 'fasttext_200_10'
vector = vectors[model_name]
X = vector[idxs]
y = results['Vaha'].to_numpy()

if 'tf-idf' in model_name:
    scaler = StandardScaler(with_mean=False)
else:
    scaler = StandardScaler()
X = scaler.fit_transform(X)

parmas = {'n_estimators': [100, 500, 1000], 
          'min_samples_split': [5, 25, 50], 
          'min_samples_leaf': [50, 100, 250],
          'max_depth': [3, 10, 25]}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = MLPRegressor(hidden_layer_sizes=(256, 64), random_state=42)

model = model.fit(X_train, y_train)

model.score(X_test, y_test)

0.2107702671263345

In [62]:
pred = model.predict(X)

In [63]:
dfr = results.rename(columns={'VycetVysledku': 'Výčet druhů výsledků', 'Vaha': 'Bodové ohodnocení', 'VahaEx': 'Vážené bodové ohodnocení'}).copy()

In [66]:
pd.set_option('display.max_colwidth', None)
dfr = dfr[['Název česky', 'Výčet druhů výsledků', 'Bodové ohodnocení']]
dfr['Predikované bodové ohodnocení'] = pred

In [70]:
dfr.sample(10)

Unnamed: 0_level_0,Název česky,Výčet druhů výsledků,Bodové ohodnocení,Predikované bodové ohodnocení
Kód projektu,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GA203/98/1330,"Uplatnění (CH2-O), (CH2-S) a (CH2-CH2) isosterů amidové a disulfidové vazby v syntéze bioaktivních peptidů s prodlouženým účinkem",D ;D ;D,3,4.330835
IA31213,Hlubinná geoelektrická stavba jihozápadního okraje Českého masivu,,1,1.555721
GA522/02/1227,Regulační aspekty přímé somatické embryogeneze u hrachu (Pisum sativum L.),D,1,0.474226
GA102/96/1610,Nové technologie přípravy optoelektronických struktur,,1,8.022363
LTC17067,Využití NMR relaxačních a difúzních měření pro stanovení dynamiky komplexních molekulárních systémů,Jimp ;Jimp ;Jimp ;Jimp ;Jimp ;Jimp,6,3.75415
GA203/95/0650,Chemické aplikace indexů podobnosti,Jx ;Jx ;Jx ;D ;Jx ;Jx ;D ;D ;Jx ;Jx ;Jx ;Jx,12,13.377961
GA103/02/0243,Odstraňování huminových látek z přírodních vod,D ;D ;D ;D ;D ;D ;D ;D ;D ;D ;D ;D ;Jx ;D ;D ;D ;D ;D ;A ;A ;D ;O ;Jx ;D ;D ;A ;D ;D ;D ;D ;D ;C,32,19.685251
GA526/03/1485,"Produkční potenciál a stabilita smíšených lesních porostů ve 2., 3. a 4. lesním vegetačním stupni jako podklad pro optimalizaci cílové skladby dřevin",Jx ;Jx ;D ;D ;D ;Jx ;D ;D ;D ;Jx,10,10.3075
VS97046,Centrum mikrosystémů,D ;Jx ;D ;D ;O ;O,6,8.889691
GA312/95/0583,Fenotypická reverse savčí buňky transformované onkogenem v-src - model pro studium buněčných mechanismů regulace provirové exprese,Jx ;Jx,2,3.623506
