# Step 12: Unsupervised sentiment prediction

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    Build a {word: tf-idf score} dictionary for one input sentence.

    Inspired by a function from this article:
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

    x - row of the dataframe; only x.name is read here, and it is used to pick
        the matching row of transformed_file (so it has to be a valid row
        position in that matrix),
    transformed_file - all sentences transformed with TfidfVectorizer (sparse matrix),
    features - pandas Series with the names of all words in the corpus used in
        TfidfVectorizer, ordered like the columns of transformed_file

    returns a dict mapping every word stored in that row to its score
    '''
    vector_coo = transformed_file[x.name].tocoo()
    # Look the words up on the side instead of writing them back into
    # vector_coo.col: that attribute holds integer column indices, and newer
    # SciPy releases coerce assigned values to the index dtype, which fails
    # for an array of strings.
    words = features.iloc[vector_coo.col].values
    return dict(zip(words, vector_coo.data))

def replace_tfidf_words(x, transformed_file, features):
    '''
    Swap every word of a sentence for its tf-idf score.

    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    returns a list with one score per whitespace-separated word of x.title,
    in sentence order; each word is lower-cased before the lookup, and a word
    missing from the row's dictionary raises KeyError
    '''
    scores_by_word = create_tfidf_dictionary(x, transformed_file, features)
    return [scores_by_word[word.lower()] for word in x.title.split()]


def replace_sentiment_words(word, sentiment_dict):
    '''
    Look up the sentiment score of a single word.

    word - key to look up,
    sentiment_dict - mapping from word to its sentiment score

    returns the stored score, or 0 when the lookup raises KeyError
    '''
    try:
        return sentiment_dict[word]
    except KeyError:
        return 0

In [3]:
# Load the input table; the cells below read its `title` column.
final_file = pd.read_csv('cleaned_dataset.csv')

In [4]:
# Word -> sentiment coefficient lookup, built from the `words` and
# `sentiment_coeff` columns of the dictionary file.
sentiment_map = pd.read_csv('sentiment_dictionary.csv')
sentiment_dict = dict(
    zip(sentiment_map['words'].values, sentiment_map['sentiment_coeff'].values)
)

In [5]:
# Work on a copy so the frame loaded into final_file is left untouched.
file_weighting = final_file.copy()

In [6]:
# Tokens are obtained by plain whitespace splitting; norm=None keeps the raw
# (un-normalised) tf-idf weights.
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
# Fit and transform in one pass instead of walking the corpus twice.
transformed = tfidf.fit_transform(file_weighting.title)
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back to the old name on
# older releases.
_get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
features = pd.Series(_get_feature_names())



In [18]:
# Display the vocabulary Series (one entry per tf-idf feature).
features

0                              /
1                  //clinicaltri
2              //www_brugadadrug
3        //www_int/teams/control
4                            /_/
                  ...           
21492             zuclopenthixol
21493                   zusammen
21494                      zuvor
21495           zykl_atoltivimab
21496              zymosan_induc
Length: 21497, dtype: object

In [7]:
# this step takes around 3-4 minutes to calculate
replaced_tfidf_scores = file_weighting.apply(
    replace_tfidf_words, axis=1, args=(transformed, features)
)

In [8]:
# One sentiment coefficient per whitespace-separated token of each title.
replaced_closeness_scores = file_weighting.title.apply(
    lambda sentence: [
        replace_sentiment_words(token, sentiment_dict) for token in sentence.split()
    ]
)

In [9]:
# Line up, per sentence, the word-level sentiment coefficients, the word-level
# tf-idf scores and the sentence text itself.
replacement_df = pd.DataFrame(
    data=[replaced_closeness_scores, replaced_tfidf_scores, file_weighting.title]
).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence']


def _weighted_sentiment(row):
    '''Dot product of a row's sentiment coefficients with its tf-idf scores.'''
    coefficients = np.array(row.loc['sentiment_coeff'])
    weights = np.array(row.loc['tfidf_scores'])
    return coefficients @ weights


replacement_df['sentiment_rate'] = replacement_df.apply(_weighted_sentiment, axis=1)

In [12]:
# Label a sentence 1 when its sentiment rate is strictly positive, otherwise 0.
replacement_df['prediction'] = replacement_df['sentiment_rate'].gt(0).astype('int8')

In [14]:
# Show the sentences whose prediction is not 1.
replacement_df.loc[replacement_df['prediction'].ne(1)]

Unnamed: 0,sentiment_coeff,tfidf_scores,sentence,sentiment_rate,prediction
11,"[0.8839565528366733, -0.7722962736902768, -0.7...","[3.6745040786110064, 2063.958370971322, 14.496...",immunodefici_syndrom or mening or substanc_abu...,-936411.647911,0
12,"[1.456936426648782, 1.204730289144018, -0.7874...","[3.3439402786759596, 8.826706825303614, 7.6130...",the two studi assess_poc xpert_test south_afri...,-154.683988,0
23,"[0, 0, 3.132533494795469, 0, 2.385037471577086...","[8.654680165222555, 8.654680165222555, 7.55606...",ellag_acid transcreen tm adp assay ntpase hrtf...,-495.232033,0
24,"[0.8839565528366733, -0.7722962736902768, -0.7...","[3.6745040786110064, 2063.958370971322, 14.496...",immunodefici_syndrom or mening or substanc_abu...,-936411.647911,0
25,"[0.8839565528366733, -0.7722962736902768, -0.7...","[3.6745040786110064, 2063.958370971322, 14.496...",immunodefici_syndrom or mening or substanc_abu...,-936411.647911,0
...,...,...,...,...,...
4212,"[-1.315919922487211, -1.2448841391919236, -1.2...","[6.639777144680289, 6.639777144680289, 6.57523...",adapt_http //www_int/teams/control neglect_tro...,-209.209717,0
4213,"[-1.315919922487211, -1.2448841391919236, -1.2...","[6.639777144680289, 6.639777144680289, 6.57523...",adapt_http //www_int/teams/control neglect_tro...,-209.209717,0
4214,"[-1.315919922487211, -1.2448841391919236, -1.2...","[6.639777144680289, 6.639777144680289, 6.57523...",adapt_http //www_int/teams/control neglect_tro...,-209.209717,0
4215,"[-1.315919922487211, -1.2448841391919236, -1.2...","[6.639777144680289, 6.639777144680289, 6.57523...",adapt_http //www_int/teams/control neglect_tro...,-292.773016,0


In [15]:
# Persist only the sentence text, its score and its label (no index column).
replacement_df[['sentence', 'sentiment_rate', 'prediction']].to_csv(
    'predicted_result.csv', index=False
)

In [16]:
# Attach the unsupervised score, label and processed sentence to the existing
# output table (values are matched on the row index) and write the result.
outdata = pd.read_csv("disease_drug_sentiment_textblob_and_outlier_flag.csv").assign(
    sentiment_rate_unsupervised=replacement_df['sentiment_rate'],
    label_unsupervised=replacement_df['prediction'],
    sentiment_sentence=replacement_df['sentence'],
)
outdata.to_csv("final_output.csv", index=False)

In [19]:
# Display the combined table that was written to final_output.csv.
outdata

Unnamed: 0,disease,drug,sentences,distance,outlier_flag,avg_polarity,sentiment_rate_unsupervised,label_unsupervised,sentiment_sentence
0,influenza,Procalcitonin,"Respiratory virus (influenza A, influenza B, p...",20,1,-0.037500,55.993455,1,respiratori viru influenza a influenza b_parai...
1,influenza,Gold,The strengths of this study are that: (i) infl...,9,1,-0.011310,115.808610,1,the strength studi influenza infect confirm_rt...
2,influenza,L-Glutamine,WT and mutant HA-Y161A influenza viruses A/Hon...,65,0,-0.100000,99.674435,1,wt_mutant ha_influenza virus_a/hong propag_day...
3,influenza,Streptomycin,WT and mutant HA-Y161A influenza viruses A/Hon...,72,0,-0.100000,99.674435,1,wt_mutant ha_influenza virus_a/hong propag_day...
4,influenza,Verdinexor,"In influenza virus-infected mice, verdinexor w...",3,1,0.000000,42.969673,1,in influenza_viru infect mice verdinexor shown...
...,...,...,...,...,...,...,...,...,...
4215,necatoriasis,Albendazole,� Adapted from https://www.who.int/teams/contr...,15,1,0.098634,-292.773016,0,adapt_http //www_int/teams/control neglect_tro...
4216,necatoriasis,Mebendazole,� Adapted from https://www.who.int/teams/contr...,13,1,0.098634,-292.773016,0,adapt_http //www_int/teams/control neglect_tro...
4217,histiocytoma,Risankizumab,Three patients had previous malignancy (malign...,10,1,-0.041667,207.131872,1,three patient previou malign malign fibrou his...
4218,neuroacanthocytosis,Iron,"further write that ""Hereditary etiologies of c...",13,1,0.000000,212.260935,1,write hereditari etiolog chorea includ hunting...
