In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd
from re import sub
import multiprocessing
import re
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

In [2]:
def preprocess(text):
    text = str(text)

    #alpha numeric, punctuation remove kortesi
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = sub(r"\+", " plus ", text)
    text = sub(r",", " ", text)
    text = sub(r"\.", " ", text)
    text = sub(r"!", " ! ", text)
    text = sub(r"\?", " ? ", text)
    text = sub(r"'", " ", text)
    text = sub(r":", " : ", text)
    text = sub(r"\s{2,}", " ", text)

    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w == 'no' or w == 'not' or not w in stop_words]
    porter = PorterStemmer()
    stems = []
    for t in tokens:
        stems.append(porter.stem(t))

    clean_words = []
    rx = re.compile(r'\D*\d')
    for s in stems:
        if rx.match(s):
            continue 
        clean_words.append(s)
    return clean_words

In [3]:
dataset = pd.read_csv("disease_drug_sentiment_textblob_and_outlier_flag.csv")

In [4]:
disease_list = dataset['disease'].tolist()
drug_list = dataset['drug'].tolist()
article_list = dataset['sentences'].tolist()

In [5]:
processed_article_list = []

for article in article_list:
    processed_article = preprocess(article)
    processed_article_list.append(processed_article)

In [6]:
sent = processed_article_list #just to keep it as source
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[sent]

In [7]:
w2v_model = Word2Vec(min_count=5, window=4, vector_size=300, sample=1e-5, alpha=0.03, min_alpha=0.0007, negative=20, workers=multiprocessing.cpu_count())

In [8]:
w2v_model.build_vocab(sentences, progress_per=50000)

In [9]:
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

(2332907, 16841760)

In [10]:
w2v_model.init_sims(replace=True)

  w2v_model.init_sims(replace=True)


In [11]:
w2v_model.save("word2vec.model")

In [12]:
new_article_list = []
for article in processed_article_list:
    new_article = ' '.join(bigram[article])
    new_article_list.append(new_article)

In [13]:
zippedList =  list(zip(disease_list, drug_list, new_article_list))
clean_data = pd.DataFrame(zippedList, columns = ['disease' , 'drug', 'title']) 
clean_data.to_csv('cleaned_dataset.csv', index=False)

In [15]:
clean_data.head()

Unnamed: 0,disease,drug,title
0,influenza,Procalcitonin,respiratori viru influenza a influenza b_parai...
1,influenza,Gold,the strength studi influenza infect confirm_rt...
2,influenza,L-Glutamine,wt_mutant ha_influenza virus_a/hong propag_day...
3,influenza,Streptomycin,wt_mutant ha_influenza virus_a/hong propag_day...
4,influenza,Verdinexor,in influenza_viru infect mice verdinexor shown...


In [18]:
ab_dataset = pd.read_csv('Dataset/dataset_3_abdullah.csv')

In [19]:
ab_dataset.rename(columns={'chosen_sentence': 'sentences'}, inplace=True)

In [20]:
article_list_ab = ab_dataset['sentences'].tolist()

In [21]:
processed_article_list_ab = []

for article in article_list_ab:
    processed_article = preprocess(article)
    processed_article_list_ab.append(processed_article)

In [24]:
new_article_list_ab = []
for article in processed_article_list_ab:
    new_article = ' '.join(bigram[article])
    new_article_list_ab.append(new_article)

In [26]:
ab_dataset['title'] = new_article_list_ab

In [28]:
ab_dataset.to_csv('Dataset/dataset_3_abdullah.csv', index=False)

## Get title values for combined dataset

In [29]:
combined_dataset = pd.read_csv('Dataset/Combined_Dataset_2022-18-02.csv')

In [33]:
article_list_combined = combined_dataset['sentences'].tolist()

In [34]:
processed_article_list_combined = []

for article in article_list_combined:
    processed_article = preprocess(article)
    processed_article_list_combined.append(processed_article)

In [35]:
new_article_list_combined = []
for article in processed_article_list_combined:
    new_article = ' '.join(bigram[article])
    new_article_list_combined.append(new_article)

In [36]:
combined_dataset['title'] = new_article_list_combined

In [41]:
combined_dataset.to_csv('Dataset/Combined_Dataset_2022-18-02.csv', index=False)