In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

warnings.filterwarnings("ignore")

import nltk
nltk.download('stopwords')

import regex as re
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data_drug  = pd.read_csv("recommendation_data.csv")
data_drug.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,clean_review,skor_sentimen,sentimen,recommendation_score
0,206461,valsartan,left ventricular dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,0.020914,no side effect take combin bystol mg fish oil,-0.296,0,0.0
1,95260,guanfacine,adhd,"""My son is halfway through his fourth week of ...",8,27-Apr-10,0.148722,son halfway fourth week intuniv becam concern ...,0.6929,1,0.148722
2,92703,lybrel,birth control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,0.013168,use take anoth oral contracept pill cycl happi...,0.2732,1,0.013168
3,138000,ortho evra,birth control,"""This is my first time using any form of birth...",8,3-Nov-15,0.007746,first time use form birth control glad went pa...,0.1027,1,0.007746
4,35696,buprenorphine / naloxone,opiate dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,0.02866,suboxon complet turn life around feel healthie...,0.8934,1,0.02866


In [3]:
print('Shape dataset:', data_drug.shape)

Shape dataset: (202783, 11)


In [4]:
data_drug.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202783 entries, 0 to 202782
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   uniqueID              202783 non-null  int64  
 1   drugName              202783 non-null  object 
 2   condition             202783 non-null  object 
 3   review                202783 non-null  object 
 4   rating                202783 non-null  int64  
 5   date                  202783 non-null  object 
 6   usefulCount           202783 non-null  float64
 7   clean_review          202783 non-null  object 
 8   skor_sentimen         202783 non-null  float64
 9   sentimen              202783 non-null  int64  
 10  recommendation_score  202783 non-null  float64
dtypes: float64(3), int64(3), object(5)
memory usage: 17.0+ MB


In [5]:
# Cleaning func

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [6]:
# Cleaning

def preprocess_text(text_data):

    text_data = decontracted(text_data)

    text_data = text_data.replace('\n',' ')
    text_data = text_data.replace('\r',' ')
    text_data = text_data.replace('\t',' ')
    text_data = text_data.replace('-',' ')
    text_data = text_data.replace("/",' ')
    text_data = text_data.replace(">",' ')
    text_data = text_data.replace('"',' ')
    text_data = text_data.replace('?',' ')
    return text_data

In [7]:
# Stopword

stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

# menghapus 'no' dari daftar stopwords karena pentingnya 'side effects' dan 'no side effects' dalam kolom review
stop_words.remove('no')

# Stemming
def nlp_preprocessing(review):
    '''untuk menghapus digit, spasi berlebih, kata2 yg tdk diingankan dan mengkonversi ke huruf kecil lalu melakukan stemming'''

    if type(review) is not int:
        string = ""
        review = preprocess_text(review)
        review = re.sub('[^a-zA-Z]', ' ', review)

        review = re.sub('\s+',' ', review)

        review = review.lower()

        for word in review.split():

            if not word in stop_words:
                word = stemmer.stem(word)
                string += word + " "

        return string

In [8]:
# Kolom baru 'clean_review'
data_drug['drugName'] = data_drug['drugName'].apply(nlp_preprocessing)
data_drug['condition'] = data_drug['condition'].apply(nlp_preprocessing)

NameError: name 'data' is not defined

In [9]:
# Menggabungkan kolom A dan B menjadi kolom baru C
data_drug['full_konteks'] = data_drug['drugName'] + data_drug['condition'] + data_drug['clean_review']

In [10]:
vectorizer = TfidfVectorizer()
X_new = vectorizer.fit_transform([x.lower() for x in data_drug['full_konteks']])

In [20]:
def searching(word, limit=5):
    word = re.sub('[^a-zA-Z0-9 ]','', word.lower()) # match everyting that's not alphabet and digit and remove it
    query_vec = vectorizer.transform([word])
    similarity = cosine_similarity(query_vec, X_new).flatten()
    
    filtered = np.where(similarity != 0)[0]
    indices = np.argsort(-similarity[filtered])
    correct_indices = filtered[indices]
    result = data_drug.iloc[correct_indices]
    
    if not len(result):
        return 'Result not found'
    
    overall =  result['recommendation_score'] *  similarity[correct_indices] 
    
    return result.loc[overall.sort_values(ascending=False).index].head(limit)

In [21]:
searching("left ventricular dysfunction")

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,clean_review,skor_sentimen,sentimen,recommendation_score,full_konteks
169487,160759,buspirone,"sexual dysfunction, ssri induced","""I found that Buspar helped quite a bit in all...",8,27-Nov-10,0.223857,found buspar help quit bit allow orgasm also q...,0.5574,1,0.223857,"buspironesexual dysfunction, ssri inducedfound..."
93951,204573,buspar,"sexual dysfunction, ssri induced","""I found that Buspar helped quite a bit in all...",8,27-Nov-10,0.223857,found buspar help quit bit allow orgasm also q...,0.5574,1,0.223857,"busparsexual dysfunction, ssri inducedfound bu..."
79770,160777,buspirone,"sexual dysfunction, ssri induced","""I have taken many antidepressants and a lot o...",2,19-Apr-10,0.150271,taken mani antidepress lot took away desir sex...,0.5267,1,0.150271,"buspironesexual dysfunction, ssri inducedtaken..."
135552,204584,buspar,"sexual dysfunction, ssri induced","""I have taken many antidepressants and a lot o...",2,19-Apr-10,0.150271,taken mani antidepress lot took away desir sex...,0.5267,1,0.150271,"busparsexual dysfunction, ssri inducedtaken ma..."
134066,216457,sildenafil,"sexual dysfunction, ssri induced","""I&#039;m 40 years old and split 100mg in half...",10,21-Oct-11,0.090627,year old split mg half mg strong enough erect ...,0.875,1,0.090627,"sildenafilsexual dysfunction, ssri inducedyear..."
