In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
np.random.seed(1)
import sys
import sklearn
import sklearn.ensemble
%load_ext autoreload
%autoreload 2
from anchor import utils
import lime
import lime.lime_tabular
from int_met import *

  from numpy.core.umath_tests import inner1d


In [2]:
# Folder (relative to the working directory) holding the train/test CSV files.
dataset_folder = 'Datasets/drugs/'

# String concatenation already yields a str, so the paths are passed directly.
train = pd.read_csv(dataset_folder + 'train.csv')
test = pd.read_csv(dataset_folder + 'test.csv')

In [3]:
# Stack both CSV splits into one frame with a fresh 0..n-1 index; the old
# per-file index is discarded rather than kept as a column.
df_all = pd.concat([train, test]).reset_index(drop=True)

In [4]:
# Treat empty strings as missing values, then discard every row that has a
# missing value. The shape is printed before and after the drop so the number
# of removed rows is visible in the cell output.
df_all = df_all.replace('', np.nan)
print(df_all.shape)
df_all = df_all.dropna()
print(df_all.shape)

(4143, 2)
(4141, 2)


In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
from sklearn.model_selection import train_test_split

In [6]:
# Negation words that must survive stop-word filtering.
# NOTE(review): review_to_words replaces every non-letter (apostrophes
# included) with a space before the stop-word lookup, so the contracted
# entries below can never equal a token; only "no", "nor" and "not" can
# actually match. Confirm whether that is intended.
not_stop = [
    "aren't", "couldn't", "didn't", "doesn't", "don't",
    "hadn't", "hasn't", "haven't", "isn't", "mightn't",
    "mustn't", "needn't", "no", "nor", "not",
    "shan't", "shouldn't", "wasn't", "weren't", "wouldn't",
]

# English stop words minus the negations listed above.
stops = set(stopwords.words('english'))
stops.difference_update(not_stop)

stemmer = SnowballStemmer('english')
def review_to_words(raw_review):
    """Normalise one review into a space-separated string of stemmed tokens.

    Steps: replace every character that is not an ASCII letter with a space,
    lower-case, split on whitespace, drop tokens found in the module-level
    ``stops`` set, stem the rest with the module-level ``stemmer``.

    Parameters
    ----------
    raw_review : str
        The review text to clean.

    Returns
    -------
    str
        The surviving stems joined by single spaces (empty string when no
        token survives).
    """
    tokens = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    stems = (stemmer.stem(token) for token in tokens if token not in stops)
    return ' '.join(stems)

In [7]:
# Clean every review with review_to_words and overwrite the column with the
# result (the raw text is not kept). %time reports the wall time of the apply.
%time df_all['sideEffectsReview'] = df_all['sideEffectsReview'].apply(review_to_words)

Wall time: 1.87 s


In [8]:
def categorize_rating(x):
    """Binarise a side-effect rating label.

    Parameters
    ----------
    x : object
        The rating label, compared by equality against the two label strings.

    Returns
    -------
    int
        0 when ``x`` is 'Mild Side Effects' or 'No Side Effects',
        1 for every other value.
    """
    low_severity = ('Mild Side Effects', 'No Side Effects')
    return 0 if x in low_severity else 1

In [9]:
# Replace the textual rating with its binary label (0/1) in place.
# NOTE: this cell is not idempotent. categorize_rating returns 1 for anything
# other than the two label strings, so running it a second time on the
# already-encoded column turns every label into 1.
df_all['sideEffects'] = df_all["sideEffects"].apply(categorize_rating)

In [10]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split the same on every run.
df_train, df_test = train_test_split(
    df_all,
    test_size=0.2,
    random_state=42,
)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Word-level bag-of-words counts over the already cleaned and stemmed reviews,
# so no tokenizer, preprocessor or stop-word list is applied here.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    min_df=2,
    max_features=5000,
)

In [12]:
# Learn the vocabulary on the training reviews only, then reuse that same
# vocabulary to encode the test reviews (transform, not fit_transform).
%time X_train = vectorizer.fit_transform(df_train['sideEffectsReview'])
%time X_test = vectorizer.transform(df_test['sideEffectsReview'])

Wall time: 69 ms
Wall time: 0 ns


In [13]:
# Binary target column for each split.
y_train, y_test = (frame['sideEffects'] for frame in (df_train, df_test))

In [14]:
# `sklearn.metrics` is used below but is not imported anywhere in the
# notebook's import cells; without this line the cell only works if some other
# import happened to load the submodule as a side effect. Import it explicitly
# so the cell does not depend on that.
import sklearn.metrics

# Random forest on the bag-of-words features; random_state fixed for
# reproducibility.
c = sklearn.ensemble.RandomForestClassifier(n_estimators=100, n_jobs=3, random_state=1)
c.fit(X_train, y_train)

# Kept as a lambda so that predict_fn always delegates to whatever `c` is
# currently bound to.
predict_fn = lambda x: c.predict(x)

# Accuracy on the training split and on the held-out split.
print('Train', sklearn.metrics.accuracy_score(y_train, predict_fn(X_train)))
print('Test', sklearn.metrics.accuracy_score(y_test, predict_fn(X_test)))

Train 0.9978864734299517
Test 0.7949336550060314


In [15]:
from sklearn.pipeline import make_pipeline
# Chain the already-fitted vectorizer and classifier into one estimator that
# accepts raw text. pip_line is not referenced again in the visible cells.
pip_line = make_pipeline(vectorizer, c)

In [16]:
import spacy
# Load the spaCy model named 'en_core_web_lg'; the resulting object is handed
# to AnchorText in a later cell. The model package must be installed.
nlp = spacy.load('en_core_web_lg')

In [17]:
from anchor import anchor_text
# Anchor explainer for text, built on the spaCy pipeline loaded above.
# NOTE(review): the second argument is presumably the list of class names;
# only one name is passed although the classifier has two classes (0/1) -
# confirm against the anchor library's documentation.
explainer = anchor_text.AnchorText(nlp, ['sideEffects'], use_unk_distribution=True)

In [18]:
def predict_lr(texts):
    """Predict labels for raw (already cleaned) texts.

    The texts are encoded with the module-level fitted ``vectorizer`` and the
    resulting matrix is passed to the module-level classifier ``c``.

    Parameters
    ----------
    texts : iterable of str
        Documents to classify.

    Returns
    -------
    The output of ``c.predict`` for the encoded documents.
    """
    features = vectorizer.transform(texts)
    return c.predict(features)

In [19]:
def exp_fn_bulk(x):
    """Compute the anchor explanation of every text in ``x``.

    Parameters
    ----------
    x : pandas.Series
        Texts to explain; elements are read positionally via ``x.iloc``.

    Returns
    -------
    numpy.ndarray
        1-D object array of length ``len(x)``. Element ``k`` is whatever
        ``.names()`` returns for the explanation of the ``k``-th text.
    """
    anchors = [
        explainer.explain_instance(
            x.iloc[idx], predict_lr, threshold=0.95, use_proba=False
        ).names()
        for idx in range(len(x))
    ]
    # The per-text results can differ in length. Passing such a ragged list to
    # np.array() raises ValueError on NumPy >= 1.24 and, on older versions,
    # yields either a 1-D object array or a 2-D array depending on the data.
    # Fill an object array element by element so the result shape is always
    # (len(x),) regardless of the individual lengths.
    result = np.empty(len(anchors), dtype=object)
    for idx, names in enumerate(anchors):
        result[idx] = names
    return result

In [20]:
# Drop fully duplicated rows from the test split and show the shape before
# and after, one per line.
unique_df = df_test.drop_duplicates()
print(df_test.shape, unique_df.shape, sep='\n')

(829, 2)
(751, 2)


In [22]:
# Explain the same first 400 de-duplicated test reviews twice. The two runs
# use identical inputs on purpose: exp1 and exp2 are compared with each other
# by calc_identity in a later cell.
# Each call is slow (about 22-23 minutes per run in the recorded output).
%time exp1 = exp_fn_bulk(unique_df.sideEffectsReview[:400])
print('done')
%time exp2 = exp_fn_bulk(unique_df.sideEffectsReview[:400])

Wall time: 22min 57s
done
Wall time: 22min 25s


In [23]:
# Compare the two explanation runs over the same inputs.
# calc_identity comes from the star import of int_met; the meaning of the
# returned tuple is defined there and is not visible in this notebook.
i = calc_identity(exp1,exp2)
print(i)

(21.75, 313, 400)


In [24]:
# Metric computed on the second run of explanations only.
# calc_separability comes from the star import of int_met; the meaning of the
# returned tuple is defined there and is not visible in this notebook.
s = calc_separability(exp2)
print (s)

(7062, 400, 160000, 4.41375)


In [25]:
# One string per explanation (the str() of each element of exp2), so the
# explanations can be encoded with the text vectorizer.
a = np.array(list(map(str, exp2)))

In [26]:
# Encode the stringified explanations with the vocabulary that was fitted on
# the training reviews (transform only, the vectorizer is not refitted).
vec = vectorizer.transform(a)

In [28]:
# NOTE(review): both arguments are the same expression (vec[1].indices), so
# this comparison is always True and checks nothing. It looks like a leftover
# debugging cell; the intended second operand is unknown - fix or delete.
np.array_equal(vec[1].indices,vec[1].indices)

True

In [29]:
# Dense copy of the encoded explanations. toarray() returns a plain
# numpy.ndarray, whereas todense() returns numpy.matrix, a class NumPy no
# longer recommends and that recent scikit-learn estimators refuse as input.
# The shape of the result is unchanged.
v = vec.toarray()

In [30]:
# Sanity check: one row per explanation, one column per vocabulary term.
v.shape

(400, 2535)

In [31]:
# Metric computed from the dense explanation encodings and the labels of the
# same first 400 de-duplicated test rows that were explained (.iloc[:400]
# matches the positional [:400] slice used when the explanations were built).
# calc_stability comes from the star import of int_met; the meaning of the
# returned tuple is defined there and is not visible in this notebook.
sb = calc_stability(v, unique_df.sideEffects.iloc[:400].values)
print(sb)

  return_n_iter=True)


(122, 400)
