In [1]:
import os
import pickle
import re

import gensim
import numpy as np
import pandas as pd
from gensim import corpora

In [2]:
def clean_up(x):
    """Normalise a review string before tokenisation.

    Every character that is not an ASCII letter or a space (digits,
    punctuation, newlines, ...) is deleted, then the remainder is
    lower-cased. Deleted characters are not replaced by a space, so the
    text on either side of them is joined together.
    """
    letters_and_spaces = re.compile('[^A-Za-z ]+').sub('', x)
    return letters_and_spaces.lower()

In [3]:
# Load the raw training TSV and keep only its first 100,000 rows.
df = pd.read_csv("data/drug_review/drugsComTrain_raw.tsv", sep="\t").head(100000)
# Strip everything except letters/spaces and lower-case each review.
reviews = df["review"].apply(clean_up)
reviews

0        it has no side effect i take it in combination...
1        my son is halfway through his fourth week of i...
2        i used to take another oral contraceptive whic...
3        this is my first time using any form of birth ...
4        suboxone has completely turned my life around ...
5        nd day on mg started to work with rock hard er...
6        he pulled out but he cummed a bit in me i took...
7        abilify changed my life there is hope i was on...
8         i ve had  nothing but problems with the keppe...
9        i had been on the pill for many years when my ...
10       i have been on this medication almost two week...
11       i have taken antidepressants for years with so...
12       i had crohns with a resection  years ago and h...
13       have a little bit of a lingering cough from a ...
14       started nexplanon  months ago because i have a...
15       i have been taking saxenda since july   i had ...
16       this drug worked very well for me and cleared .

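#1. Build the vocabulary (gensim Dictionary) from the cleaned reviews
#2. Doc -> token ids via the token2id mapping (each distinct token is listed once per document)
#3. Doc -> token counts, aligned with the ids from step 2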

In [4]:

# Whitespace-tokenise every cleaned review and build the vocabulary from them.
texts = [review.split() for review in reviews]
dictionary = corpora.Dictionary(texts)
print(dictionary)

Dictionary(64687 unique tokens: ['and', 'bystolic', 'combination', 'effect', 'fish']...)


In [5]:
def prepare_df(df, col="review", vocab=None, cleaner=None):
    """Encode the text column of ``df`` as bag-of-words token ids and counts.

    The result is computed from the ``df`` argument itself (column ``col``),
    not from any notebook-global Series, so it does not depend on the order
    in which other cells were executed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw text in column ``col``. It is not modified.
    col : str, default "review"
        Name of the text column to encode.
    vocab : object exposing ``doc2bow(list_of_tokens)``, optional
        Vocabulary used for the encoding. Defaults to the notebook-global
        ``dictionary``.
    cleaner : callable taking and returning ``str``, optional
        Applied to every text before it is split on whitespace. Defaults to
        the notebook's ``clean_up``.

    Returns
    -------
    pandas.DataFrame
        New frame with the index of ``df`` and two columns, ``tokens`` and
        ``counts``. Each cell is a NumPy array of shape ``(1, k)`` holding,
        respectively, the first and second element of every pair returned by
        ``vocab.doc2bow`` for that row.
    """
    # Resolve the notebook-level defaults lazily so that callers (and tests)
    # can supply their own vocabulary / cleaning function.
    if vocab is None:
        vocab = dictionary
    if cleaner is None:
        cleaner = clean_up

    cleaned = df[col].apply(cleaner)
    bows = cleaned.apply(lambda text: vocab.doc2bow(text.split()))

    # The extra pair of brackets keeps the (1, k) row-vector shape that the
    # stored pickles have always used.
    tokens = bows.apply(lambda bow: np.array([[pair[0] for pair in bow]]))
    counts = bows.apply(lambda bow: np.array([[pair[1] for pair in bow]]))
    return pd.DataFrame({"tokens": tokens, "counts": counts})

In [6]:
# Replace the raw training frame with its bag-of-words encoding; from here on
# `df` refers to the frame returned by prepare_df, not to the raw TSV data.
df = prepare_df(df)
df.head()

Unnamed: 0,tokens,counts
0,"[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[[1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1]]"
1,"[[0, 6, 7, 8, 11, 15, 16, 17, 18, 19, 20, 21, ...","[[4, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
2,"[[0, 6, 7, 8, 10, 11, 13, 14, 30, 41, 43, 50, ...","[[4, 5, 1, 1, 1, 3, 3, 1, 1, 2, 1, 2, 2, 3, 1,..."
3,"[[6, 7, 8, 11, 18, 19, 22, 30, 41, 50, 59, 69,...","[[4, 1, 3, 2, 1, 1, 1, 3, 3, 2, 2, 5, 1, 1, 4,..."
4,"[[0, 5, 6, 7, 8, 11, 15, 17, 18, 19, 25, 41, 4...","[[5, 1, 7, 2, 3, 2, 1, 1, 1, 1, 1, 1, 1, 2, 3,..."


In [7]:
# Sequential (unshuffled) split: first 90% of the rows for training,
# the remaining 10% for validation.
train = df.iloc[:int(len(df) * 0.9)]
valid = df.iloc[int(len(df) * 0.9):]
print(train.shape)
print(valid.shape)

(90000, 2)
(10000, 2)


In [8]:

# Load the full raw test TSV; `reviews` and `texts` are rebound to the test
# split from here on.
test = pd.read_csv("data/drug_review/drugsComTest_raw.tsv", sep="\t")
reviews = test["review"].apply(clean_up)
texts = [review.split() for review in reviews]
# Extend the existing vocabulary with the tokenised test reviews.
dictionary.add_documents(texts)

In [9]:
# Encode the test reviews; `test` is rebound from the raw frame to the frame
# returned by prepare_df.
test = prepare_df(test)
print(test.shape)
test.head()

(53766, 2)


Unnamed: 0,tokens,counts
0,"[[0, 5, 6, 9, 10, 11, 13, 15, 33, 40, 50, 67, ...","[[2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1,..."
1,"[[0, 5, 10, 11, 13, 18, 22, 51, 54, 62, 69, 71...","[[2, 4, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2,..."
2,"[[11, 376, 1400, 4636]]","[[1, 1, 1, 1]]"
3,"[[0, 6, 8, 10, 11, 15, 22, 30, 40, 41, 44, 47,...","[[2, 10, 5, 1, 1, 5, 2, 2, 2, 2, 1, 1, 4, 3, 1..."
4,"[[0, 6, 8, 10, 11, 13, 15, 22, 39, 41, 45, 50,...","[[6, 10, 2, 1, 3, 1, 3, 4, 1, 2, 1, 4, 3, 6, 1..."


In [10]:
# Sequential three-way split of the encoded test frame: first 50%, next 25%,
# last 25%. A single tuple assignment is required because the right-hand side
# still reads the full `test` frame while the left-hand side rebinds it.
test, test_1, test_2 = (
    test.iloc[:int(len(test) * 0.5)],
    test.iloc[int(len(test) * 0.5):int(len(test) * 0.75)],
    test.iloc[int(len(test) * 0.75):],
)
print(test.shape)
print(test_1.shape)
print(test_2.shape)
# test_1_tokens = test['tokens_1']
# test_1_counts = test['counts_1']

(26883, 2)
(13441, 2)
(13442, 2)


## Saving The Files

In [11]:
# Directory the artefacts below are written to and read from. Keep the
# trailing slash: cells in this notebook append file names to it with plain
# string concatenation.
path = "data/drug_review/"

In [12]:
# CSV export of the splits is disabled; the splits are pickled with
# store_bow_files further down instead.
# train.to_csv(path+"train.csv",index=None)
# test.to_csv(path+"test.csv",index=None)
# valid.to_csv(path+"valid.csv",index=None)
# Save the vocabulary. At this point `dictionary` has been fed both the
# training and the test reviews.
# NOTE(review): the file is written by Dictionary.save, not pickle.dump;
# presumably it should be read back with the matching gensim loader; confirm.
dictionary.save(path+"vocab.pkl")

In [13]:
def store_bow_files(df, filename, out_dir=None):
    """Pickle the ``tokens`` and ``counts`` columns of ``df`` as a dict of lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tokens`` and a ``counts`` column.
    filename : str
        Base name of the output file; ``'.pkl'`` is appended.
    out_dir : str, optional
        Directory to write into. Defaults to the notebook-global ``path``.
        A trailing separator is not required.

    Returns
    -------
    None
        The file ``<out_dir>/<filename>.pkl`` is written. It contains
        ``{"tokens": [...], "counts": [...]}`` with one list entry per row.
    """
    # Fall back to the notebook-level output directory only when the caller
    # did not pass one explicitly.
    if out_dir is None:
        out_dir = path
    payload = {
        "tokens": df["tokens"].tolist(),
        "counts": df["counts"].tolist(),
    }
    with open(os.path.join(out_dir, filename + '.pkl'), 'wb') as handle:
        pickle.dump(payload, handle)


In [14]:
# Pickle every split, in the same order as before, under its own file name.
for split_frame, split_name in (
    (train, "train_dict"),
    (test, "test_dict"),
    (test_1, "test_1_dict"),
    (test_2, "test_2_dict"),
    (valid, "valid_dict"),
):
    store_bow_files(split_frame, split_name)

## Validating the Stored Files

In [15]:
# Read the training pickle back to check what was written.
# Note: this rebinds `train` from the DataFrame split to the loaded dict.
# pickle.load can execute arbitrary code; only load files this notebook wrote.
with open(path + 'train_dict.pkl', 'rb') as handle:
    train = pickle.load(handle)
train

{'tokens': [array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]]),
  array([[  0,   6,   7,   8,  11,  15,  16,  17,  18,  19,  20,  21,  22,
           23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,
           36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
           49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,
           62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,
           75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,
           88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100,
          101, 102, 103, 104, 105]]),
  array([[  0,   6,   7,   8,  10,  11,  13,  14,  30,  41,  43,  50,  59,
           69,  71,  76,  82,  84,  86,  88,  90,  93,  95,  99, 100, 102,
          103, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
          117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
          130, 131, 132, 133, 134, 135, 136, 137, 138,

In [16]:
# Read the test pickle back to check what was written.
# Note: this rebinds `test` from the DataFrame split to the loaded dict.
# pickle.load can execute arbitrary code; only load files this notebook wrote.
with open(path + 'test_dict.pkl', 'rb') as handle:
    test = pickle.load(handle)
test

{'tokens': [array([[   0,    5,    6,    9,   10,   11,   13,   15,   33,   40,   50,
            67,   69,   88,   91,   94,  105,  115,  123,  130,  155,  191,
           214,  215,  218,  237,  241,  245,  291,  298,  309,  317,  355,
           379,  422,  480,  568,  597,  610,  625,  643,  658,  713,  861,
          1326, 1660, 2225, 2487, 3336, 3338, 3483, 3659, 6455]]),
  array([[    0,     5,    10,    11,    13,    18,    22,    51,    54,
             62,    69,    71,    83,    88,    99,   105,   110,   120,
            123,   131,   185,   316,   441,   447,   450,   472,   473,
            547,   554,   591,   610,  1077,  1348,  1544,  3685,  4694,
           4915,  6043, 10894]]),
  array([[  11,  376, 1400, 4636]]),
  array([[    0,     6,     8,    10,    11,    15,    22,    30,    40,
             41,    44,    47,    50,    59,    67,    69,    71,    82,
             86,    88,    91,    93,    95,   104,   108,   110,   112,
            115,   134,   137,   143,

In [17]:
# Read the validation pickle back to check what was written.
# Note: this rebinds `valid` from the DataFrame split to the loaded dict.
# pickle.load can execute arbitrary code; only load files this notebook wrote.
with open(path + 'valid_dict.pkl', 'rb') as handle:
    valid = pickle.load(handle)
valid

{'tokens': [array([[    0,     6,     7,     8,    11,    15,    18,    31,    50,
             66,    69,    71,    88,    93,    97,   100,   103,   105,
            115,   124,   130,   185,   188,   209,   215,   262,   287,
            288,   315,   316,   324,   410,   435,   485,   488,   493,
            505,   529,   533,   538,   543,   565,   580,   606,   649,
            688,   758,   856,  1058,  1108,  1150,  1318,  1325,  1328,
           1588,  1605,  1659,  1713,  1812,  1840,  1878,  2108,  2707,
           2966,  3027,  3077,  3119,  3591,  3745,  5011,  5155,  5473,
           5806,  6383,  7373,  8039, 12928, 15125, 20067, 23288, 29966,
          40357, 41940, 48604, 48605, 48606, 48607]]),
  array([[    0,     6,     7,     8,    11,    13,    14,    15,    17,
             18,    33,    39,    44,    45,    47,    50,    59,    65,
             66,    68,    69,    71,    82,    84,    88,    93,   100,
            112,   115,   120,   123,   142,   143,   158, 