In [1]:
import pandas as pd
import re
import string
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from nltk.util import ngrams
from nltk.probability import FreqDist
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
import pickle
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.preprocessing.sequence import pad_sequences
import copy

In [2]:
# !pip install tf-nightly

In [3]:
tf.test.gpu_device_name()

'/device:GPU:0'

In [4]:
def word_count(s):
    return len(s.split())

In [5]:
def striphtml(text):
    p=re.compile('<.*?>')
    return p.sub('',text)

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
df = pd.read_csv("/content/drive/MyDrive/IMDB Dataset.csv")

In [8]:
df.shape

(50000, 2)

In [9]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [10]:
df['review'] = df['review'].apply(striphtml)

In [11]:
df = df.drop(df[df['review'].apply(word_count) < 100].index)

In [12]:
df.duplicated().sum()

358

In [13]:
df.drop_duplicates(inplace=True)

In [14]:
df.shape

(43279, 2)

In [15]:
midpoint = len(df) // 2
df1 = df.iloc[:midpoint]
df2 = df.iloc[midpoint:]

In [16]:
# Authenticate and create a GoogleDrive instance
gauth = GoogleAuth()
drive = GoogleDrive(gauth)

In [17]:
# cv = CountVectorizer(lowercase=True,ngram_range=(1, 3))

In [18]:
vectorizer = TextVectorization(
    max_tokens=None,
    standardize = 'lower_and_strip_punctuation',
    output_mode='int',
    output_sequence_length=8000,
    ngrams=(1, 3)
)

In [19]:
sampled_data = df1.sample(n=15000, random_state=14)
# n can be changed as required

In [20]:
encoder = LabelEncoder()
y=encoder.fit_transform(sampled_data.sentiment)

In [21]:
y.shape

(15000,)

In [22]:
X_train=sampled_data.review
y_train=y

In [23]:
X_train.shape

(15000,)

In [24]:
y_train.shape

(15000,)

In [25]:
with tf.device('/gpu:0'):
  vectorizer.adapt(X_train)
  X_train_bow = vectorizer(X_train)
  X_train_bow = pad_sequences(X_train_bow, maxlen=8000)
  # X_train_bow = cv.fit_transform(X_train['review']).toarray()

In [26]:
X_train_bow.shape

(15000, 8000)

In [27]:
X_train_bow

array([[    4,   961, 57465, ...,     0,     0,     0],
       [ 1253,    11,    25, ...,     0,     0,     0],
       [   11,   208,    10, ...,     0,     0,     0],
       ...,
       [   10,     7,   184, ...,     0,     0,     0],
       [   11,   221,   210, ...,     0,     0,     0],
       [   29,     4,  1398, ...,     0,     0,     0]], dtype=int32)

In [28]:
len(vectorizer.get_vocabulary())

2634886

In [29]:
with tf.device('/gpu:0'):
  oracle = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=8000, output_dim=300, input_length=8000),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(1, activation='sigmoid', kernel_regularizer=tf.keras.regularizers.l2(0.01))
  ])
  oracle.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  oracle.fit(X_train_bow, y_train, epochs=10, batch_size=32, validation_split=0.2)
  # oracle = LogisticRegression(penalty='l2', C=0.01, multi_class='ovr').fit(X_train_bow, y_train)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [30]:
# Pickle and save the model
with open('oracle.pkl', 'wb') as model_file:
    pickle.dump(oracle, model_file)

In [31]:
def answer(oracle,x):
  with tf.device('/gpu:0'):
    print(oracle(x),1 - max(oracle(x)))
    if 1 - max(oracle(x)) > 0.5:return -1
    return oracle.predict(x)

## **Anytime Active Learning**



###\ Static AAL

In [32]:
def static_k_unc(Xik,PretrainedClassifier):
  return 1-PretrainedClassifier.predict_proba(Xik)

In [33]:
def static_k_const(Xik,PretrainedClassifier):
  return 1

In [34]:
def generate_Uk(U,k):
  with tf.device('/gpu:0'):
    Uk = U.apply(lambda x: ' '.join(x.split()[:k]))
    vectorizer.adapt(Uk)
    Uk = vectorizer(Uk)
    Uk = pad_sequences(Uk, maxlen=8000)
    return Uk

In [35]:
sampled_data = df2.sample(n=1000, random_state=14)

In [36]:
y=encoder.fit_transform(sampled_data.sentiment)

In [37]:
X_train,X_test,y_train,y_test = train_test_split(sampled_data.iloc[:,0:1],y,test_size=0.5,random_state=1)

In [38]:
with tf.device('/gpu:0'):
  vectorizer.adapt(X_train)
  X_train_bow = vectorizer(X_train)
  X_train_bow = pad_sequences(X_train_bow, maxlen=8000)
  X_test_bow = vectorizer(X_train)
  X_test_bow = pad_sequences(X_train_bow, maxlen=8000)
  # X_train_bow = cv.fit_transform(X_train['review']).toarray()

In [39]:
def select_subinstance(U,classifier):
  with tf.device('/gpu:0'):
    vectorizer.adapt(U)
    U = vectorizer(U)
    U = pad_sequences(U, maxlen=8000)
  return np.argmin(classifier(U))

In [40]:
U = df2.sample(n=1000, random_state=14)['review']
U_=copy.deepcopy(U)
with tf.device('/gpu:0'):
  vectorizer.adapt(U)
  U = vectorizer(U)
  U = pad_sequences(U, maxlen=8000)
# U = cv.transform(X_test['review']).toarray()

In [41]:
# with tf.device('/gpu:0'):
#   U_=copy.deepcopy(U)
#   vectorizer.adapt(U_)
#   U_ = vectorizer(U_)
#   U_ = pad_sequences(U, maxlen=8000)

In [42]:
k = 25

In [43]:
Uk = generate_Uk(U_,k)

In [44]:
with tf.device('/gpu:0'):
  PretrainedClassifier = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=8000, output_dim=300, input_length=8000),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(1, activation='sigmoid', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
  ])
  PretrainedClassifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  PretrainedClassifier.fit(X_train_bow,y_train)
  # PretrainedClassifier = LogisticRegression(penalty='l2', C=0.01, multi_class='ovr').fit(X_train_bow, y_train)



In [45]:
L = X_train_bow

In [46]:
C={10:5.7,25:8.2,50:10.9,75:15.9,100:16.7}

In [47]:
Ck = 8.2

## Algorithm 1

In [48]:
with open('oracle.pkl', 'rb') as model_file:
    oracle = pickle.load(model_file)

In [49]:
L.shape

(500, 8000)

In [50]:
Uk.shape

(1000, 8000)

In [51]:
auc=[]
cost=[]

In [54]:
# Budget of 50 minutes
B = 1500
with tf.device('/gpu:0'):
  while B>0:
    index = select_subinstance(U_,PretrainedClassifier)
    Xik = Uk[index]
    Xi = U[index]
    U=np.concatenate((U[:index], U[index + 1:]))
    Uk=np.concatenate((Uk[:index], Uk[index + 1:]))
    U_=np.concatenate((U_[:index], U_[index + 1:]))
    a = answer(oracle,tf.expand_dims(Xik, axis=0))
    print(B,a)
    B = B - Ck
    if a!=-1:
      print("\n\nhoorah\n\n")
      L = np.concatenate((L, [Xi]))
      y_train = np.concatenate((y_train,[a]))
      PretrainedClassifier.fit(L, y_train)
      vectorizer.adapt(X_test)
      # X_test_bow = cv.transform(X_test['review']).toarray()
      y_test_pred = PretrainedClassifier(U)
      auc.append(roc_auc_score(y_test, y_test_pred))
      cost.append(3000-B)

InternalError: ignored

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(cost, auc)
plt.xlabel("Cost")
plt.ylabel("AUC")
plt.grid(True)
plt.show()