#**Using LIME to Understand Text Tokens' Effects in a Classifier**

We are going to train a simple text classifier (using our data for the detection of fraudulent job postings). For any given posting's classification, we can then see which terms contributed most to the result.

# *Load the Fake Job Postings Data from GitHub*

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
from google.colab import files
import pandas as pd
import io
import numpy as np
# for keras
from tensorflow.keras import utils

In [None]:
# Fake job postings dataset (CSV hosted on GitHub).
DATA_URL = 'https://raw.githubusercontent.com/ChunxiaqiuY/BA865-Advanced-Analytics_Team-Great/main/fake_job_postings.csv'

# Read the CSV and shuffle the rows right away (frac=1 samples every row, in
# random order), because a simple validation split is used later on.
df = pd.read_csv(DATA_URL).sample(frac=1)

# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
count,17879.0,17879,17533,6332,2868,14571,17878,15184,10670,17879.0,17879.0,17879.0,14408,10829,9775,12977,11424,17879.0
unique,,11231,3105,1337,874,1709,14800,11968,6205,,,,5,7,13,131,37,
top,,English Teacher Abroad,"GB, LND, London",Sales,0-0,We help teachers get safe &amp; secure jobs ab...,"Play with kids, get paid for it Love travel? J...",University degree required. TEFL / TESOL / CEL...,See job description,,,,Full-time,Mid-Senior level,Bachelor's Degree,Information Technology and Services,Information Technology,
freq,,311,718,551,142,726,379,410,726,,,,11620,3809,5145,1734,1749,
mean,8941.0,,,,,,,,,0.042899,0.795291,0.49175,,,,,,0.048437
std,5161.367067,,,,,,,,,0.202636,0.4035,0.499946,,,,,,0.214693
min,2.0,,,,,,,,,0.0,0.0,0.0,,,,,,0.0
25%,4471.5,,,,,,,,,0.0,1.0,0.0,,,,,,0.0
50%,8941.0,,,,,,,,,0.0,1.0,0.0,,,,,,0.0
75%,13410.5,,,,,,,,,0.0,1.0,1.0,,,,,,0.0


In [None]:
def _parse_salary_range(value):
    """Parse a 'low-high' salary string into a (lower, upper) pair of ints.

    Returns None when `value` is not a string or does not start with two
    dash-separated integers (a missing-value placeholder, a single number,
    non-numeric text, ...). Both bounds are parsed before anything is
    returned, so a half-parsed range can never leak out.
    """
    if not isinstance(value, str):
        return None
    try:
        lower_text, upper_text = value.split("-")[:2]
        return int(lower_text), int(upper_text)
    except ValueError:
        # Raised by int() on non-numeric text and by the unpacking when the
        # string contains no "-".
        return None


# Missing free-text location / department become the literal 'unknown'.
df['location'] = df['location'].fillna('unknown')
df['department'] = df['department'].fillna('unknown')

# --- salary_range -> salary_lower / salary_upper ---------------------------
# The average of all parsed bounds is only used as the fill value for missing
# salary_range entries; rows filled this way end up with -1 / -1 bounds below.
salary_temp = df['salary_range'].dropna()
ranges = []
for raw_range in salary_temp:
    bounds = _parse_salary_range(raw_range)
    # An unparseable range contributes a single 0 to the average.
    ranges.extend(bounds if bounds is not None else (0,))
average_salary = np.average(ranges)
df['salary_range'] = df['salary_range'].fillna(average_salary)

# One (lower, upper) pair per row, with -1 / -1 for missing or malformed
# ranges. Building pairs keeps the two lists the same length as df, so no
# slicing to a hard-coded row count is needed.
parsed_bounds = []
for raw_range in df['salary_range']:
    bounds = _parse_salary_range(raw_range)
    parsed_bounds.append(bounds if bounds is not None else (-1, -1))
salary_lower = [lower for lower, _ in parsed_bounds]
salary_upper = [upper for _, upper in parsed_bounds]
df['salary_lower'] = salary_lower
df['salary_upper'] = salary_upper

# --- free-text columns: missing -> 'unknown' -------------------------------
for text_column in ['company_profile', 'description', 'requirements']:
    df[text_column] = df[text_column].fillna('unknown')

# --- ordinal-style integer codes -------------------------------------------
# The result is assigned back to the column (instead of replace(inplace=True)
# on a column selection) so the update always reaches df. astype(int) fails
# loudly if a category that is not listed here shows up.
employment_type_codes = {
    'Full-time': 0,
    'Part-time': 1,
    'Contract': 2,
    'Temporary': 3,
    'Other': 4,
    'unknown': 5,
}
df['employment_type'] = (
    df['employment_type'].fillna('unknown').map(employment_type_codes).astype(int)
)

required_experience_codes = {
    'Not Applicable': 0,
    'Mid-Senior level': 1,
    'Associate': 2,
    'Entry level': 3,
    'Executive': 4,
    'Director': 5,
    'Internship': 6,
    'unknown': 7,
}
df['required_experience'] = (
    df['required_experience'].fillna('unknown').map(required_experience_codes).astype(int)
)

df['benefits'] = df['benefits'].fillna('unknown')

# --- one-hot encoded categoricals ------------------------------------------
# NOTE(review): iloc[:, 17:] is positional. Assuming the CSV has the 18
# columns shown by df.describe(), each slice starts at salary_lower, so it
# holds salary_lower, salary_upper and then the dummy columns -- the salary
# bounds therefore appear several times in numeric_cate. Left unchanged to
# keep the array layout; selecting the dummy columns by name would be safer.
df['required_education'] = df['required_education'].fillna('unknown')
df_re = pd.get_dummies(df, columns=['required_education'])
df_re_dummy = df_re.iloc[:, 17:].to_numpy()

df['industry'] = df['industry'].fillna('unknown')
df_industry = pd.get_dummies(df, columns=['industry'])
df_industry_dummy = df_industry.iloc[:, 17:].to_numpy()

df['function'] = df['function'].fillna('unknown')
df_function = pd.get_dummies(df, columns=['function'])
df_function_dummy = df_function.iloc[:, 17:].to_numpy()

# One-hot versions of the two integer-coded columns.
required_experience_c = utils.to_categorical(df['required_experience'])
employment_type_c = utils.to_categorical(df['employment_type'])

# --- final tabular feature matrix ------------------------------------------
# Concatenate the one-hot blocks with the plain numeric / binary columns.
numeric = ['salary_lower', 'salary_upper', 'telecommuting', 'has_company_logo', 'has_questions']
numeric_cate = np.concatenate(
    (df_re_dummy, df_industry_dummy, df_function_dummy, df[numeric].to_numpy()),
    axis=1,
)

In [None]:
# Build the working frame `job` from df without modifying df itself:
# set_index moves job_id out of the columns and into the row index, and drop
# removes the raw salary_range column. Both calls return new frames.
job = df.set_index('job_id').drop(columns=['salary_range'])

In [None]:
# Preview the first three rows of the job frame.
job.head(3)

Unnamed: 0_level_0,title,location,department,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,salary_lower,salary_upper
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
16993,Senior Software Engineer (PHP),"US, CA, Santa Monica",TriNet Cloud,Do you want to be part of a services company w...,Senior Software Engineer / Direct Hire OR opt...,Experience with PHP or Ruby on Rails.Experienc...,"TriNet Cloud offers competitive salaries, comp...",0,1,1,0,1,Bachelor's Degree,Computer Software,Engineering,0,85000,150000
10955,DL1 1LA Warehouse Apprenticeship Available Und...,"GB, , Durham",unknown,Established on the principles that full time e...,This is fantastic opportunity for someone want...,Government funding is only available for 16-18...,Future Prospects,0,1,1,5,7,unknown,unknown,unknown,0,-1,-1
1975,Development Manager,"US, CA, Woodland Hills",Haven Hills,Envision Consulting conducts retained searches...,"HAVEN HILLS, INC. Haven Hills provides safety ...",Minimum of BA in marketing or related field or...,Full-time exempt salaried position ranging fro...,0,1,1,0,1,Bachelor's Degree,Nonprofit Organization Management,Marketing,0,52000,57000


In [None]:
# Draw one random permutation of row positions so that the text, the tabular
# features and the labels can all be reordered identically below.
shuffled_indices = np.random.permutation(job.shape[0])

# One document per posting: all free-text fields joined into a single string.
# NOTE(review): the fields are joined without a separator, so the last word of
# one field fuses with the first word of the next (e.g. "(PHP)US"). Adding a
# space would change the token counts, and the model cell hard-codes the
# sequence length, so the join is left as it is here.
text_info = (
    job['title']
    + job['location']
    + job['company_profile']
    + job['description']
    + job['requirements']
    + job['benefits']
).to_numpy()

# Binary target as a column vector: 1 = fraudulent, 0 = not fraudulent.
# reshape(-1, 1) infers the row count instead of hard-coding it.
label = np.where(job['fraudulent'] == 1, 1, 0).reshape(-1, 1)
print(label[:1800].shape)

print(text_info)
# Apply the same permutation to every array so rows stay aligned.
text_info = text_info[shuffled_indices]
numeric_cate = numeric_cate[shuffled_indices]
label = label[shuffled_indices]
print(text_info)

(1800, 1)
["Senior Software Engineer (PHP)US, CA, Santa MonicaDo you want to be part of a services company with both a proven record of success and planned growth? TriNet is a leading provider of a comprehensive human resources solution for small to medium-sized businesses. We enhance business productivity as a human resources partner, managing HR so clients can focus on operating and growing their core business. Our HR solution includes payroll processing, human capital consulting, employment law compliance and employee benefits, including health and retirement plans and workers’ compensation. Our services are delivered by our expert teams of HR professionals and enabled by our proprietary, cloud-based technology platform, which allows clients and employees to efficiently conduct HR transactions anytime and anywhere.TriNet partners with more than 8,800 clients and 230,000+ employees. We’ve been on the Inc. 500|5000, a list of the fastest-growing privately-held U.S. companies for six c

# *Define / Train Our Fake Job Posting Detector*

Just like SHAP, model input needs to be numeric (it can't work with strings). So, the input layer to our model needs to be numeric sequences. 

In [None]:
# Tokenizer / integer encoder for the raw posting text. The vocabulary is
# capped at 2000 entries and every document is turned into a sequence of
# integer token ids. No output_sequence_length is given, so the layer does not
# force a fixed length (presumably sequences are padded to the longest
# document in the call -- confirm against the Keras docs).
text_vectorization = keras.layers.TextVectorization(max_tokens=2000, output_mode="int")

# Learn the vocabulary from the full corpus ...
text_vectorization.adapt(text_info)

# ... then encode every document with it.
process_text = text_vectorization(text_info)

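Now we can build our model. Incidentally, this also demonstrates that a 1D convolution can be used effectively on text sequences. In the run recorded below, this topology reaches about 98% accuracy on the held-out rows; keep in mind that only about 5% of the postings are fraudulent, so always predicting "not fraudulent" would already score about 95%.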

In [None]:
def build_model(sequence_length=2078, vocab_size=2000, embedding_dim=8):
    """Build and compile the 1D-convolutional binary text classifier.

    Args:
        sequence_length: number of token ids per input row. The default is
            the length this notebook was written for; it must match the
            width of the vectorized text fed to fit()/predict().
        vocab_size: size of the token vocabulary (rows of the embedding
            table); should match the vectorizer's max_tokens.
        embedding_dim: size of each token embedding vector.

    Returns:
        A compiled keras.Model mapping (batch, sequence_length) integer
        sequences to a single sigmoid probability per row.
    """
    # shape must be a tuple; "(2078)" without the trailing comma is just an int.
    inputs = keras.Input(shape=(sequence_length,))

    # Token ids -> dense vectors; id 0 is treated as padding (mask_zero).
    # No input_length is passed: the sequence length is already fixed by the
    # Input layer, and a conflicting value here would only be misleading.
    x = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True)(inputs)

    # Convolution over windows of 10 consecutive tokens, then downsample by 2
    # and average over the remaining positions.
    x = layers.Conv1D(filters=40, kernel_size=10, activation="relu")(x)
    x = layers.MaxPool1D(pool_size=2, strides=2)(x)
    x = layers.GlobalAveragePooling1D()(x)

    # Small dense head with dropout on both sides.
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(10, activation="relu")(x)
    x = layers.Dropout(0.2)(x)
    output = layers.Dense(1, activation="sigmoid")(x)

    model = keras.Model(inputs=inputs, outputs=output)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy'])
    return model

model = build_model()

# Optional: keras.utils.plot_model(model, show_shapes=True) draws the topology.

# Keep only the weights from the epoch with the lowest validation loss.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath="DenseNet_for_LIME.keras",
    monitor="val_loss",
    save_best_only=True,
)
callbacks = [checkpoint]

# Fit on the first 14500 rows, with 20% of them held back by Keras for
# validation. batch_size=1 means one weight update per posting.
history = model.fit(
    process_text[:14500],
    label[:14500],
    epochs=9,
    batch_size=1,
    validation_split=0.2,
    callbacks=callbacks,
)

Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


Test performance...

In [None]:
# Load the model saved in DenseNet_for_LIME.keras and score it on the rows
# from position 14500 onward.
model = keras.models.load_model("DenseNet_for_LIME.keras")

held_out_text = process_text[14500:]
held_out_label = label[14500:]

# evaluate() returns [loss, accuracy] for a model compiled with one metric.
test_perf = model.evaluate(held_out_text, held_out_label)
print(f'Accuracy in the test set is {test_perf[1]*100:.2f}%.')

Accuracy in the test set is 98.34%.


#*Create Our LIME Explainer*

In [1]:
try:
  import lime
  from lime.lime_text import LimeTextExplainer
except ImportError as error:
  !pip install lime
  import lime
  from lime.lime_text import LimeTextExplainer

# Display names for the two classes, in the order of the probability columns
# returned by new_predict: index 0 is (1 - p), index 1 is p.
class_names=['truthful','deceptive']
# LIME text explainer that labels its output with the class names above.
explainer=LimeTextExplainer(class_names=class_names)

def new_predict(text):
    """Return two-class probabilities for a batch of raw text strings.

    This is the prediction function handed to the LIME explainer, which
    calls it with many perturbed copies of one document.

    Args:
        text: a sequence of raw strings.

    Returns:
        np.ndarray of shape (len(text), 2). Column 1 is the model's sigmoid
        output p, column 0 is 1 - p.
    """
    vectorized = text_vectorization(text)

    # Pad (at the end) to the sequence length the model was built with,
    # read from the model itself rather than hard-coded.
    sequence_length = model.input_shape[1]
    padded = keras.preprocessing.sequence.pad_sequences(
        vectorized, maxlen=sequence_length, padding='post'
    )

    # predict() yields one sigmoid value per row, shape (n, 1).
    positive = model.predict(padded)[:, 0]
    return np.column_stack((1 - positive, positive))

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l[K     |█▏                              | 10 kB 22.4 MB/s eta 0:00:01[K     |██▍                             | 20 kB 22.5 MB/s eta 0:00:01[K     |███▋                            | 30 kB 11.3 MB/s eta 0:00:01[K     |████▊                           | 40 kB 9.2 MB/s eta 0:00:01[K     |██████                          | 51 kB 4.7 MB/s eta 0:00:01[K     |███████▏                        | 61 kB 5.6 MB/s eta 0:00:01[K     |████████▎                       | 71 kB 5.7 MB/s eta 0:00:01[K     |█████████▌                      | 81 kB 6.0 MB/s eta 0:00:01[K     |██████████▊                     | 92 kB 6.7 MB/s eta 0:00:01[K     |███████████▉                    | 102 kB 5.4 MB/s eta 0:00:01[K     |█████████████                   | 112 kB 5.4 MB/s eta 0:00:01[K     |██████████████▎                 | 122 kB 5.4 MB/s eta 0:00:01[K     |███████████████▌                | 133 kB 5.4 MB/s eta 0:00:01[K     |████████

And now we can call our LIME explainer on any given job posting...

In [None]:
# Position of the observation to explain.
test_obs = 14600

# Report the true label first so it can be compared with the explanation.
ground_truth_message = (
    f'The ground truth label for this observation is "truthful."'
    if label[test_obs] == 0
    else f'The ground truth label for this observation is "deceptive."'
)
print(ground_truth_message)

# Ask LIME which tokens pushed the prediction for this document, then render
# the result (with the highlighted text) inline in the notebook.
explanation = explainer.explain_instance(text_info[test_obs], new_predict)
explanation.show_in_notebook(text=True)