### Topic Classification using BERT
https://github.com/sandeepsainath/BERT-Classifier-Rasa

`BertClassifier()` is a scikit-learn wrapper to fine-tune the BERT model for text and token-sequence tasks, based on the Hugging Face PyTorch port.
- Includes a configurable MLP (Multi-Layer Perceptron: a feedforward ANN that generates a set of outputs from a set of inputs and is trained with backpropagation) as the final classifier/regressor for text and text-pair tasks.
- Includes token sequence classifier for NER, PoS, and chunking tasks.
- Includes SciBERT and BioBERT pretrained models for scientific and biomedical domains.

## **Installing the Dependencies**

In [None]:
!pip install git+https://github.com/charles9n/bert-sklearn.git

In [None]:
pip install transformers

In [None]:
#Importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Measuring CPU time 
import time

#Plotly
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly as py
import plotly.graph_objs as go
import ipywidgets as widgets
from scipy import special
import plotly.express as px

py.offline.init_notebook_mode(connected = True)
import scipy.stats as stats

import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error
from bert_sklearn import BertClassifier
from bert_sklearn import load_model

import transformers
from transformers import TFBertForSequenceClassification, BertTokenizer

from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [None]:
#Report the versions of the main libraries used in this notebook
version_report = [
    "=======================Library Versions=================================",
    f'Numpy Version: {np.__version__}',
    f'Pandas Version: {pd.__version__}',
    f'Plotly Version: {py.__version__}',
    f'Sklearn Version: {sklearn.__version__}',
    f'Transformers Version: {transformers.__version__}',
]
print(*version_report, sep="\n")

Numpy Version: 1.19.5
Pandas Version: 1.1.5
Plotly Version: 4.4.1
Sklearn Version: 1.0.1
Transformers Version: 4.14.1


### Exploratory Data Analysis

In [None]:
# Load the raw data and expose the 'Segment' column under the name 'Topic'
df = pd.read_csv('dummydata.csv').rename(columns={'Segment': 'Topic'})
df.head(20)

In [None]:
# Histogram of the 'Country' column.
# NOTE(review): the title says "Language" while the plotted column is 'Country' — confirm which is intended.
fig = px.histogram(
    data_frame=df,
    x='Country',
    title="Distribution of Language",
).update_traces(marker_color='mediumpurple')
fig.show(renderer='colab')

In [None]:
def get_analysis_values(dataframe, columnname):
    """Print a quick exploratory summary of a dataframe and plot its label distribution.

    Parameters:
      dataframe: pandas DataFrame to summarise.
      columnname: name of the label column whose value counts are printed
        and shown as a histogram.

    Output: nothing is returned; the summary is printed and the histogram is
    shown with the 'colab' renderer.
    """
    print("============================================Exploratory Data Analysis=====================================================")
    print(f'Shape of the dataframe is {dataframe.shape}')
    print()
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly instead of being wrapped in print().
    dataframe.info()
    print()
    print('-----Data labels Distribution----')
    print(dataframe[columnname].value_counts())
    print()
    fig = px.histogram(dataframe, x=columnname, title="Distribution of Topics")
    fig.show(renderer='colab')
    # Describe the frame that was passed in, not whatever global `df` exists.
    print(dataframe.describe())
    

In [None]:
# Summarise the loaded dataset, using 'Topic' as the label column
get_analysis_values(df,'Topic')

In [None]:
import statistics

# Average number of rows per topic
rows_per_topic = df['Topic'].value_counts().tolist()
print(statistics.mean(rows_per_topic))

2786.5


In [None]:
def minimum_label_count(dataframe, columnname, min_counts):
    """Remove labels with too few observations and plot the remaining distribution.

    Parameters:
      dataframe: pandas DataFrame holding the labelled observations.
      columnname: name of the label column.
      min_counts: a label is kept only if it has strictly more than
        ``min_counts`` rows; labels with ``min_counts`` rows or fewer are removed.

    Output: a new DataFrame containing only the rows of the kept labels.
    When no label exceeds ``min_counts`` nothing is removed (filtering would
    leave an empty frame) and a copy of the input is returned with a message.
    """
    label_count = dataframe[columnname].value_counts()
    filtered_topics = label_count[label_count <= min_counts].index
    kept_topics = label_count[label_count > min_counts].index

    if len(filtered_topics) == 0:
        print('Enough observations for classification :)')
        df = dataframe.copy()
    elif len(kept_topics) == 0:
        print(f'No topic has more than {min_counts} observations; the dataframe is left unfiltered')
        df = dataframe.copy()
    else:
        print(f'The following topics do not meet the observations threshold {min_counts} and will be removed {list(filtered_topics)}')
        # copy() so that later column assignments on the result do not act on a slice
        df = dataframe[~dataframe[columnname].isin(filtered_topics)].copy()

    print(f'New Shape of the Dataframe {df.shape}')
    fig = px.histogram(df, x=columnname, title="Distribution of Topics After Minimum Values")
    fig.update_traces(marker_color='mediumpurple')
    fig.show(renderer='colab')

    return df


In [None]:
# Keep only topics with more than 2786 rows (presumably the mean per-topic count of 2786.5 printed above, rounded down)
df = minimum_label_count(df,'Topic',2786)

In [None]:
#Encode the topic names as integer class ids; the fitted encoder is reused later to map ids back to names
label_encoder = preprocessing.LabelEncoder().fit(df['Topic'])
df['Topic'] = label_encoder.transform(df['Topic'])
#df = df.groupby('Topic').head(400).reset_index(drop=True)
df.head()

In [None]:
#Sorts a sequence in ascending order, in place (intended for ordering labels of the confusion matrix on a multi-lingual dataset).
def selection_sort(x):
    """Sort ``x`` in place in ascending order with selection sort and return it."""
    position = 0
    total = len(x)
    while position < total:
        # Index of the smallest element in the not-yet-sorted tail
        smallest = position + np.argmin(x[position:])
        x[position], x[smallest] = x[smallest], x[position]
        position += 1
    return x

In [None]:
seed = 42

#Features are the keyword strings, targets are the encoded topic ids
X = df['Keyword'].to_numpy(copy=True)
y = df['Topic'].to_numpy(copy=True)

#Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)
print(f'Train Dataset Shape {X_train.shape}\n Test Dataset Shape {X_test.shape}')

In [None]:
# Tabular views of both splits; the test labels are shown as topic names rather than encoded ids
split_train = pd.DataFrame({'X_train': X_train, 'y_train': y_train})
split_test = pd.DataFrame(
    {'X_test': X_test, 'y_test': label_encoder.inverse_transform(y_test)}
)
split_test.head(10)

In [None]:
# Topic distribution of the held-out test split
fig = px.histogram(
    data_frame=split_test,
    x='y_test',
    title="Distribution of Topics in Test Set",
)
fig.show(renderer='colab')

In [None]:
# time.process_time() counts CPU time of this process, so the value printed
# at the end is CPU seconds, not wall-clock time.
start = time.process_time()
# Create the classifier and set its hyper-parameters as attributes before fitting
model = BertClassifier()
model.epochs = 1
model.validation_fraction = 0.05
model.learning_rate = 2e-5
model.max_seq_length = 128

# Show the estimator configuration that will be used for training
print(f'\n {model}\n')

#Fit the model on the training split (history holds whatever fit() returns)
history = model.fit(X_train,y_train)

#Make predictions on the held-out test split
y_pred = model.predict(X_test)
print()
# CPU seconds spent on fitting plus test-set prediction
print(time.process_time() - start)

In [None]:
#Predictions on the train set, so train and test accuracy can be compared
y_pred_train = model.predict(X_train)

Predicting:   0%|          | 0/559 [00:00<?, ?it/s]

In [None]:
import datetime
# Format a hard-coded number of seconds as H:MM:SS (presumably pasted from a timing output — confirm)
str(datetime.timedelta(seconds=1167.469936122))

In [None]:


#Model's Performance dataset
# One entry per experiment run; every column list must have the same length,
# otherwise pd.DataFrame raises "All arrays must be of the same length".
language = ['Multi-lingual', 'Multi-lingual']
# Repeat the dataset name so that it lines up with the other columns
dataset = ['name of the dataset'] * len(language)
# Named run_type (not `type`) so the builtin type() is not shadowed in later cells
run_type = ['full', 'avg topics']
epochs = [1, 1]
sample = [0.80, 0.80]
train_accuracy = [98.4, 99.68]
test_accuracy = [97.7, 99.59]
loss_val = [1.21, 0.024]
time_taken = ['0:30:18', '0:19:27']

re_df = pd.DataFrame({
    'Dataset_Name': dataset,
    'language': language,
    'type': run_type,
    'epochs': epochs,
    'split': sample,
    'train_acc': train_accuracy,
    'test_acc': test_accuracy,
    'loss_val': loss_val,
    'time_taken': time_taken,
})
re_df

In [None]:
# NOTE(review): the labels are encoded class ids here, so the MSE measures the
# distance between label codes rather than a classification error — confirm it is wanted.
mse_value = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error {mse_value}')
train_accuracy_pct = accuracy_score(y_train, y_pred_train) * 100
print(f'Train Set Accuracy Score: {train_accuracy_pct}')
test_accuracy_pct = accuracy_score(y_pred, y_test) * 100
print(f'Test Set Accuracy Score {test_accuracy_pct}')
print("=====================================================================")
print(f'Classification Report')
test_report = classification_report(y_test, y_pred)
print(test_report)

In [None]:
def plot_confusion_matrix(model, true, predicted, xfig, yfig, title):
    """Draw the confusion matrix of ``true`` vs ``predicted`` as an annotated heatmap.

    Parameters:
      model: unused; kept so existing calls keep working.
      true: encoded ground-truth labels.
      predicted: encoded predicted labels.
      xfig, yfig: figure width and height passed to ``figsize``.
      title: title of the plot.

    Output: nothing is returned; the figure is shown.
    """
    # Fix the label order explicitly so matrix rows/columns and tick names agree.
    labels = np.unique(np.concatenate([np.asarray(true), np.asarray(predicted)]))
    # confusion_matrix(y_true, y_pred): rows are true labels, columns are
    # predictions, which matches the axis titles set below.
    matrix = confusion_matrix(true, predicted, labels=labels)
    topic = label_encoder.inverse_transform(labels)

    fig, ax = plt.subplots(figsize=(xfig, yfig))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap="PiYG",
        xticklabels=topic,
        yticklabels=topic,
        ax=ax,
    )
    ax.set_ylabel('True Values')
    ax.set_xlabel('Predicted Values')
    ax.set_title(title)
    plt.xticks(rotation=90, ha='right')
    plt.yticks(rotation=360, ha='right')
    plt.show()

In [None]:
# Confusion matrix for the held-out test split
plot_confusion_matrix(model,y_test,y_pred,15,10,"Confusion Matrix of Test Set");

In [None]:
# Confusion matrix for the training split
plot_confusion_matrix(model,y_train,y_pred_train,15,8, "Confusion Matrix on Train Set");

In [None]:
#Predicting new topics with keywords as the input
def predict_new_topics(keywords):
  """Predict a topic for every keyword and return the pairs as a dataframe.

  Parameters:
  Input  (keywords): list of keyword strings to classify
  Output: dataframe with one row per keyword and the columns
          'keyword' and 'pred_topic' (the decoded topic name)
  """
  predictions = model.predict(keywords)
  # Map the encoded class ids back to the original topic names
  topic = label_encoder.inverse_transform(predictions)

  # Built once from the full columns, so the result is defined even when no keywords are given
  return pd.DataFrame({'keyword': keywords, 'pred_topic': topic})


In [None]:
# Replace the placeholder with the keywords to classify
keywords = ["enter your keywords you want to be classified into"]
predict_new_topics(keywords)

In [None]:
 #Test Set Results

#Inverse Labelling of the test labels
# NOTE: y_pred and y_test are overwritten with topic names here, so running
# this cell a second time would pass names (not encoded ids) to inverse_transform.
y_pred = label_encoder.inverse_transform(y_pred)
y_test = label_encoder.inverse_transform(y_test)


# One row per test keyword with its predicted and true topic name
testdf = pd.DataFrame({'Keywords': X_test,'predicted_topics':y_pred,'topics': y_test})
testdf.head(10)

In [None]:
#Map the encoded train labels and predictions back to topic names
y_pred_train = label_encoder.inverse_transform(y_pred_train)
y_train = label_encoder.inverse_transform(y_train)

#One row per training keyword with its predicted and true topic name
train_columns = {
    'Keywords': X_train,
    'predicted_topics': y_pred_train,
    'topics': y_train,
}
traindf = pd.DataFrame(train_columns)
traindf.head(10)

In [None]:
#Stack the train and test result frames row-wise (row labels of both frames are kept as they are)
result_df = pd.concat([traindf, testdf])
result_df.head(20)

In [None]:
#Flag the rows whose predicted topic differs from the true topic and keep only those
result_df['condition'] = result_df['predicted_topics'].ne(result_df['topics'])
result_df_cond = result_df.loc[result_df['condition']]
result_df_cond.head(20)

In [None]:
# (rows, columns) of the misclassified subset; the row count is the number of wrong predictions
result_df_cond.shape

In [None]:
def return_top_n_pred_prob_df(n, model, docs, column_name):
  '''
  Function to return the n most probable classes for every keyword together with their predicted probabilities
  Parameters:
    Input:
      a) n = number of top classes to report per keyword
      b) model = fitted model exposing predict_proba(docs)
      c) docs = the keywords to predict on (list, array or pandas Series)
      d) column_name = prefix used to name the result columns

    Output: A dataframe with the column 'keywords' and, for every rank i in
      0..n-1, the columns '<column_name>_<i>' (class index, i.e. the column
      position in the predict_proba output) and '<column_name>_<i>_prob'
      (its predicted probability). Rank 0 is the most probable class.
      An IndexError is raised when n exceeds the number of classes.
  '''
  predictions = np.asarray(model.predict_proba(docs))
  # Class indices of every row ordered from most to least probable
  preds_idx = np.argsort(-predictions, axis=1)
  row_ids = np.arange(predictions.shape[0])

  # When docs is a Series its index is kept as the index of the result
  top_n_preds = pd.DataFrame({'keywords': docs})

  for i in range(n):
    rank_idx = preds_idx[:, i]
    top_n_preds[f'{column_name}_{i}'] = rank_idx
    top_n_preds[f'{column_name}_{i}_prob'] = predictions[row_ids, rank_idx]
  return top_n_preds

In [None]:
topn_topics = return_top_n_pred_prob_df(2, model, result_df_cond['Keywords'], 'topics')

#The ranked class columns hold encoded ids; convert them back to topic names
for rank_column in ('topics_0', 'topics_1'):
    topn_topics[rank_column] = label_encoder.inverse_transform(topn_topics[rank_column].tolist())

#No index merge with df here: the row labels of topn_topics come from the
#train/test result frames, not from df, so joining on the index would pair
#unrelated rows and drop others. To add language columns, join on the keyword text.
topn_topics = topn_topics[['keywords', 'topics_0','topics_0_prob','topics_1','topics_1_prob']]

topn_topics.head(20)

In [None]:
# Write the top-n table to CSV without the index column (the path presumably requires Google Drive mounted at /content/drive — confirm)
topn_topics.to_csv("/content/drive/MyDrive/topn_all_topics.csv", index=False)

In [None]:
#save model to disk
#savefile = '/data/test.bin'
#model.save(savefile)

# load model from disk
#new_model = load_model(savefile)

# predict with new model
#accy = new_model.score(X_test, y_test)
#Loading model from /data/test.bin...

--------------

The topics in our dataset have keyword counts ranging from 4 to 30K, so it is interesting to see how the number of keywords per topic affects the classification of topics.

We use the acronym `M(T,K)`, where T is the number of unique topics and K is the number of keywords associated with each topic.

*Use Cases:* 
1. *M(10,100)*
2. *M(10,218)*
3. *M(8,500)*
4. *M(4,1000)*
 

In [None]:
#Results of the different use cases: every M(T,K) setting was run with 1 and 4 epochs, the full dataset ('All') with 1 epoch
use_case = [
    case
    for case in ('M(10,100)', 'M(10,218)', 'M(8,500)', 'M(4,1000)')
    for _ in range(2)
] + ['All']
epochs = [1, 4] * 4 + [1]
lr = ['2e-5'] * len(use_case)
f1_score = [0.14, 0.87, 0.63, 0.90, 0.87, 0.97, 0.99, 0.99, 0.98]
loss_value = [12.26, 2.54, 5.73, 1.67, 1.43, 0.28, 0.021, 0.001, 0.40]

perf_columns = {
    'use_case': use_case,
    'epochs': epochs,
    'lr': lr,
    'f1_score': f1_score,
    'loss_value': loss_value,
}
model_perf_df = pd.DataFrame(perf_columns)
model_perf_df.head(10)

In [None]:
# f1-score per use case, one line per epoch setting
fig = px.line(
    data_frame=model_perf_df,
    x="use_case",
    y="f1_score",
    color='epochs',
    title="f1-score Throughout Different Use Cases",
    width=800,
    height=400,
)
fig.show(renderer='colab')

In [None]:
# Loss value per use case, one line per epoch setting
fig = px.line(
    data_frame=model_perf_df,
    x="use_case",
    y="loss_value",
    color='epochs',
    title="Loss Value Throughout Different Use Cases",
    width=800,
    height=400,
)
fig.show(renderer='colab')

In [None]:
# `py` is the top-level plotly package; the function that writes a figure to a
# standalone HTML file lives in its offline submodule.
py.offline.plot(fig, filename = 'Loss Value of Use Cases', auto_open = True)