In [None]:
!pip install spacy
!spacy download en_core_web_sm
!pip install tensorflow
!pip install nltk
!pip install transformers
!pip install keras-tcn
from IPython.display import clear_output
clear_output()

In [None]:
# --- Imports -----------------------------------------------------------------
import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from sklearn.utils import resample
from transformers import AutoTokenizer

# --- One-time setup ----------------------------------------------------------
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")

# Cased BERT tokenizer; used below to turn each utterance into token ids.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the labelled utterances from the Excel sheet. The preview below shows
# a `text` column (the utterance) and an `intent` column (its label).
df = pd.read_excel('train_data.xlsx')
# Preview the first rows as a quick sanity check of the loaded frame.
df.head()

Unnamed: 0,text,intent
0,Hi,greeting
1,How are you,greeting
2,Good morning,greeting
3,Good afternoon,greeting
4,Good evening,greeting


In [None]:
from keras.models import Model
from tcn import TCN
from keras.layers import Input, Embedding, SpatialDropout1D, Dense, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, Dropout
import keras

In [None]:
def tcn_model(kernel_size = 3, activation='relu', input_dim = None, 
                   output_dim=300, max_length = None, emb_matrix = None,
                   num_classes = None):
    """Build and compile a two-layer TCN classifier over token-id sequences.

    Architecture: Embedding -> SpatialDropout1D -> TCN(128) -> TCN(64) ->
    concatenated global average/max pooling -> Dense(16) -> Dropout ->
    softmax over the classes.

    Args:
        kernel_size: Convolution kernel size used by both TCN layers.
        activation: Activation used inside both TCN layers.
        input_dim: Vocabulary size (number of rows of the embedding table).
        output_dim: Embedding dimension.
        max_length: Length of every (padded) input sequence.
        emb_matrix: Optional pre-trained embedding matrix, expected to have
            shape ``(input_dim, output_dim)``. When given, it initialises the
            embedding layer, which is then kept frozen. When omitted, the
            embedding is randomly initialised and trained with the rest of
            the model.
        num_classes: Number of output classes. When omitted, it is derived
            from the notebook-level ``y_train`` as ``max(y_train) + 1`` so
            that existing calls keep working; passing it explicitly is
            preferred because it removes the dependency on global state.

    Returns:
        A compiled ``keras.Model`` using sparse categorical cross-entropy,
        the Adam optimizer and an accuracy metric.
    """
    if num_classes is None:
        # Backward-compatible fallback on the notebook global `y_train`
        # (integer-encoded labels); raises NameError if it is not defined yet.
        num_classes = int(max(y_train)) + 1

    use_pretrained = emb_matrix is not None

    inp = Input(shape=(max_length,))
    x = Embedding(input_dim=input_dim,
                  output_dim=output_dim,
                  input_length=max_length,
                  # Initialise from the pre-trained embedding matrix if one was supplied.
                  weights=[emb_matrix] if use_pretrained else None,
                  # Freeze only pre-trained vectors; a randomly initialised
                  # table has to be trainable to learn anything useful.
                  trainable=not use_pretrained)(inp)

    x = SpatialDropout1D(0.1)(x)

    # Two stacked TCN layers; return_sequences=True keeps the time axis so the
    # pooling layers below can reduce over it.
    x = TCN(128, kernel_size=kernel_size, dilations=[1, 2, 4], return_sequences=True,
            activation=activation, name='tcn1')(x)
    x = TCN(64, kernel_size=kernel_size, dilations=[1, 2, 4], return_sequences=True,
            activation=activation, name='tcn2')(x)

    # Summarise the sequence with both its mean and its max over time.
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)

    conc = concatenate([avg_pool, max_pool])
    conc = Dense(16, activation="relu")(conc)
    conc = Dropout(0.1)(conc)
    outp = Dense(num_classes, activation="softmax")(conc)

    model = Model(inputs=inp, outputs=outp)
    # Sparse loss: labels are integer class ids, not one-hot vectors.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [None]:
# Assign each distinct intent a consecutive integer id, in order of first
# appearance in the data. Labels are used verbatim (no whitespace stripping).
labels = df['intent'].unique()
encoded_label = {intent: class_id for class_id, intent in enumerate(labels)}
print(encoded_label)

{'greeting': 0, 'goodbye': 1, 'creator': 2, 'name': 3, 'hours': 4, 'number': 5, 'course': 6, 'fees': 7, 'location': 8, 'hostel': 9, 'infrastructure': 10, 'event': 11, 'document': 12, 'syllabus': 13, 'library': 14, 'canteen': 15, 'menu': 16, 'placement': 17, 'csehod': 18, 'principal': 19, 'admission': 20, 'facilities': 21, 'college intake': 22, 'uniform': 23, 'random': 24, 'swear': 25, 'salutation': 26, 'task': 27, 'ragging': 28, 'hod': 29, 'transport': 30, 'scholarship': 31, 'faculty': 32, 'student portal': 33, 'faculty portal': 34, 'sports': 35, 'fest': 36, 'other': 37, 'university affiliation': 38, 'medical ': 39, 'college type': 40, 'notice': 41}


In [None]:
# Reverse lookup: integer class id -> intent name.
decoded_label = {class_id: intent for intent, class_id in encoded_label.items()}
print(decoded_label)

{0: 'greeting', 1: 'goodbye', 2: 'creator', 3: 'name', 4: 'hours', 5: 'number', 6: 'course', 7: 'fees', 8: 'location', 9: 'hostel', 10: 'infrastructure', 11: 'event', 12: 'document', 13: 'syllabus', 14: 'library', 15: 'canteen', 16: 'menu', 17: 'placement', 18: 'csehod', 19: 'principal', 20: 'admission', 21: 'facilities', 22: 'college intake', 23: 'uniform', 24: 'random', 25: 'swear', 26: 'salutation', 27: 'task', 28: 'ragging', 29: 'hod', 30: 'transport', 31: 'scholarship', 32: 'faculty', 33: 'student portal', 34: 'faculty portal', 35: 'sports', 36: 'fest', 37: 'other', 38: 'university affiliation', 39: 'medical ', 40: 'college type', 41: 'notice'}


In [None]:
# Tokenize every utterance into a fixed-width row of 15 token ids: longer
# texts are truncated, shorter ones are padded, and no special tokens are
# added. Only the ids are kept (no attention mask).
train_encoding = tokenizer.batch_encode_plus(
    list(df['text']),
    add_special_tokens=False,
    padding='max_length',
    max_length=15,
    truncation=True,
    return_attention_mask=False,
)
X_train = np.array(train_encoding['input_ids'])

# Integer class id for every row, looked up through `encoded_label`.
y_train = np.array(df['intent'].map(encoded_label).tolist())

In [None]:
# Sanity check: one row of token ids and one label per training example.
X_train.shape, y_train.shape

((863, 15), (863,))

In [None]:
# The tokenizer's vocabulary size sets the number of embedding rows; the
# sequence length is the width of the encoded training matrix.
model = tcn_model(
    input_dim=tokenizer.vocab_size,
    max_length=X_train.shape[1],
)
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_6 (InputLayer)           [(None, 15)]         0           []                               
                                                                                                  
 embedding_5 (Embedding)        (None, 15, 300)      8698800     ['input_6[0][0]']                
                                                                                                  
 spatial_dropout1d_4 (SpatialDr  (None, 15, 300)     0           ['embedding_5[0][0]']            
 opout1D)                                                                                         
                                                                                                  
 tcn1 (TCN)                     (None, 15, 128)      400256      ['spatial_dropout1d_4[0][0]

In [None]:
# Train for 40 epochs with the default batch size. No validation data is
# passed, so the progress log reflects training metrics only.
# NOTE(review): no random seed is set anywhere above, so runs are not
# reproducible -- consider seeding before building the model.
model.fit(X_train, y_train, epochs = 40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f9c626eb580>

In [None]:
# NOTE(review): this reads the same 'train_data.xlsx' file that the model was
# trained on, so the "test" metrics computed from it measure fit on the
# training data, not generalisation. Point this at a held-out file (or split
# the data before training) -- TODO confirm the intended test source.
df_test = pd.read_excel('train_data.xlsx')
# Preview the first rows of the evaluation frame.
df_test.head()

Unnamed: 0,text,intent
0,Hi,greeting
1,How are you,greeting
2,Good morning,greeting
3,Good afternoon,greeting
4,Good evening,greeting


In [None]:
# Encode the evaluation utterances exactly like the training ones: fixed
# width of 15 token ids, truncated/padded, no special tokens, ids only.
test_encoding = tokenizer.batch_encode_plus(
    list(df_test['text']),
    add_special_tokens=False,
    padding='max_length',
    max_length=15,
    truncation=True,
    return_attention_mask=False,
)
X_test = np.array(test_encoding['input_ids'])

# Integer class ids, using the mapping built from the training labels.
y_test = np.array(df_test['intent'].map(encoded_label).tolist())

In [None]:
# Sanity check: one row of token ids and one label per evaluation example.
X_test.shape, y_test.shape

((863, 15), (863,))

In [None]:
# Returns [loss, accuracy], matching the loss and metric the model was
# compiled with.
# NOTE(review): X_test/y_test are built from 'train_data.xlsx', i.e. the data
# the model was fitted on, so these numbers are training-set scores.
model.evaluate(X_test, y_test)



[0.08947745710611343, 0.9849362969398499]

In [None]:
# Save the trained model to 'college_search_bar.h5', a path relative to the
# current working directory.
model.save('college_search_bar.h5')

In [None]:
# Load the model back from the same relative path the save cell wrote to,
# instead of a hard-coded absolute '/content/...' path that only exists when
# the working directory happens to be /content.
import tcn
import tensorflow as tf

# TCN comes from the keras-tcn package rather than Keras itself, so it is
# handed to the loader through `custom_objects`.
model1 = tf.keras.models.load_model(
    'college_search_bar.h5',
    custom_objects={'TCN': tcn.TCN},
)