In [1]:
# Install nlpaug, which provides the `nlpaug.augmenter.word` module imported in the next cell.
!pip install nlpaug

In [2]:
import nlpaug.augmenter.word as naw  
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')


# Read the training split; the file is decoded as latin-1.
df_train = pd.read_csv(
    '../input/covid-19-nlp-text-classification/Corona_NLP_train.csv',
    encoding='latin_1',
)

# Missing values per column. Printed explicitly because only the last
# expression of a cell is rendered automatically.
print(df_train.isna().sum())

df_train.head()

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('darkgrid')

# Density-normalised histogram of the raw sentiment labels.
fig = plt.figure(figsize=(16, 8))
sns.histplot(
    df_train['Sentiment'],
    color='black',
    stat='density',
)
plt.show()



In [4]:
from IPython.display import display

# Overall class balance.
print(df_train.Sentiment.value_counts())

# Per-location sentiment counts. 'value_counts' keeps one entry per
# (Location, Sentiment) pair, whereas 'count' would only give one total per location.
ky = df_train.groupby('Location')['Sentiment'].agg('value_counts')

display(df_train.head())

# Missing values per column.
print(df_train.isna().sum())

# Stacked bar charts comparing per-sentiment tweet counts for two pairs of locations.
country = ['India','Australia','UK','USA']
title = ['India / Australia_Distrib.', 'UK / USA_Distrib.']

# Fixed sentiment order so that every bar height lines up with its tick label.
# value_counts() orders each location by frequency, which differs per location.
# NOTE(review): assumes the Sentiment column uses exactly these five labels — confirm against the CSV.
sentiment_order = ['Positive', 'Negative', 'Neutral', 'Extremely Positive', 'Extremely Negative']
tick_labels = ['P', 'N', 'Neu', 'Ext. P', 'Ext. N']
unexpected = set(ky.index.get_level_values(-1)) - set(sentiment_order)
assert not unexpected, f'unexpected sentiment labels: {unexpected}'

fig, axs = plt.subplots(2,1, figsize=(12,8))
for i in range(2):
    lower_name, upper_name = country[2 * i], country[2 * i + 1]
    # reindex() puts the counts in tick order and fills categories a location lacks with 0.
    lower = ky[lower_name].reindex(sentiment_order, fill_value=0).tolist()
    upper = ky[upper_name].reindex(sentiment_order, fill_value=0).tolist()

    axs[i].set_title(title[i])
    axs[i].bar(x = tick_labels, height = lower, label = lower_name)
    # Stack the second location on top of the first one's bars
    # (bottom=True would merely offset every bar by 1).
    axs[i].bar(x = tick_labels, height = upper, bottom = lower, label = upper_name)
    axs[i].legend()

plt.show()

# Forward-fill missing cells column-wise: only the null cells take the value from
# the previous row, not the whole row. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling.
df_train = df_train.ffill(axis=0)
# Rows that still contain nulls (nothing above them to copy from) are dropped.
df_train = df_train.dropna()
print(df_train['Sentiment'].value_counts())
display(df_train.head())
print(df_train.isnull().sum())

In [5]:
## Non-contextual word-embedding augmenter: substitutes words using the GloVe 200-d vectors
aug_w2v = naw.WordEmbsAug(
    model_type='glove',
    model_path='../input/glove6b/glove.6B.200d.txt',
    action="substitute",
)
# NOTE(review): aug_p is presumably the fraction of words nlpaug augments — confirm in the nlpaug docs.
aug_w2v.aug_p = 0.2

# Smoke test on a toy sentence: show the input, then its augmented form.
text = 'I am aLik'
print(text)
print(aug_w2v.augment(text))

In [6]:
# Do not use augmentation for cross-validation: split first, then augment.
import random
from pprint import pprint

# Sentiment frequencies, most frequent class first.
all_vals = df_train['Sentiment'].value_counts()
# Every class except the most frequent one is treated as a minority class.
all_sen = all_vals.index[1:].tolist()
# Size of the most frequent class.
target_val = int(all_vals.iloc[0])

def augment_text(min_df_train, n_samples = 30 , pr=0.2):
    """Create `n_samples` augmented copies of rows drawn from one sentiment class.

    Rows are sampled from `min_df_train` and the 'OriginalTweet' of every sampled
    row is replaced by its own augmented version, so the remaining columns of a
    row always belong to the tweet that was augmented.

    Parameters
    ----------
    min_df_train : pandas.DataFrame
        Rows of a single class; must contain an 'OriginalTweet' column.
    n_samples : int
        Number of augmented rows to produce. Sampling is done with replacement
        only when more rows are requested than `min_df_train` holds.
    pr : float
        Value assigned to the module-level `aug_w2v.aug_p` for the duration of
        the call; the previous value is restored afterwards.

    Returns
    -------
    (pandas.DataFrame, list)
        The augmented rows and the list of augmented texts in the same order.
    """
    need_replacement = n_samples > len(min_df_train)
    # .copy() so the column assignment below never touches the caller's frame.
    aug_data = min_df_train.sample(n = n_samples, replace = need_replacement).copy()

    previous_p = getattr(aug_w2v, 'aug_p', pr)
    aug_w2v.aug_p = pr
    text_check = []
    try:
        for original_text in aug_data['OriginalTweet']:
            aug_text = aug_w2v.augment(original_text)
            # Some nlpaug versions return a list even for a single input string.
            if isinstance(aug_text, (list, tuple)):
                aug_text = aug_text[0] if aug_text else original_text
            text_check.append(aug_text)
    finally:
        aug_w2v.aug_p = previous_p

    # One column assignment instead of chained `.iloc` writes, which may
    # operate on a temporary copy and silently leave the frame unchanged.
    aug_data['OriginalTweet'] = text_check
    return aug_data, text_check


# Oversample every minority class with augmented tweets and append them to the
# training frame. The pieces are collected first and concatenated once, so
# `df_train_new` is always defined, even when there is no minority class.
augmented_frames = [df_train]
total_rows = df_train.shape[0]

for min_sen in all_sen:

    print('Min_Sentiment_taken: ', min_sen)
    min_df_train = df_train[df_train.Sentiment == min_sen]

    # For complete balancing against the majority class, use
    # n_samples = target_val - int(all_vals[min_sen]) instead of the fixed 100.
    aug_data, text_check = augment_text(min_df_train, n_samples = 100)

    print('Before_Augmentation: ',df_train.shape)
    augmented_frames.append(aug_data)
    total_rows += aug_data.shape[0]

    print('After_Augmentation: ', (total_rows, df_train.shape[1]))
    print('Few_aug_samples: ')
    print()
    # Show at most two augmented samples, truncated to 70 characters.
    for sample_text in text_check[:2]:
        print(sample_text[:70], '...')
    print('**'*50)

df_train_new = pd.concat(augmented_frames, axis = 0)

In [7]:
# Shuffle all rows (frac=1 draws every row once) and renumber the index from 0.
# No random_state is passed, so the row order differs between runs.
df = df_train_new.sample(frac=1).reset_index(drop=True)

In [8]:
## Count distinct values per column to see which ones are unique per row

print('df_train_shape: ', df.shape)
for caption, column in (
    ('UserName_stochasticity: ', 'UserName'),
    ('ScreenName_stochasticity: ', 'ScreenName'),
    ('Location_stoch: ', 'Location'),
):
    # dropna=False counts a missing value as one distinct value, like unique() does.
    print(caption, df[column].nunique(dropna=False))

# The two identifier columns are removed before modelling.
df_train_new = df.drop(columns=['UserName', 'ScreenName'])
print(df_train_new.shape)

In [9]:
from sklearn.preprocessing import StandardScaler 

# Turn the tweet date into a numeric feature and standardise it.
# Stripping the dashes and scaling the digit string (e.g. '16032020') gives a
# number that is not ordered in time, and recent scikit-learn refuses string
# input altogether. Parse the dates and use elapsed days instead.
# NOTE(review): assumes day-first dates such as '16-03-2020' — confirm against the CSV.
tweet_dates = pd.to_datetime(df_train_new['TweetAt'], dayfirst=True)
tweet_days = (tweet_dates - tweet_dates.min()).dt.days.to_numpy(dtype='float64')

# StandardScaler expects a 2-D array, so scale a single column and flatten it back.
tweet_std = StandardScaler().fit_transform(tweet_days.reshape(-1, 1)).ravel()
df_train_new['TweetAt'] = tweet_std
df_train_new.head()


In [10]:
import string

# Normalise the free-text locations and build integer vocabularies for
# locations and sentiment labels.
rem = string.punctuation
print('rem_punc: ', rem)

loc_list = df_train_new['Location'].astype(str).tolist()
print(loc_list[:4])

# Lower-case, strip punctuation and surrounding whitespace per value.
# The earlier join/split round trip left a trailing space on every entry but
# the last, so the same place could end up under two different keys.
loc_all_list = (
    df_train_new['Location']
    .astype(str)
    .str.lower()
    .str.translate(str.maketrans("", "", rem))
    .str.strip()
    .tolist()
)
# sorted() makes the id assignment reproducible; plain set order varies between runs.
loc_list = sorted(set(loc_all_list))
print(loc_list[:4])
print('Loc_List_Length: ',len(loc_list))

inv_loc_dic = dict(enumerate(loc_list))
loc_dic = {val:key for key, val in inv_loc_dic.items()}

# Sorted as well, so label ids do not depend on the row order after shuffling.
inv_label_dic = dict(enumerate(sorted(df_train_new['Sentiment'].unique().tolist())))
label_dic = {val:key for key, val in inv_label_dic.items()}
print(label_dic)

df_train_new['Location'] = loc_all_list
df_train_new.head()

In [11]:
# Absolute sentiment counts after augmentation.
plt.figure(figsize=(16, 8))
sns.histplot(
    df_train_new['Sentiment'],
    kde=False,
    color='black',
    stat='count',
)
plt.show()

df_train_new.head()

In [12]:
# Replace location strings and sentiment labels by their integer ids.
# Series.map does this in one vectorised pass; the per-row chained `.iloc`
# assignment is slow and may write to a temporary copy instead of the frame.
location_ids = df_train_new['Location'].map(loc_dic)
sentiment_ids = df_train_new['Sentiment'].map(label_dic)

# Fail loudly on values missing from the vocabularies (for example when this
# cell is run a second time on already-encoded columns).
assert location_ids.notna().all(), 'Location contains values missing from loc_dic'
assert sentiment_ids.notna().all(), 'Sentiment contains values missing from label_dic'

df_train_new['Location'] = location_ids.astype('int64')
df_train_new['Sentiment'] = sentiment_ids.astype('int64')

df_train_new.head()


In [15]:

# Longest tweet measured in space-separated pieces; used as the padded sequence length.
maxx = max(len(tweet.split(' ')) for tweet in df_train_new.OriginalTweet.tolist())
max_seq_len = maxx
print('Max_seq_length: ',max_seq_len)


In [16]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Feature matrix: columns 0-1 are Location and TweetAt, column 2 is the tweet text.
data_x = df_train_new[['Location', 'TweetAt', 'OriginalTweet']].to_numpy()
data_y = df_train_new['Sentiment'].to_numpy()
print(data_x.shape, data_y.shape)

# NOTE(review): the split happens after augmentation, so augmented rows can land in the test set.
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, random_state=28)
print(x_train.shape, x_test.shape)
y_train = y_train.astype('int32')

# Separate the text column from the remaining columns for the two model inputs.
x_train_text, x_train_num = x_train[:, 2], np.delete(x_train, 2, axis=1)
x_test_text, x_test_num = x_test[:, 2], np.delete(x_test, 2, axis=1)

## A vocabulary can also be passed explicitly via `vocabulary=...` instead of calling adapt()
tweet_vect = tf.keras.layers.TextVectorization(
    output_mode='int',
    output_sequence_length=max_seq_len,
)
tweet_vect.adapt(x_train_text)

# index -> token and token -> index lookups built from the adapted vocabulary.
inv_tweet_dic = dict(enumerate(tweet_vect.get_vocabulary()))
tweet_dic = {token: index for index, token in inv_tweet_dic.items()}
print('Tweet_Dic: ',len(tweet_dic))

print(x_train_text.shape, x_train_num.shape, x_test_text.shape, x_test_num.shape)

In [17]:
from sklearn.utils import class_weight
from pprint import pprint

# Per-class 'balanced' weights. compute_class_weight is the function that takes
# (class_weight, classes, y); compute_sample_weight has a different signature
# and returns one weight per sample, not per class.
classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=classes, y=y_train
)
# Key by the actual class id rather than by position.
class_weights_dict = {int(label): float(weight) for label, weight in zip(classes, class_weights)}

pprint(class_weights_dict)

# Expand to one weight per training sample for `fit(sample_weight=...)`.
sample_weights = np.ones(shape = (y_train.shape[0],))
for label, weight in class_weights_dict.items():
    sample_weights[y_train == label] = weight
print(sample_weights.shape)

In [22]:
## Model_Creation
# Two-input model: a text branch (vectorise -> embed -> three stacked GRUs) and a
# numeric branch (normalise -> two Dense/BatchNorm/tanh stages), concatenated and
# fed to a softmax over the sentiment classes.

# The numeric inputs are cast to float32 and a Normalization layer is adapted on them.
x_train_num = x_train_num.astype('float32')
norm_layer = tf.keras.layers.Normalization(axis=1)
norm_layer.adapt(x_train_num)

inp_text = tf.keras.layers.Input(shape = (), name='text_input', dtype=tf.string) ## the vectorization layer consumes raw strings, hence dtype tf.string
vect_text = tweet_vect(inp_text)
# Embedding table sized by the number of entries in tweet_dic, 1024 dimensions per token.
embd_text = tf.keras.layers.Embedding(output_dim = 1024, input_dim = len(list(tweet_dic)))(vect_text)
inp_num = tf.keras.layers.Input(shape = (x_train_num.shape[-1],), name='Num_text')
norm_num = norm_layer(inp_num)
# Text branch: the first two GRUs return full sequences, the last one only its final output.
gru_1 = tf.keras.layers.GRU(748, return_sequences=True, recurrent_dropout=0.5, dropout=0.3)(embd_text)
gru_2 = tf.keras.layers.GRU(512, return_sequences=True, recurrent_dropout=0.5)(gru_1)
gru_3 = tf.keras.layers.GRU(512)(gru_2)
# Numeric branch, stage 1: Dense -> BatchNorm -> tanh -> Dropout(0.4).
num_1 = tf.keras.layers.Dense(512)(norm_num)
num_1 = tf.keras.layers.BatchNormalization()(num_1)
num_1 = tf.keras.layers.Activation('tanh')(num_1)
num_1 = tf.keras.layers.Dropout(0.4)(num_1)
# Numeric branch, stage 2: Dense -> BatchNorm -> tanh (no dropout).
num_2 =tf.keras.layers.Dense(512)(num_1)
num_2 = tf.keras.layers.BatchNormalization()(num_2)
num_2 = tf.keras.layers.Activation('tanh')(num_2)


# Merge both branches and classify; one output unit per entry in label_dic.
conc_layer = tf.keras.layers.Concatenate(axis=-1)([gru_3, num_2])
f_dense = tf.keras.layers.Dense(len(list(label_dic)), activation = 'softmax')(conc_layer)

naive_model = tf.keras.models.Model(inputs = [inp_text, inp_num], outputs = f_dense, name='naive_model')
# Sparse loss: the targets are integer class ids, not one-hot vectors.
naive_model.compile(loss = 'sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate = 1e-3), metrics=['accuracy'])

tf.keras.utils.plot_model(naive_model, show_shapes=True)

In [23]:
## callbacks

# Early stopping monitors 'val_loss' with a patience of 10 epochs.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience = 10)

def scheduler(epoch, learning_rate):
    """Return the learning rate to use for `epoch`.

    Epochs 0 through 4 keep the incoming `learning_rate`; every later epoch
    gets the incoming rate multiplied by 0.01.

    NOTE(review): if the callback passes in the rate of the previous epoch,
    the 0.01 factor compounds on every epoch after the fifth — confirm that
    this is intended.
    """
    last_constant_epoch = 4
    decay_factor = 0.01
    return learning_rate if epoch <= last_constant_epoch else learning_rate * decay_factor

# Callback that asks `scheduler` for the learning rate to use.
lrs = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [24]:
y_train = y_train.astype('float32')

# Train with per-sample weights; the last 20% of the training data is held out for validation.
his = naive_model.fit(
    x=[x_train_text, x_train_num],
    y=y_train,
    batch_size=32,
    epochs=1,
    validation_split=0.2,
    callbacks=[stop_early, lrs],
    sample_weight=sample_weights,
)

In [25]:
from sklearn.metrics import confusion_matrix

# Cast the held-out data to float32 and bundle both model inputs.
x_test_num = x_test_num.astype('float32')
y_test = y_test.astype('float32')
x_test = [x_test_text, x_test_num]
# naive_model.evaluate(x_test, y_test) would report loss and accuracy instead.

# Predicted class = index of the largest softmax output per sample.
ypred = naive_model.predict(x_test).argmax(axis=-1)
print()
print('Confuion_matrix: ')
print(confusion_matrix(y_test, ypred))

In [26]:
from pprint import pprint
import random

def predict(seed):
    """Print the tweet at position `seed` of the training split with its true and predicted label.

    `seed` indexes `x_train_text`, `x_train_num` and `y_train`; class ids are
    translated back to label names through `inv_label_dic`.
    """
    print('Prediction: ', end='\n\n')

    # Build a batch of one sample for each of the two model inputs.
    text_batch = np.array([x_train_text[seed]]).reshape(1, -1)
    num_batch = x_train_num[seed].reshape(1, -1)
    yhat = np.argmax(naive_model([text_batch, num_batch]), axis=-1)

    print('Input_Sentence: ', end='\n\n')
    pprint(x_train_text[seed])

    print('\nActual_Result: ')
    print(inv_label_dic.get(y_train[seed]))

    print('\nPredicted_Result: ')
    print(inv_label_dic.get(yhat[-1]))


# Show predictions for five randomly chosen training samples.
for _ in range(5):
    # randrange excludes the upper bound; randint(0, n) could return n,
    # which is one past the last valid index.
    seed = random.randrange(x_train_text.shape[0])
    predict(seed)
    print('**' *50)