# Load Data

In [1]:
import pandas as pd

data_path = 'data/data.csv'
all_data = pd.read_csv(data_path, sep=',')
pd.set_option('max_rows', 30)

In [2]:
class_dict = {1:'Company',
              2:'EducationalInstitution',
              3:'Artist',
              4:'Athlete',
              5:'OfficeHolder',
              6:'MeanOfTransportation',
              7:'Building',
              8:'NaturalPlace',
              9:'Village',
              10:'Animal',
              11:'Plant',
              12:'Album',
              13:'Film',
              14:'WrittenWork'}

In [3]:
all_data['Class_name'] = all_data['Class'].map(class_dict)
all_data

Unnamed: 0,Class,Name,Class_name
0,1,E. D. Abbott Ltd,Company
1,1,Schwan-Stabilo,Company
2,1,Q-workshop,Company
3,1,Marvell Software Solutions Israel,Company
4,1,Bergan Mercy Medical Center,Company
5,1,The Unsigned Guide,Company
6,1,Rest of the world,Company
7,1,Globoforce,Company
8,1,Rompetrol,Company
9,1,Wave Accounting,Company


# Remove Annotations (anything between parenthesis)

In [4]:
import re
all_data['Name'] = [re.sub(r'\([^)]*\)', '', x) for x in all_data['Name']]
all_data

Unnamed: 0,Class,Name,Class_name
0,1,E. D. Abbott Ltd,Company
1,1,Schwan-Stabilo,Company
2,1,Q-workshop,Company
3,1,Marvell Software Solutions Israel,Company
4,1,Bergan Mercy Medical Center,Company
5,1,The Unsigned Guide,Company
6,1,Rest of the world,Company
7,1,Globoforce,Company
8,1,Rompetrol,Company
9,1,Wave Accounting,Company


# Load USE

In [5]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import tf_sentencepiece

tf.logging.set_verbosity(tf.logging.ERROR)
module_url = 'module_Multi_Large/'

g = tf.Graph()
with g.as_default():
    similarity_input_placeholder = tf.placeholder(tf.string, shape=(None))
    embed = hub.Module(module_url)
    encoding_tensor = embed(similarity_input_placeholder)
    init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
g.finalize()

sess = tf.Session(graph=g)
sess.run(init_op)

# Encode

In [6]:
from tqdm import tqdm

def embed_df(df, col):
    message_embedding = []
    size_batch = 200
    data_size = len(df[col])
    num_batches = int(data_size/size_batch)+1
    for x in tqdm(range(num_batches)):
        samples = list(df[col][(x*size_batch):((x+1)*size_batch)])
        if type(samples) is pd.core.series.Series:
            message_embedding.extend(sess.run(encoding_tensor, feed_dict={similarity_input_placeholder: [samples]}))
        else:
            message_embedding.extend(sess.run(encoding_tensor, feed_dict={similarity_input_placeholder: samples}))

    df[col+'_emb'] = message_embedding
    
    return df

In [7]:
import pickle

load = True
file_path = 'complete_data.pkl'

if load == True:
    with open(file_path, 'rb') as fl:
        all_data = pickle.load(fl)
else:
    columns = ['Name']
    for col in columns:
        all_data = embed_df(all_data, col)
    
    with open(file_path, 'wb') as fs:
        pickle.dump(all_data , fs)

# Check Data Distribution

In [8]:
all_data['Class'].value_counts().sort_values(ascending=False)

1     40000
2     40000
3     40000
4     40000
5     40000
6     40000
7     40000
8     40000
9     40000
10    40000
11    40000
12    40000
13    40000
14    28787
Name: Class, dtype: int64

# Split into Train, Validate and Test (Stratified)

In [9]:
from torch.utils.data import Dataset
import torch
import torch.utils.data as data

class my_dataset(Dataset):
    def __init__(self, data):
        input_ = [torch.tensor(x).float() for x in data['Name_emb'].values]
        label_ = [torch.tensor(x).long() for x in data['Class'].values]
        self.samples = list(zip(input_, label_))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

In [10]:
from sklearn.model_selection import train_test_split

X = all_data.drop('Class', axis=1)
y = all_data['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, stratify=y_test, test_size=0.5)

train = pd.concat([X_train, y_train.reindex(X_train.index)], axis=1)
validate = pd.concat([X_val, y_val.reindex(X_val.index)], axis=1)
test = pd.concat([X_test, y_test.reindex(X_test.index)], axis=1)

trainset = my_dataset(train)
validateset = my_dataset(validate)
testset = my_dataset(test)
print('Dataset Length: \n -train (90%%):\t\t %d \n -validate (10%%):\t %d \n -test (10%%):\t\t %d' % (len(trainset), len(validateset), len(testset)))

Dataset Length: 
 -train (90%):		 439029 
 -validate (10%):	 54879 
 -test (10%):		 54879


In [17]:
batch_size = 5000

trainloader = data.DataLoader(trainset, batch_size=batch_size, num_workers=0, shuffle=True)
validateloader = data.DataLoader(validateset, batch_size=batch_size, num_workers=0, shuffle=True)
testloader = data.DataLoader(testset, batch_size=batch_size, num_workers=0, shuffle=True)

# Define Network

In [25]:
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class Net(nn.Module):
    def __init__(self, EMBED_DIM, CLASS_DIM):
        super(Net, self).__init__()
        self.number_neurons = 500
        self.fc1 = nn.Linear(EMBED_DIM, self.number_neurons)
        
        self.fc2 = nn.Sequential(nn.BatchNorm1d(self.number_neurons),
                                 nn.Linear(self.number_neurons, self.number_neurons),
                                 nn.Dropout(0.3, inplace=True),
                                 
                                 nn.BatchNorm1d(self.number_neurons),
                                 nn.Linear(self.number_neurons, self.number_neurons),
                                 nn.Dropout(0.3, inplace=True)
                                )
        
        self.fc3 = nn.Sequential(
                                 nn.BatchNorm1d(self.number_neurons),
                                 nn.Linear(self.number_neurons, CLASS_DIM)
                                )
        
    def forward(self, x):
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))
        x = F.softmax(self.fc3(x), dim=1)
        return (x)
    
EMBED_DIM, CLASS_DIM = trainloader.dataset[0][0].shape[0], len(class_dict)+1

net = Net(EMBED_DIM, CLASS_DIM)
net.to(device)

Net(
  (fc1): Linear(in_features=512, out_features=500, bias=True)
  (fc2): Sequential(
    (0): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Linear(in_features=500, out_features=500, bias=True)
    (2): Dropout(p=0.3, inplace=True)
    (3): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Linear(in_features=500, out_features=500, bias=True)
    (5): Dropout(p=0.3, inplace=True)
  )
  (fc3): Sequential(
    (0): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Linear(in_features=500, out_features=15, bias=True)
  )
)

# Define Evaluate Function

In [26]:
def evaluate(net, data_loader):

    loss = []
    accuracy = []

    y_true = []
    y_pred = []
    
    for input_labels in data_loader:
        inputs, labels = input_labels[0].to(device), input_labels[1].to(device)

        outputs = net(inputs)

        loss.append(criterion(outputs, labels).data.cpu().numpy())
        outputs = outputs.to('cpu').detach().numpy() 
        labels = labels.to('cpu').detach().numpy() 

        predict = [np.argmax(x) for x in outputs]
        labels = np.array([int(x) for x in labels])

        accuracy.append(list(predict==labels).count(True)/len(predict))

        y_true.extend(labels)
        y_pred.extend(predict)
        

    print('Acc: %.2f'% np.mean(accuracy))
    print('Loss: %.2f'% np.mean(loss))
    return y_true, y_pred

In [27]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.5, momentum=0.5)

In [28]:
display_n_loss = 3
display_n_loss = int(len(trainset)/(display_n_loss * batch_size))

EPOCHS = 100
display_n_epoch = 10

for epoch in range(EPOCHS):  # loop over the dataset multiple times

    running_loss = 0.0
    net.train()
    print('--------------- Epoch', epoch)
    for i, input_labels in enumerate(trainloader, 0):
        inputs, labels = input_labels[0].to(device), input_labels[1].to(device)
    
        # zero the parameter gradients
        optimizer.zero_grad()
        
        # forward + backward + optimize
        outputs = net(inputs)
        
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        
        if i % display_n_loss == display_n_loss-1:    # print every N mini-batches
            print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / display_n_loss))
            running_loss = 0.0
            
    if epoch % display_n_epoch == display_n_epoch-1:
        net.eval()

        print('-> Evaluate') 
        print('\tTrain:')
        evaluate(net, trainloader)
        print('\tValidate:')
        evaluate(net, validateloader)
        print('\tTest:')
        evaluate(net, testloader)

print('Finished Training')

--------------- Epoch 0
[1,    29] loss: 2.376
[1,    58] loss: 2.235
[1,    87] loss: 2.220
--------------- Epoch 1
[2,    29] loss: 2.209
[2,    58] loss: 2.205
[2,    87] loss: 2.200
--------------- Epoch 2
[3,    29] loss: 2.194
[3,    58] loss: 2.193
[3,    87] loss: 2.186
--------------- Epoch 3
[4,    29] loss: 2.178
[4,    58] loss: 2.178
[4,    87] loss: 2.176
--------------- Epoch 4
[5,    29] loss: 2.171
[5,    58] loss: 2.169
[5,    87] loss: 2.172
--------------- Epoch 5
[6,    29] loss: 2.166
[6,    58] loss: 2.164
[6,    87] loss: 2.164
--------------- Epoch 6
[7,    29] loss: 2.159
[7,    58] loss: 2.161
[7,    87] loss: 2.162
--------------- Epoch 7
[8,    29] loss: 2.156
[8,    58] loss: 2.157
[8,    87] loss: 2.158
--------------- Epoch 8
[9,    29] loss: 2.151
[9,    58] loss: 2.155
[9,    87] loss: 2.157
--------------- Epoch 9
[10,    29] loss: 2.150
[10,    58] loss: 2.152
[10,    87] loss: 2.153
-> Evaluate
	Train:
Acc: 0.66
Loss: 2.16
	Validate:
Acc: 0.64
Loss:

In [29]:
print('-> Evaluate') 
net.eval()
print('\tTrain:')
evaluate(net, trainloader)
print('\tValidate:')
evaluate(net, validateloader)
print('\tTest:')
evaluate(net, testloader)
print()

-> Evaluate
	Train:
Acc: 0.76
Loss: 2.06
	Validate:
Acc: 0.66
Loss: 2.16
	Test:
Acc: 0.66
Loss: 2.15



# Statistics on Test data

In [31]:
from pandas_ml import ConfusionMatrix

y_true, y_pred = evaluate(net, testloader)
cm = ConfusionMatrix(y_true, y_pred)

Acc: 0.66
Loss: 2.15


In [32]:
pd.set_option('precision', 3)

cm_df_report = cm.classification_report
classes_name = list(class_dict.values())
classes_name.append('Avg_Total')

cm_df_report.index = classes_name
cm_df_report

Unnamed: 0,precision,recall,F1_score,support
Company,0.707,0.716,0.711,4000
EducationalInstitution,0.947,0.953,0.95,4000
Artist,0.403,0.312,0.352,4000
Athlete,0.438,0.592,0.503,4000
OfficeHolder,0.507,0.455,0.48,4000
MeanOfTransportation,0.908,0.885,0.896,4000
Building,0.851,0.833,0.842,4000
NaturalPlace,0.81,0.796,0.803,4000
Village,0.679,0.835,0.749,4000
Animal,0.69,0.659,0.674,4000


In [33]:
cm_od = cm.stats()
cm_od['class'].columns = class_dict.values()
cm_od['class']

  num = df[df > 1].dropna(axis=[0, 1], thresh=1).applymap(lambda n: choose(n, 2)).sum().sum() - np.float64(nis2 * njs2) / n2


Unnamed: 0,Company,EducationalInstitution,Artist,Athlete,OfficeHolder,MeanOfTransportation,Building,NaturalPlace,Village,Animal,Plant,Album,Film,WrittenWork
Population,54879.0,54879.0,54879.0,54879.0,54879.0,54879.0,54879.0,54879.0,54879.0,54879.0,54879.0,54879.0,54879.0,54879.0
P: Condition positive,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,2879.0
N: Condition negative,50879.0,50879.0,50879.0,50879.0,50879.0,50879.0,50879.0,50879.0,50879.0,50879.0,50879.0,50879.0,50879.0,52000.0
Test outcome positive,4046.0,4022.0,3098.0,5413.0,3590.0,3901.0,3912.0,3933.0,4918.0,3820.0,4620.0,4398.0,3236.0,1972.0
Test outcome negative,50833.0,50857.0,51781.0,49466.0,51289.0,50978.0,50967.0,50946.0,49961.0,51059.0,50259.0,50481.0,51643.0,52907.0
TP: True Positive,2862.0,3810.0,1248.0,2369.0,1821.0,3541.0,3331.0,3186.0,3338.0,2636.0,3157.0,2381.0,1620.0,1082.0
TN: True Negative,49695.0,50667.0,49029.0,47835.0,49110.0,50519.0,50298.0,50132.0,49299.0,49695.0,49416.0,48862.0,49263.0,51110.0
FP: False Positive,1184.0,212.0,1850.0,3044.0,1769.0,360.0,581.0,747.0,1580.0,1184.0,1463.0,2017.0,1616.0,890.0
FN: False Negative,1138.0,190.0,2752.0,1631.0,2179.0,459.0,669.0,814.0,662.0,1364.0,843.0,1619.0,2380.0,1797.0
"TPR: (Sensitivity, hit rate, recall)",0.716,0.953,0.312,0.592,0.455,0.885,0.833,0.796,0.835,0.659,0.789,0.595,0.405,0.376


# Confusion Matrix

In [34]:
cm_df = cm.to_dataframe()
cm_df.columns = class_dict.values()
cm_df.index = class_dict.values()

In [35]:
import seaborn as sns
from matplotlib.colors import ListedColormap

my_cmap = ListedColormap(sns.color_palette("BuGn", 14).as_hex())
s = cm_df.style.background_gradient(cmap=my_cmap)
s

Unnamed: 0,Company,EducationalInstitution,Artist,Athlete,OfficeHolder,MeanOfTransportation,Building,NaturalPlace,Village,Animal,Plant,Album,Film,WrittenWork
Company,2862,44,58,42,41,93,138,96,137,72,66,214,65,72
EducationalInstitution,43,3810,4,8,20,14,59,8,9,5,2,5,6,7
Artist,58,3,1248,1409,787,39,35,61,153,27,37,62,61,20
Athlete,20,1,769,2369,588,9,13,52,98,21,14,9,25,12
OfficeHolder,24,2,690,1200,1821,25,17,26,112,14,10,15,38,6
MeanOfTransportation,106,6,43,34,30,3541,19,26,46,36,19,59,26,9
Building,139,117,24,20,35,24,3331,90,56,15,19,49,46,35
NaturalPlace,82,2,34,49,35,15,50,3186,292,61,73,55,50,16
Village,71,3,44,91,89,12,27,111,3338,62,64,32,49,7
Animal,44,1,10,27,11,13,10,55,103,2636,1000,35,39,16


# Save Model

In [36]:
net_path = 'trained_net.pt'
torch.save(net, net_path)

  "type " + obj.__name__ + ". It won't be checked "


# Test Run

In [37]:
### Load model
net_path = 'trained_net.pt'
net = torch.load(net_path)

net.eval()

input_names = ['Hello', 'aaaaaaaaaaaaaaa', 'Streetbee']
message_embedding = sess.run(encoding_tensor, feed_dict={similarity_input_placeholder: input_names})

# Uncomment 2 lines below for sample:
# input_names = all_data['Name'].iloc[0:10]
# message_embedding = all_data['Name_emb'].iloc[0:10]

tensor_input = torch.Tensor(message_embedding).to(device)
tensor_input
predicted_class_score = [(int(torch.argmax(x)), max(x)) for x in net(tensor_input)]

print('{0:<40} {1:<20} {2:<10}'.format('Name', 'Class', 'Score'))
print()
for enum, x in enumerate(predicted_class_score):
    print('{0:<40} {1:<20} {2:<10.2f}'.format(input_names[enum], class_dict[x[0]], x[1]))

Name                                     Class                Score     

Hello                                    Album                1.00      
aaaaaaaaaaaaaaa                          Album                1.00      
Streetbee                                Company              1.00      
