# Load Data

In [1]:
import pandas as pd

data_path = 'data/data.csv'
all_data = pd.read_csv(data_path, sep=',')
pd.set_option('max_rows', 30)

In [2]:
class_dict = {1:'Company',
              2:'EducationalInstitution',
              3:'Artist',
              4:'Athlete',
              5:'OfficeHolder',
              6:'MeanOfTransportation',
              7:'Building',
              8:'NaturalPlace',
              9:'Village',
              10:'Animal',
              11:'Plant',
              12:'Album',
              13:'Film',
              14:'WrittenWork'}

In [3]:
all_data['Class_name'] = all_data['Class'].map(class_dict)
all_data

Unnamed: 0,Class,Name,Class_name
0,1,E. D. Abbott Ltd,Company
1,1,Schwan-Stabilo,Company
2,1,Q-workshop,Company
3,1,Marvell Software Solutions Israel,Company
4,1,Bergan Mercy Medical Center,Company
5,1,The Unsigned Guide,Company
6,1,Rest of the world,Company
7,1,Globoforce,Company
8,1,Rompetrol,Company
9,1,Wave Accounting,Company


# Remove Annotations (anything between parenthesis)

In [4]:
import re
all_data['Name'] = [re.sub(r'\([^)]*\)', '', x) for x in all_data['Name']]
all_data

Unnamed: 0,Class,Name,Class_name
0,1,E. D. Abbott Ltd,Company
1,1,Schwan-Stabilo,Company
2,1,Q-workshop,Company
3,1,Marvell Software Solutions Israel,Company
4,1,Bergan Mercy Medical Center,Company
5,1,The Unsigned Guide,Company
6,1,Rest of the world,Company
7,1,Globoforce,Company
8,1,Rompetrol,Company
9,1,Wave Accounting,Company


# Load USE

In [5]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import tf_sentencepiece

tf.logging.set_verbosity(tf.logging.ERROR)
module_url = 'module_Multi_Large/'

g = tf.Graph()
with g.as_default():
    similarity_input_placeholder = tf.placeholder(tf.string, shape=(None))
    embed = hub.Module(module_url)
    encoding_tensor = embed(similarity_input_placeholder)
    init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
g.finalize()

sess = tf.Session(graph=g)
sess.run(init_op)

# Encode

In [6]:
from tqdm import tqdm

def embed_df(df, col):
    message_embedding = []
    size_batch = 200
    data_size = len(df[col])
    num_batches = int(data_size/size_batch)+1
    for x in tqdm(range(num_batches)):
        samples = list(df[col][(x*size_batch):((x+1)*size_batch)])
        if type(samples) is pd.core.series.Series:
            message_embedding.extend(sess.run(encoding_tensor, feed_dict={similarity_input_placeholder: [samples]}))
        else:
            message_embedding.extend(sess.run(encoding_tensor, feed_dict={similarity_input_placeholder: samples}))

    df[col+'_emb'] = message_embedding
    
    return df

In [7]:
import pickle

load = True
file_path = 'complete_data.pkl'

if load == True:
    with open(file_path, 'rb') as fl:
        all_data = pickle.load(fl)
else:
    columns = ['Name']
    for col in columns:
        all_data = embed_df(all_data, col)
    
    with open(file_path, 'wb') as fs:
        pickle.dump(all_data , fs)

# Check Data Distribution

In [8]:
all_data['Class'].value_counts().sort_values(ascending=False)

1     40000
2     40000
3     40000
4     40000
5     40000
6     40000
7     40000
8     40000
9     40000
10    40000
11    40000
12    40000
13    40000
14    28787
Name: Class, dtype: int64

# Split into Train, Validate and Test (Stratified)

In [9]:
from torch.utils.data import Dataset
import torch
import torch.utils.data as data

class my_dataset(Dataset):
    def __init__(self, data):
        input_ = [torch.tensor(x).float() for x in data['Name_emb'].values]
        label_ = [torch.tensor(x).long() for x in data['Class'].values]
        self.samples = list(zip(input_, label_))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

In [10]:
from sklearn.model_selection import train_test_split

X = all_data.drop('Class', axis=1)
y = all_data['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, stratify=y_test, test_size=0.5)

train = pd.concat([X_train, y_train.reindex(X_train.index)], axis=1)
validate = pd.concat([X_val, y_val.reindex(X_val.index)], axis=1)
test = pd.concat([X_test, y_test.reindex(X_test.index)], axis=1)

trainset = my_dataset(train)
validateset = my_dataset(validate)
testset = my_dataset(test)
print('Dataset Length: \n -train (90%%):\t\t %d \n -validate (10%%):\t %d \n -test (10%%):\t\t %d' % (len(trainset), len(validateset), len(testset)))

Dataset Length: 
 -train (90%):		 439029 
 -validate (10%):	 54879 
 -test (10%):		 54879


In [11]:
batch_size = 10000

trainloader = data.DataLoader(trainset, batch_size=batch_size, num_workers=0, shuffle=True)
validateloader = data.DataLoader(validateset, batch_size=batch_size, num_workers=0, shuffle=True)
testloader = data.DataLoader(testset, batch_size=batch_size, num_workers=0, shuffle=True)

# Define Network

In [12]:
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class Net(nn.Module):
    def __init__(self, EMBED_DIM, CLASS_DIM):
        super(Net, self).__init__()
        self.number_neurons = 1000
        self.fc1 = nn.Sequential(nn.Linear(EMBED_DIM, self.number_neurons),
                                 nn.BatchNorm1d(self.number_neurons))
        self.fc2 = nn.Sequential(nn.Linear(self.number_neurons, self.number_neurons),
                                 nn.BatchNorm1d(self.number_neurons),
                                 nn.Dropout(0.3, inplace=True))
        self.fc3 = nn.Sequential(nn.Linear(self.number_neurons, self.number_neurons),
                                 nn.BatchNorm1d(self.number_neurons),
                                 nn.Dropout(0.3, inplace=True))
        self.fc4 = nn.Sequential(nn.Linear(self.number_neurons, self.number_neurons),
                                 nn.BatchNorm1d(self.number_neurons),
                                 nn.Dropout(0.3, inplace=True))
        self.fc5 = nn.Sequential(nn.Linear(self.number_neurons, CLASS_DIM))
        
    def forward(self, x):
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))
        x = F.leaky_relu(self.fc3(x))
        x = F.leaky_relu(self.fc4(x))
        x = F.softmax(self.fc5(x), dim=1)
        return (x)
    
EMBED_DIM, CLASS_DIM = trainloader.dataset[0][0].shape[0], len(class_dict)+1

net = Net(EMBED_DIM, CLASS_DIM)
net.to(device)

Net(
  (fc1): Sequential(
    (0): Linear(in_features=512, out_features=1000, bias=True)
    (1): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (fc2): Sequential(
    (0): Linear(in_features=1000, out_features=1000, bias=True)
    (1): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Dropout(p=0.3, inplace=True)
  )
  (fc3): Sequential(
    (0): Linear(in_features=1000, out_features=1000, bias=True)
    (1): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Dropout(p=0.3, inplace=True)
  )
  (fc4): Sequential(
    (0): Linear(in_features=1000, out_features=1000, bias=True)
    (1): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Dropout(p=0.3, inplace=True)
  )
  (fc5): Sequential(
    (0): Linear(in_features=1000, out_features=15, bias=True)
  )
)

# Define Evaluate Function

In [13]:
def evaluate(net, data_loader):

    loss = []
    accuracy = []

    y_true = []
    y_pred = []
    
    for input_labels in data_loader:
        inputs, labels = input_labels[0].to(device), input_labels[1].to(device)

        outputs = net(inputs)

        loss.append(criterion(outputs, labels).data.cpu().numpy())
        outputs = outputs.to('cpu').detach().numpy() 
        labels = labels.to('cpu').detach().numpy() 

        predict = [np.argmax(x) for x in outputs]
        labels = np.array([int(x) for x in labels])

        accuracy.append(list(predict==labels).count(True)/len(predict))

        y_true.extend(labels)
        y_pred.extend(predict)
        

    print('Acc: %.2f'% np.mean(accuracy))
    print('Loss: %.2f'% np.mean(loss))
    return y_true, y_pred

In [14]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.5, momentum=0.5)

In [15]:
display_n_loss = 3
display_n_loss = int(len(trainset)/(display_n_loss * batch_size))

EPOCHS = 100
display_n_epoch = 10

for epoch in range(EPOCHS):  # loop over the dataset multiple times

    running_loss = 0.0
    net.train()
    print('--------------- Epoch', epoch)
    for i, input_labels in enumerate(trainloader, 0):
        inputs, labels = input_labels[0].to(device), input_labels[1].to(device)
    
        # zero the parameter gradients
        optimizer.zero_grad()
        
        # forward + backward + optimize
        outputs = net(inputs)
        
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        
        if i % display_n_loss == display_n_loss-1:    # print every N mini-batches
            print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / display_n_loss))
            running_loss = 0.0
            
    if epoch % display_n_epoch == display_n_epoch-1:    
        net.eval()

        print('-> Evaluate') 
        print('\tTrain:')
        evaluate(net, trainloader)
        print('\tValidate:')
        evaluate(net, validateloader)
        print('\tTest:')
        evaluate(net, testloader)

print('Finished Training')

--------------- Epoch 0
[1,    14] loss: 2.452
[1,    28] loss: 2.265
[1,    42] loss: 2.255
--------------- Epoch 1
[2,    14] loss: 2.246
[2,    28] loss: 2.243
[2,    42] loss: 2.240
--------------- Epoch 2
[3,    14] loss: 2.236
[3,    28] loss: 2.235
[3,    42] loss: 2.235
--------------- Epoch 3
[4,    14] loss: 2.229
[4,    28] loss: 2.230
[4,    42] loss: 2.230
--------------- Epoch 4
[5,    14] loss: 2.224
[5,    28] loss: 2.226
[5,    42] loss: 2.225
--------------- Epoch 5
[6,    14] loss: 2.220
[6,    28] loss: 2.222
[6,    42] loss: 2.222
--------------- Epoch 6
[7,    14] loss: 2.218
[7,    28] loss: 2.217
[7,    42] loss: 2.219
--------------- Epoch 7
[8,    14] loss: 2.215
[8,    28] loss: 2.215
[8,    42] loss: 2.216
--------------- Epoch 8
[9,    14] loss: 2.213
[9,    28] loss: 2.211
[9,    42] loss: 2.215
--------------- Epoch 9
[10,    14] loss: 2.210
[10,    28] loss: 2.210
[10,    42] loss: 2.214
-> Evaluate
	Train:
Acc: 0.61
Loss: 2.21
	Validate:
Acc: 0.59
Loss:

In [25]:
print('-> Evaluate') 
net.eval()
print('\tTrain:')
evaluate(net, trainloader)
print('\tValidate:')
evaluate(net, validateloader)
print('\tTest:')
evaluate(net, testloader)
print()

-> Evaluate
	Train:
Acc: 0.08
Loss: 2.71
	Validate:
Acc: 0.08
Loss: 2.71
	Test:
Acc: 0.08
Loss: 2.71



# Statistics on Test data

In [25]:
from pandas_ml import ConfusionMatrix

y_true, y_pred = evaluate(testloader)
cm = ConfusionMatrix(y_true, y_pred)

Acc: 0.65
Loss: 2.16


In [26]:
pd.set_option('precision', 3)

cm_df_report = cm.classification_report
classes_name = list(class_dict.values())
classes_name.append('Avg_Total')

cm_df_report.index = classes_name
cm_df_report

Unnamed: 0,precision,recall,F1_score,support
Company,0.661,0.731,0.694,4000
EducationalInstitution,0.949,0.957,0.953,4000
Artist,0.392,0.256,0.31,4000
Athlete,0.404,0.702,0.513,4000
OfficeHolder,0.597,0.35,0.441,4000
MeanOfTransportation,0.907,0.868,0.887,4000
Building,0.879,0.823,0.85,4000
NaturalPlace,0.811,0.805,0.808,4000
Village,0.711,0.804,0.755,4000
Animal,0.653,0.705,0.678,4000


In [27]:
cm_od = cm.stats()
cm_od['class'].columns = class_dict.values()
cm_od['class']

  num = df[df > 1].dropna(axis=[0, 1], thresh=1).applymap(lambda n: choose(n, 2)).sum().sum() - np.float64(nis2 * njs2) / n2


Unnamed: 0,Company,EducationalInstitution,Artist,Athlete,OfficeHolder,MeanOfTransportation,Building,NaturalPlace,Village,Animal,Plant,Album,Film,WrittenWork
Population,54879.0,54879.0,54879.0,54879.0,54879.0,54879.0,54879.0,54879.0,54879.0,54879.0,54879.0,54879.0,54879.0,54879.0
P: Condition positive,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,2879.0
N: Condition negative,50879.0,50879.0,50879.0,50879.0,50879.0,50879.0,50879.0,50879.0,50879.0,50879.0,50879.0,50879.0,50879.0,52000.0
Test outcome positive,4422.0,4033.0,2613.0,6944.0,2344.0,3824.0,3744.0,3968.0,4528.0,4313.0,3762.0,4445.0,4736.0,1203.0
Test outcome negative,50457.0,50846.0,52266.0,47935.0,52535.0,51055.0,51135.0,50911.0,50351.0,50566.0,51117.0,50434.0,50143.0,53676.0
TP: True Positive,2922.0,3829.0,1025.0,2807.0,1399.0,3470.0,3291.0,3220.0,3218.0,2818.0,2791.0,2372.0,2115.0,771.0
TN: True Negative,49379.0,50675.0,49291.0,46742.0,49934.0,50525.0,50426.0,50131.0,49569.0,49384.0,49908.0,48806.0,48258.0,51568.0
FP: False Positive,1500.0,204.0,1588.0,4137.0,945.0,354.0,453.0,748.0,1310.0,1495.0,971.0,2073.0,2621.0,432.0
FN: False Negative,1078.0,171.0,2975.0,1193.0,2601.0,530.0,709.0,780.0,782.0,1182.0,1209.0,1628.0,1885.0,2108.0
"TPR: (Sensitivity, hit rate, recall)",0.731,0.957,0.256,0.702,0.35,0.868,0.823,0.805,0.804,0.705,0.698,0.593,0.529,0.268


In [28]:
cm_df = cm.to_dataframe()
cm_df.columns = class_dict.values()
cm_df.index = class_dict.values()

In [29]:
import seaborn as sns
from matplotlib.colors import ListedColormap

my_cmap = ListedColormap(sns.color_palette("BuGn", 14).as_hex())
s = cm_df.style.background_gradient(cmap=my_cmap)
s

Unnamed: 0,Company,EducationalInstitution,Artist,Athlete,OfficeHolder,MeanOfTransportation,Building,NaturalPlace,Village,Animal,Plant,Album,Film,WrittenWork
Company,2922,42,50,79,23,99,98,80,96,73,29,224,124,61
EducationalInstitution,45,3829,2,9,5,17,35,3,10,5,2,9,16,13
Artist,81,4,1025,1938,468,32,19,53,131,43,8,80,107,11
Athlete,34,0,542,2807,311,13,9,53,111,30,7,17,62,4
OfficeHolder,21,3,631,1626,1399,22,37,29,122,17,3,11,78,1
MeanOfTransportation,151,7,41,43,17,3470,28,22,34,35,9,68,67,8
Building,182,127,27,29,19,27,3291,71,40,9,10,52,101,15
NaturalPlace,125,4,50,61,9,13,46,3220,220,66,31,49,105,1
Village,106,2,36,131,32,6,25,131,3218,84,39,52,132,6
Animal,92,0,26,40,4,15,9,45,77,2818,747,56,68,3


In [None]:
test.head(5)

# Save Model

In [36]:
net_path = 'trained_net.pt'
torch.save(net, net_path)

  "type " + obj.__name__ + ". It won't be checked "


# Test Run

In [17]:
### Load model
net_path = 'trained_net.pt'
net = torch.load(net_path)

net.eval()

input_names = ['Hello', 'aaaaaaaaaaaaaaa']
message_embedding = sess.run(encoding_tensor, feed_dict={similarity_input_placeholder: input_names})

# Uncomment 2 lines below for sample:
input_names = all_data['Name'].iloc[0:4]
message_embedding = all_data['Name_emb'].iloc[0:4]

tensor_input = torch.Tensor(message_embedding).to(device)
tensor_input
predicted_class_score = [(int(torch.argmax(x)), max(x)) for x in net(tensor_input)]

print('{0:<40} {1:<20} {2:<10}'.format('Name', 'Class', 'Score'))
print()
for enum, x in enumerate(predicted_class_score):
    print('{0:<40} {1:<20} {2:<10.2f}'.format(input_names[enum], class_dict[x[0]], x[1]))

Name                                     Class                Score     

E. D. Abbott Ltd                         OfficeHolder         0.88      
Schwan-Stabilo                           Village              0.88      
Q-workshop                               Company              1.00      
Marvell Software Solutions Israel        Company              1.00      


In [145]:
net(tensor_input)

tensor([[0.4245, 0.0451, 0.5307, 0.5348, 0.5632, 0.5189, 0.4170, 0.5000, 0.5190,
         0.5436, 0.5073, 0.4648, 0.5000, 0.5759, 0.5162],
        [0.5755, 0.9549, 0.4693, 0.4652, 0.4368, 0.4811, 0.5830, 0.5000, 0.4810,
         0.4564, 0.4927, 0.5352, 0.5000, 0.4241, 0.4838]], device='cuda:0',
       grad_fn=<SoftmaxBackward>)

In [146]:
tensor_input

tensor([[ 0.0056,  0.0254, -0.0183,  ..., -0.0174,  0.0042, -0.0457],
        [-0.0786,  0.0270,  0.0564,  ...,  0.0337,  0.0276, -0.0603]],
       device='cuda:0')

In [55]:
class_dict

{1: 'Company',
 2: 'EducationalInstitution',
 3: 'Artist',
 4: 'Athlete',
 5: 'OfficeHolder',
 6: 'MeanOfTransportation',
 7: 'Building',
 8: 'NaturalPlace',
 9: 'Village',
 10: 'Animal',
 11: 'Plant',
 12: 'Album',
 13: 'Film',
 14: 'WrittenWork'}

In [50]:
all_data.iloc[0:4]

Unnamed: 0,Class,Name,Name_emb,Class_name
0,1,E. D. Abbott Ltd,"[-0.08377792, 0.009436725, 0.023043508, -0.025...",Company
1,1,Schwan-Stabilo,"[-0.029201776, -0.072018564, 0.015953422, -0.0...",Company
2,1,Q-workshop,"[-0.038690265, -0.016229104, -0.03456407, -0.0...",Company
3,1,Marvell Software Solutions Israel,"[-0.0061697974, 0.039221164, -0.017827868, -0....",Company
