# Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
import nltk
# Fetch the NLTK resources used further down: the English stop-word list and
# the "punkt" tokenizer models needed by nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
import math
from collections import OrderedDict
from itertools import combinations
from tqdm import tqdm
import resource
import gc

In [None]:
# Mount Google Drive inside the Colab runtime; the dataset and the saved
# model are read from / written to paths below /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Load the training spreadsheet from the mounted Drive, report its
# (rows, columns) shape and preview the first rows.
df = pd.read_excel('/content/gdrive/MyDrive/GNN Intern/Training_Data.xlsx')
print(df.shape)
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Display the rows whose business description is missing.
df.loc[df["Business Description"].isna()]

In [None]:
# Where the description is missing, fall back to the company name; then
# re-check the per-column missing-value counts.
df['Business Description'] = df["Business Description"].fillna(df["Company Name"])
df.isna().sum()

In [None]:
# Remove duplicated rows in place.
# NOTE(review): keep=False discards *every* member of a duplicate group,
# including its first occurrence - confirm that keep='first' was not intended.
df.drop_duplicates(keep=False,inplace=True)

In [None]:
# Column dtypes and non-null counts, followed by summary statistics.
df.info()
df.describe()

In [None]:
# Summary statistics of the description length, measured in characters.
df["Business Description"].str.len().describe()

In [None]:
# Line plot of the description length (characters) against the row index.
df["Business Description"].str.len().plot()

In [None]:
# Box plot of the description length (characters) to expose outliers.
df["Business Description"].str.len().plot.box()

In [None]:
# Lower-case the text of the second column (addressed by position), then
# normalise the headers: trim surrounding whitespace and lower-case them.
df.iloc[:, 1] = df.iloc[:, 1].str.lower()
df.columns = df.columns.str.strip().str.lower()

In [None]:
# The company name is not used as a feature, so drop the column. The axis is
# given by keyword (`columns=`): passing it positionally was removed in
# pandas 2.0.
df.drop(columns='company name', inplace=True)
# Re-number the rows 0..n-1. Earlier row removal leaves gaps in the index,
# while later cells address documents by position; the previous call threw
# the re-indexed frame away instead of keeping it.
df.reset_index(drop=True, inplace=True)
df

In [None]:
# Give every distinct value of the second column (addressed by position) an
# integer id, numbered in order of first appearance.
unique_tags = df.iloc[:, 1].unique()
classes = dict(zip(unique_tags, range(len(unique_tags))))

In [None]:
# Replace every tag by its integer id from `classes`. The result is assigned
# back to the column: calling replace(..., inplace=True) on a column
# selection operates on a temporary object and is not guaranteed to update
# `df` (it is a no-op under pandas copy-on-write).
df['industry classification tag'] = df['industry classification tag'].replace(classes)

# Graph preparation

In [None]:
def nCr(n, r):
    """Return the number of ways to choose ``r`` items out of ``n``.

    The quotient is computed with integer floor division, which is exact for
    factorials; going through float division loses precision as soon as the
    result exceeds 2**53.

    Raises:
        ValueError: if ``r`` or ``n - r`` is negative (from math.factorial).
    """
    f = math.factorial
    return f(n) // (f(r) * f(n - r))

def dummy_function(doc):
    """Return ``doc`` unchanged.

    Serves as the identity tokenizer/preprocessor handed to TfidfVectorizer,
    because the documents are already lists of tokens.
    """
    return doc

In [None]:
# Split every description into a list of word tokens.
df['business description'] = df['business description'].apply(nltk.word_tokenize)

In [None]:
# English stop words shipped with NLTK, de-duplicated through a set and kept
# as a list; passed to filter_tokens below.
stopwords = list(set(nltk.corpus.stopwords.words("english")))

In [None]:
# Punctuation marks and tokenizer artefacts that are never kept as words.
_PUNCTUATION_TOKENS = frozenset([
    ".", ",", ";", "&", "'s", ":", "?", "!", "(", ")",
    "'", "`", "''", "\"", "“", " ", "'m", "'no", "***", "--", "...",
    "[", "]", "{", "}", "~", "@", "#", "$", "%", "^", "*", "/",
    "<", ">", "+", "-", "=",
])


def filter_tokens(tokens, stopwords):
    """Return the tokens that are neither stop words nor punctuation.

    Args:
        tokens: iterable of hashable tokens (strings) of one document.
        stopwords: iterable of hashable tokens to discard.

    Returns:
        A new list with the surviving tokens, in their original order;
        repeated tokens are kept.
    """
    # Merge both exclusion lists into one set once per call, so that each
    # token costs a single hash lookup instead of two linear scans.
    excluded = _PUNCTUATION_TOKENS.union(stopwords)
    return [token for token in tokens if token not in excluded]

In [None]:
# Strip stop words and punctuation from every tokenised description.
df['business description'] = [
    filter_tokens(tokens, stopwords) for tokens in df['business description']
]

In [None]:
# The descriptions are already lists of tokens, so the preprocessor and the
# tokenizer are both replaced by the identity function; max_features=None
# keeps the whole vocabulary.
vectorizer = TfidfVectorizer(input="content", max_features=None, tokenizer=dummy_function, preprocessor=dummy_function)
# Learn the vocabulary and the IDF weights from the tokenised descriptions.
vectorizer.fit(df['business description'])

In [None]:
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use the new accessor when it exists and fall back to
# the old one on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    vocabulary = vectorizer.get_feature_names_out()
else:
    vocabulary = vectorizer.get_feature_names()
# Keep the vocabulary as a unicode string array, which is what
# np.array(list_of_str) produced with the old accessor.
vocabulary = np.array(vocabulary, dtype=str)

In [None]:
# Number of distinct tokens, i.e. the number of word nodes in the graph.
len(vocabulary)

## TF-IDF preparation (document-word edge weights)

In [None]:
# Dense document-by-word TF-IDF table: one row per description (default
# 0..n-1 index) and one column per vocabulary word.
df_tf_idf = pd.DataFrame(
    vectorizer.transform(df['business description']).toarray(),
    columns=vocabulary,
)
df_tf_idf.shape

In [None]:
G = nx.Graph()

# Document nodes are the integer row labels of df_tf_idf; word nodes are the
# vocabulary strings.
G.add_nodes_from(df_tf_idf.index)
G.add_nodes_from(vocabulary)

# Link a document to a word only when the word occurs in it (TF-IDF != 0).
# Emitting an edge for every (document, word) pair creates
# len(documents) * len(vocabulary) Python objects, almost all with weight 0,
# which exhausts memory and inflates the unweighted node degrees that are
# used later for normalisation. The weighted adjacency matrix is unaffected,
# since a missing edge and a zero-weight edge both contribute 0.
tf_idf = df_tf_idf.to_numpy()
doc_labels = df_tf_idf.index.tolist()
word_labels = df_tf_idf.columns.tolist()
doc_pos, word_pos = np.nonzero(tf_idf)
document_word = [(doc_labels[d], word_labels[w], {"weight": tf_idf[d, w]})
                 for d, w in tqdm(zip(doc_pos, word_pos), total=len(doc_pos))]

G.add_edges_from(document_word)
# Release the dense table (and the array view onto it) before the next
# large allocation.
del df_tf_idf, tf_idf
gc.collect()

## PMI preparation (word-word edge weights)

In [None]:
# For every vocabulary word keep a two-element list:
# [row/column position in `occurrences`, number of windows containing it].
word2index_n_i = OrderedDict(
    (word, [position, 0]) for position, word in enumerate(vocabulary)
)
# Word-by-word co-occurrence counts, filled in by the sliding-window pass.
vocab_size = len(vocabulary)
occurrences = np.zeros((vocab_size, vocab_size), dtype=np.int32)

In [None]:
no_of_windows = 0
window = 15
for tokens in tqdm(df['business description'], total=len(df['business description'])):
    if not tokens:
        # An empty document has nothing to count.
        continue
    # A document of length n contains n - window + 1 windows; a document no
    # longer than the window forms exactly one window on its own. Using
    # range(n - window) dropped the last window of long documents and
    # ignored short documents altogether.
    n_starts = max(len(tokens) - window + 1, 1)
    for start in range(n_starts):
        no_of_windows += 1
        # Each distinct word counts once per window.
        distinct = set(tokens[start:start + window])

        for w in distinct:
            word2index_n_i[w][1] += 1
        for w1, w2 in combinations(distinct, 2):
            i1 = word2index_n_i[w1][0]
            i2 = word2index_n_i[w2][0]

            # Keep the count matrix symmetric.
            occurrences[i1, i2] += 1
            occurrences[i2, i1] += 1

In [None]:
# Joint probability estimate p(i, j): co-occurrence counts divided by the
# total number of windows, labelled by word on both axes.
p_ij = pd.DataFrame(occurrences, index=vocabulary, columns=vocabulary).div(no_of_windows)
# The raw counts are no longer needed.
del occurrences
gc.collect()

In [None]:
# PMI(i, j) = log( p(i, j) / (p(i) * p(j)) ), with p(i) = n_i / no_of_windows
# where n_i is the number of windows containing word i.
# The earlier expression `x / n_i / (no_of_windows)` parses as
# (x / n_i) / no_of_windows, i.e. it divided by the window count instead of
# by p(i); every value came out too small by a factor of no_of_windows**4,
# so virtually no PMI was positive.
col_p = np.array([word2index_n_i[w][1] for w in p_ij.columns], dtype=np.float64) / no_of_windows
row_p = np.array([word2index_n_i[w][1] for w in p_ij.index], dtype=np.float64) / no_of_windows
del word2index_n_i

p_ij = p_ij.div(col_p, axis=1).div(row_p, axis=0)
# A word that never appeared in any window yields 0/0 = NaN; treat the pair
# as "never co-occurs" so the logarithm below turns it into a negative score.
p_ij = p_ij.fillna(0.0)
# Small offset so that log() is defined for pairs with zero co-occurrence.
p_ij += 1E-9
p_ij = np.log(p_ij)

In [None]:
def word_word_edges(p_ij):
    """Build the word-word edge list from a square PMI table.

    Args:
        p_ij: pandas DataFrame whose index and columns carry the same word
            labels; ``p_ij.loc[a, b]`` is the PMI of words ``a`` and ``b``.

    Returns:
        A list of ``(word_a, word_b, {"weight": pmi})`` tuples, one per
        unordered pair with strictly positive PMI. Pairs come in the order
        ``itertools.combinations(columns, 2)`` would produce them and each
        score is read from the row of the first word. NaN scores are skipped.
    """
    words = [str(w) for w in p_ij.columns]
    # Positional access below requires rows to be ordered like the columns.
    if not p_ij.index.equals(p_ij.columns):
        p_ij = p_ij.reindex(index=p_ij.columns)
    scores = p_ij.to_numpy()
    # The strict upper triangle visits every unordered pair exactly once; a
    # single vectorised comparison replaces one scalar .loc lookup per pair.
    first, second = np.nonzero(np.triu(scores > 0, k=1))
    return [(words[i], words[j], {"weight": scores[i, j]})
            for i, j in zip(first, second)]

In [None]:
# Turn the positive-PMI pairs into weighted word-word edges.
word_word = word_word_edges(p_ij)
# The PMI table is no longer needed once the edge list exists; drop it
# before the graph grows further.
del p_ij
G.add_edges_from(word_word)
gc.collect()

# GCN Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
class gcn(nn.Module):
    """Two graph-convolution layers followed by a linear classifier.

    ``forward(X)`` computes
    ``fc1(relu(A_hat @ (relu(A_hat @ (X @ W1 + b1)) @ W2 + b2)))``.

    Args:
        X_size: number of input features per node (columns of ``X``).
        A_hat: square normalised adjacency matrix (array-like); stored as a
            float tensor that does not receive gradients.
        bias: when False, neither graph-convolution layer has a bias term.
        hidden_1: width of the first layer; defaults to the module-level
            ``hidden_size_1``.
        hidden_2: width of the second layer; defaults to the module-level
            ``hidden_size_2``.
        n_classes: number of output classes; defaults to the module-level
            ``num_classes``.
    """

    def __init__(self, X_size, A_hat, bias=True, hidden_1=None, hidden_2=None, n_classes=None):
        super(gcn, self).__init__()
        # Explicit sizes win; otherwise fall back to the notebook globals so
        # that the existing call gcn(X_size, A_hat) keeps working.
        hidden_1 = hidden_size_1 if hidden_1 is None else hidden_1
        hidden_2 = hidden_size_2 if hidden_2 is None else hidden_2
        n_classes = num_classes if n_classes is None else n_classes

        self.A_hat = torch.tensor(A_hat, requires_grad=False).float()
        self.weight = nn.parameter.Parameter(torch.FloatTensor(X_size, hidden_1))
        # NOTE: `var` is passed to normal_() as the standard deviation.
        var = 2. / (self.weight.size(1) + self.weight.size(0))
        self.weight.data.normal_(0, var)
        self.weight2 = nn.parameter.Parameter(torch.FloatTensor(hidden_1, hidden_2))
        var2 = 2. / (self.weight2.size(1) + self.weight2.size(0))
        self.weight2.data.normal_(0, var2)

        if bias:
            self.bias = nn.parameter.Parameter(torch.FloatTensor(hidden_1))
            self.bias.data.normal_(0, var)
            self.bias2 = nn.parameter.Parameter(torch.FloatTensor(hidden_2))
            self.bias2.data.normal_(0, var2)
        else:
            # Register *both* biases as absent: forward() reads self.bias2
            # and would raise AttributeError if only "bias" were registered.
            self.register_parameter("bias", None)
            self.register_parameter("bias2", None)
        self.fc1 = nn.Linear(hidden_2, n_classes)

    def forward(self, X):
        """Return the class logits, one row per row of ``X``."""
        X = torch.mm(X, self.weight)
        if self.bias is not None:
            X = (X + self.bias)
        X = F.relu(torch.mm(self.A_hat, X))
        X = torch.mm(X, self.weight2)
        if self.bias2 is not None:
            X = (X + self.bias2)
        X = F.relu(torch.mm(self.A_hat, X))
        return self.fc1(X)

# Optimization

In [None]:
# Weighted adjacency matrix of the graph, rows/columns in G's node order.
# nx.to_numpy_matrix was removed in NetworkX 3.0; to_numpy_array returns the
# same values as a plain ndarray, which works with `+` and `@` below.
A = nx.to_numpy_array(G, weight="weight")
# Add self-loops.
A = A + np.eye(G.number_of_nodes())

In [None]:
# Diagonal matrix D^(-1/2) built from the unweighted node degrees.
# G.degree() yields (node, degree) pairs: the degree has to be unpacked
# before testing it for zero, otherwise the guard never fires and an
# isolated node raises ZeroDivisionError on 0 ** -0.5.
inv_sqrt_degree = [deg ** (-0.5) if deg > 0 else 0
                   for _, deg in G.degree(weight=None)]
degrees = np.diag(inv_sqrt_degree)
# Node features: one-hot identity, one row per graph node.
X = np.eye(G.number_of_nodes())
# Symmetrically normalised adjacency D^(-1/2) A D^(-1/2).
A_hat = degrees@A@degrees

In [None]:
# Keep the NumPy feature matrix as `f`; `X` becomes its tensor view.
f = X
X = torch.from_numpy(X)

test_idxs = []

# Hold out 20% of the documents of every class that has at least 4 members.
# The frame is `df` (the name `df_data` is never defined), and documents are
# identified by row *position*, because that is how they are indexed in the
# feature matrix and in the network output.
tag_values = df['industry classification tag'].to_numpy()
for b_id in df['industry classification tag'].unique():
    members = np.flatnonzero(tag_values == b_id)
    if len(members) >= 4:
        held_out = np.random.choice(members, size=round(0.2*len(members)), replace=False)
        test_idxs.extend(held_out.tolist())

In [None]:
# Training documents are all row positions that were not held out. A set
# makes each membership test O(1). The frame is `df`; the name `df_data` is
# never defined.
held_out_positions = set(test_idxs)
selected = [i for i in range(len(df)) if i not in held_out_positions]

tag_list = df['industry classification tag'].tolist()

f_selected = torch.from_numpy(f[selected]).float()
labels_selected = [tag_list[i] for i in selected]
f_not_selected = torch.from_numpy(f[test_idxs]).float()
# Labels follow the order of `test_idxs`, because the test predictions are
# gathered with output[test_idxs]; listing them in ascending row order would
# pair predictions with the wrong labels.
labels_not_selected = [tag_list[i] for i in test_idxs]
f = torch.from_numpy(f).float()

In [None]:
# Model sizes, read as globals by the gcn class.
hidden_size_1 = 330  # output width of the first graph-convolution layer
hidden_size_2 = 130  # output width of the second graph-convolution layer
# Output size of the final linear layer.
# NOTE(review): presumably equal to len(classes) - confirm against the data.
num_classes = 62

In [None]:
# Build the network: the input width is the number of feature columns and
# A_hat is the normalised adjacency used by both graph-convolution layers.
net = gcn(X.shape[1],A_hat)
# Cross-entropy over the class logits.
criterion = nn.CrossEntropyLoss()

In [None]:
def get_num_correct(preds,labels):
    """Count the rows of ``preds`` whose highest-scoring column equals the label.

    ``preds`` is a 2-D tensor of per-class scores and ``labels`` a 1-D tensor
    of class indices; the count is returned as a plain Python int.
    """
    predicted = torch.argmax(preds, dim=1)
    hits = predicted == labels
    return int(hits.sum())

In [None]:
# Learning rate handed to the Adam optimizer.
lr = 0.3
# The string literal below is disabled code and is never executed: enabling
# it would load previously saved weights into `net` and switch the network
# to training mode.
'''
model_save_name = 'company description.pt'
path = F"/content/gdrive/My Drive/GNN Intern/saved_models/{model_save_name}"
net.load_state_dict(torch.load(path))
net.train() '''

In [None]:
# Create the optimizer and run a single optimisation step: forward pass over
# all nodes, loss on the training documents only, backward pass, update.
optimizer = optim.Adam(net.parameters(), lr)
output = net(f)
loss = criterion(output[selected], torch.tensor(labels_selected).long())
loss.backward()
optimizer.step()

In [None]:
n = 1000
# The label tensors are the same in every epoch, so build them once instead
# of on each iteration; torch.tensor keeps the labels as integers rather
# than routing them through a float tensor first.
train_labels = torch.tensor(labels_selected).long()
test_labels = torch.tensor(labels_not_selected).long()
for i in range(n):
    optimizer.zero_grad()
    # Full-graph forward pass; only the training documents enter the loss.
    output = net(f)
    loss = criterion(output[selected], train_labels)
    loss.backward()
    optimizer.step()

    # Accuracy on the held-out documents, taken from the same forward pass
    # (i.e. before this epoch's parameter update).
    t_out = output[test_idxs]
    print('epoch: ',i,' loss: ',loss.item(),' accuracy: ',get_num_correct(t_out,test_labels)/len(labels_not_selected))

In [None]:
model_save_name = 'company description.pt'
path = F"/content/gdrive/My Drive/GNN Intern/saved_models/{model_save_name}"
# Persist the learned parameters of `net`. The object name was missing
# before `.state_dict()`, which made the cell a syntax error.
torch.save(net.state_dict(), path)