In [1]:
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp
import numpy as np
from math import log

import torch
import torch.nn as nn

In [2]:
class CONFIG(object):
    """docstring for CONFIG"""
    def __init__(self):
        super(CONFIG, self).__init__()
        
        self.model = 'gcn'  # 'gcn', 'gcn_cheby', 'dense'
        self.learning_rate = 0.02   # Initial learning rate.
        self.epochs  = 200  # Number of epochs to train.
        self.hidden1 = 200  # Number of units in hidden layer 1.
        self.dropout = 0.5  # Dropout rate (1 - keep probability).
        self.weight_decay = 0.   # Weight for L2 loss on embedding matrix.
        self.early_stopping = 10 # Tolerance for early stopping (# of epochs).
        self.max_degree = 3      # Maximum Chebyshev polynomial degree.

cfg = CONFIG()

In [3]:
data = pd.read_csv("/Users/007yemmar/Blogpost-Processed.csv")

In [4]:
data.head()

Unnamed: 0,gender,age,text
0,1,3,urllink kinki bed part ii
1,1,2,ha oh well good luck trippin tri buy weird st...
2,1,0,scene today im go someth im go regret im go p...
3,0,3,per chanc wander site dont know let introduc ...
4,0,4,happi bunni im sleep well moment various reas...


In [5]:
data.shape

(10220, 3)

In [6]:
data.shape[0]

10220

In [7]:
for index, row in data.iterrows():
  if row['age']==0 and row['gender']==0:
    data.loc[index, 'label'] = 0
  elif row['age']==0 and row['gender']==1:
    data.loc[index, 'label'] = int(1)
  elif row['age']==1 and row['gender']==0:
    data.loc[index, 'label'] = int(2)
  elif row['age']==1 and row['gender']==1:
    data.loc[index, 'label'] = int(3)
  elif row['age']==2 and row['gender']==0:
    data.loc[index, 'label'] = int(4)
  elif row['age']==2 and row['gender']==1:
    data.loc[index, 'label'] = int(5)
  elif row['age']==3 and row['gender']==0:
    data.loc[index, 'label'] = int(6)
  elif row['age']==3 and row['gender']==1:
    data.loc[index, 'label'] = int(7)
  elif row['age']==4 and row['gender']==0:
    data.loc[index, 'label'] = int(8)
  elif row['age']==4 and row['gender']==1:
    data.loc[index, 'label'] = int(9)
  elif row['age']==5 and row['gender']==0:
    data.loc[index, 'label'] = int(10)
  elif row['age']==5 and row['gender']==1:
    data.loc[index, 'label'] = int(11)
  elif row['age']==6 and row['gender']==0:
    data.loc[index, 'label'] = int(12)
  elif row['age']==6 and row['gender']==1:
    data.loc[index, 'label'] = int(13)
  elif row['age']==7 and row['gender']==0:
    data.loc[index, 'label'] = int(14)
  elif row['age']==7 and row['gender']==1:
    data.loc[index, 'label'] = int(15)

In [8]:
# build vocab
word_freq = {}
word_set = set()
for doc_words in data['text']:
    doc_words = str(doc_words)
    words = doc_words.split()
    for word in words:
        word_set.add(word)
        if word in word_freq:
            word_freq[word] += 1
        else:
            word_freq[word] = 1

In [9]:
vocab = list(word_set)
vocab_size = len(vocab)

In [10]:
vocab

['doand',
 'israel-palestin',
 'youâ\x92r',
 'arrr',
 'hinckley',
 '--drive',
 'performand',
 'forest',
 'hate-lov',
 'sharman',
 'attachmentus',
 'palyeeya',
 'afgarto',
 'alchohol',
 'career-focus',
 'sanity*',
 'nick',
 'mwahahaha',
 '22are',
 'cartoonish',
 'hah-hah',
 'aku',
 'گفت',
 'lynn-',
 'inter-act',
 'tarac',
 'daythes',
 'do…',
 'maghaharang',
 'himselfnbsp',
 'dates-',
 'stares]',
 'deskso',
 'whitley',
 'buh-or',
 'middle-ag',
 'evertim',
 'wassenaar',
 'lisa-',
 'awhilesi',
 'common',
 'f8',
 'coint',
 'glorfi',
 'carbohyrdr',
 's-',
 'outfit',
 'confusing',
 'startthat',
 'celeri',
 'lovelov',
 'vv',
 'map',
 'estrang',
 'teal',
 'rené',
 'post-dnc',
 'ruckus',
 'cd100ss',
 'head-of-st',
 'poweraid',
 'hoffman',
 'sio',
 'valleygotta',
 'lorzcoz',
 'thereughit',
 'workbas',
 'yeahour',
 'pseudonym',
 'myle',
 '1964',
 'kaki-bangku',
 'panadol',
 '535',
 'ofto',
 'ellum',
 'ithahaha',
 '950pm',
 'ansiosa',
 'fea”',
 'mrdempsey',
 'are”',
 'sleepcoz',
 'be-stil',
 'sati'

In [11]:
vocab_size

71843

In [12]:
word_doc_list = {}

for i in range(len(data['text'])):
    doc_words = data['text'][i]
    doc_words = str(doc_words)
    words = doc_words.split()
    appeared = set()
    for word in words:
        if word in appeared:
            continue
        if word in word_doc_list:
            doc_list = word_doc_list[word]
            doc_list.append(i)
            word_doc_list[word] = doc_list
        else:
            word_doc_list[word] = [i]
        appeared.add(word)

In [13]:
len(word_doc_list)

71843

In [14]:
word_doc_freq = {}
for word, doc_list in word_doc_list.items():
    word_doc_freq[word] = len(doc_list)

In [15]:
len(word_doc_freq)

71843

In [16]:
word_id_map = {}
for i in range(vocab_size):
    word_id_map[vocab[i]] = i

In [17]:
data.head()

Unnamed: 0,gender,age,text,label
0,1,3,urllink kinki bed part ii,7.0
1,1,2,ha oh well good luck trippin tri buy weird st...,5.0
2,1,0,scene today im go someth im go regret im go p...,1.0
3,0,3,per chanc wander site dont know let introduc ...,6.0
4,0,4,happi bunni im sleep well moment various reas...,8.0


In [18]:
data.drop('age',axis=1,inplace=True)
data.drop('gender',axis=1,inplace=True)

In [19]:
data.head()

Unnamed: 0,text,label
0,urllink kinki bed part ii,7.0
1,ha oh well good luck trippin tri buy weird st...,5.0
2,scene today im go someth im go regret im go p...,1.0
3,per chanc wander site dont know let introduc ...,6.0
4,happi bunni im sleep well moment various reas...,8.0


In [20]:
from sklearn.model_selection import train_test_split

# Split dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.30, random_state=42)

# Split training set into training and validation sets
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.33, random_state=42)

In [21]:
train_size = len(X_train)
test_size = len(X_test)
val_size = len(X_val)

In [22]:
label_set = set()
for doc_meta in data['label']:
    temp = doc_meta
    label_set.add(temp)
label_list = list(label_set)

In [23]:
label_list

[0.0,
 1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0]

In [24]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/007yemmar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [25]:
definitions = []
for word in vocab:
    word = word.strip()
    synsets = wn.synsets(word)
    word_defs = []
    for synset in synsets:
        syn_def = synset.definition()
        word_defs.append(syn_def)
    word_des = ' '.join(word_defs)
    if word_des == '':
        word_des = '<PAD>'
    definitions.append(word_des)

In [26]:
tfidf_vec = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf_vec.fit_transform(definitions)
tfidf_matrix_array = tfidf_matrix.toarray()
print(tfidf_matrix_array.shape)
#print(tfidf_matrix_array[0], len(tfidf_matrix_array[0]))

(71843, 1000)


In [27]:
def loadWord2Vec(file):
    """Read Word Vectors"""
    vocab = []
    embd = []
    word_vector_map = {}
    for line in file:
        row = line.strip().split(' ')
        if(len(row) > 2):
            vocab.append(row[0])
            vector = row[1:]
            length = len(vector)
            for i in range(length):
                vector[i] = float(vector[i])
            embd.append(vector)
            word_vector_map[row[0]] = vector
    return vocab, embd, word_vector_map

In [28]:
word_vectors = []
for i in range(len(vocab)):
    word = vocab[i]
    vector = tfidf_matrix_array[i]
    str_vector = []
    for j in range(len(vector)):
        str_vector.append(str(vector[j]))
    temp = ' '.join(str_vector)
    word_vector = word + ' ' + temp
    word_vectors.append(word_vector)
word_vectors

['doand 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 

In [29]:
#string = '\n'.join(word_vectors)
_, embd, word_vector_map = loadWord2Vec(word_vectors)
word_embeddings_dim = len(embd[0])

In [30]:
word_embeddings_dim

1000

In [31]:
X = X_train + X_test

In [32]:
len(X_train)

7154

In [33]:
print(train_size)

7154


In [34]:
type(X_train)

pandas.core.series.Series

In [35]:
X_train = X_train.tolist()

In [36]:
type(X_train)

list

In [37]:
data.shape[0]

10220

In [38]:
print("start")
row_x = []
col_x = []
data_x = []
lol = 0
for i in range(len(X_train)):
    doc_vec = np.array([0.0 for k in range(word_embeddings_dim)])
    doc_words = X_train[i]
    if(lol%1000 == 0):
        print(lol)
    lol=lol+1
    doc_words = str(doc_words)
    words = doc_words.split()
    doc_len = len(words)
    for word in words:
        if word in word_vector_map:
            word_vector = word_vector_map[word]
            # print(doc_vec)
            # print(np.array(word_vector))
            doc_vec = doc_vec + np.array(word_vector)

    for j in range(word_embeddings_dim):
        row_x.append(i)
        col_x.append(j)
        # np.random.uniform(-0.25, 0.25)
        data_x.append(doc_vec[j] / doc_len)  # doc_vec[j]/ doc_len

start
0
1000
2000
3000
4000
5000
6000
7000


In [39]:
# x = sp.csr_matrix((real_train_size, word_embeddings_dim), dtype=np.float32)
x = sp.csr_matrix((data_x, (row_x, col_x)), shape=(len(X_train), word_embeddings_dim))

In [40]:
y_train = y_train.tolist()

In [41]:
y = []
for i in range(len(y_train)):
    label = y_train[i]
    one_hot = [0 for l in range(len(label_list))]
    label_index = label_list.index(label)
    one_hot[label_index] = 1
    y.append(one_hot)
y = np.array(y)
print(y)

[[0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]]


In [42]:
print(x)

  (0, 0)	0.0
  (0, 1)	0.0
  (0, 2)	0.0
  (0, 3)	0.0016374661839120542
  (0, 4)	0.0
  (0, 5)	0.005528181706219123
  (0, 6)	0.001078091775075936
  (0, 7)	0.0005468088524196532
  (0, 8)	0.0
  (0, 9)	0.0
  (0, 10)	0.0
  (0, 11)	0.001892750933815198
  (0, 12)	0.007257938070771865
  (0, 13)	0.0
  (0, 14)	0.0
  (0, 15)	0.0025901293748352362
  (0, 16)	0.010546884702923852
  (0, 17)	0.0
  (0, 18)	0.0
  (0, 19)	0.0012307354540113111
  (0, 20)	0.0015261444010872644
  (0, 21)	0.0
  (0, 22)	0.006903334485020152
  (0, 23)	0.0
  (0, 24)	0.004919947439989009
  :	:
  (7153, 975)	0.0
  (7153, 976)	0.0
  (7153, 977)	0.026058323506091736
  (7153, 978)	0.0037275624413388793
  (7153, 979)	0.009215140151330833
  (7153, 980)	0.0
  (7153, 981)	0.001608875015301419
  (7153, 982)	0.0
  (7153, 983)	0.0
  (7153, 984)	0.0
  (7153, 985)	0.0030199025311872982
  (7153, 986)	0.0013209895378572898
  (7153, 987)	0.0
  (7153, 988)	0.0
  (7153, 989)	0.0
  (7153, 990)	0.006831123967533577
  (7153, 991)	0.0
  (7153, 992)	0.0

In [43]:
y_val = y_val.tolist()
y_test = y_test.tolist()
X_test = X_test.tolist()
X_val = X_val.tolist()

In [44]:
print("start")
row_tx = []
col_tx = []
data_tx = []
lol = 0
for i in range(len(X_test)):
    doc_vec = np.array([0.0 for k in range(word_embeddings_dim)])
    doc_words = X_test[i]
    if(lol%1000 == 0):
        print(lol)
    lol=lol+1
    doc_words = str(doc_words)
    words = doc_words.split()
    doc_len = len(words)
    for word in words:
        if word in word_vector_map:
            word_vector = word_vector_map[word]
            # print(doc_vec)
            # print(np.array(word_vector))
            doc_vec = doc_vec + np.array(word_vector)

    for j in range(word_embeddings_dim):
        row_x.append(i)
        col_x.append(j)
        # np.random.uniform(-0.25, 0.25)
        data_tx.append(doc_vec[j] / doc_len)  # doc_vec[j]/ doc_len

start
0
1000


In [45]:
ty = []
for i in range(len(y_test)):
    label = y_test[i]
    one_hot = [0 for l in range(len(label_list))]
    label_index = label_list.index(label)
    one_hot[label_index] = 1
    ty.append(one_hot)
ty = np.array(ty)
print(ty)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 1]
 [0 1 0 ... 0 0 0]]


In [46]:
type(X_train)
print(type(X_val))

<class 'list'>


In [47]:
X_all = X_train + X_val
y_all = y_train + y_val

In [48]:
len(X_all)
len(y_all)

9208

In [49]:
print("start")
row_allx = []
col_allx = []
data_allx = []
lol = 0
for i in range(len(X_all)):
    doc_vec = np.array([0.0 for k in range(word_embeddings_dim)])
    doc_words = X_all[i]
    if(lol%1000 == 0):
        print(lol)
    lol=lol+1
    doc_words = str(doc_words)
    words = doc_words.split()
    doc_len = len(words)
    for word in words:
        if word in word_vector_map:
            word_vector = word_vector_map[word]
            # print(doc_vec)
            # print(np.array(word_vector))
            doc_vec = doc_vec + np.array(word_vector)

    for j in range(word_embeddings_dim):
        row_allx.append(i)
        col_allx.append(j)
        # np.random.uniform(-0.25, 0.25)
        data_allx.append(doc_vec[j] / doc_len)  # doc_vec[j]/ doc_len

start
0
1000
2000
3000
4000
5000
6000
7000
8000
9000


In [50]:
allx = sp.csr_matrix((data_allx, (row_allx, col_allx)), shape=(len(X_all), word_embeddings_dim))

In [52]:
allx

<9208x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 9208000 stored elements in Compressed Sparse Row format>

In [51]:
ally = []
for i in range(len(y_all)):
    label = y_all[i]
    one_hot = [0 for l in range(len(label_list))]
    label_index = label_list.index(label)
    one_hot[label_index] = 1
    ally.append(one_hot)
ally = np.array(ally)
print(ally)

[[0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


In [53]:
# word co-occurence with context windows
print("start")
window_size = 20
windows = []
lol = 0
for doc_words in data['text']:
    if(lol%10000 ==0):
        print(lol)
    lol = lol+1
    doc_words = str(doc_words)
    words = doc_words.split()
    length = len(words)
    if length <= window_size:
        windows.append(words)
    else:
        # print(length, length - window_size + 1)
        for j in range(length - window_size + 1):
            window = words[j: j + window_size]
            windows.append(window)

start
0
10000


In [54]:
len(windows)

914962

In [55]:
#add counter

word_window_freq = {}
for window in windows:
    appeared = set()
    for i in range(len(window)):
        if window[i] in appeared:
            continue
        if window[i] in word_window_freq:
            word_window_freq[window[i]] += 1
        else:
            word_window_freq[window[i]] = 1
        appeared.add(window[i])

In [56]:
len(word_window_freq)

71843

In [57]:
word_pair_count = {}
lol = 0
for window in windows:
    for i in range(1, len(window)):
        for j in range(0, i):
            word_i = window[i]
            word_i_id = word_id_map[word_i]
            word_j = window[j]
            word_j_id = word_id_map[word_j]
            if word_i_id == word_j_id:
                continue
            word_pair_str = str(word_i_id) + ',' + str(word_j_id)
            if word_pair_str in word_pair_count:
                word_pair_count[word_pair_str] += 1
            else:
                word_pair_count[word_pair_str] = 1
            # two orders
            word_pair_str = str(word_j_id) + ',' + str(word_i_id)
            if word_pair_str in word_pair_count:
                word_pair_count[word_pair_str] += 1
            else:
                word_pair_count[word_pair_str] = 1

In [58]:
row = []
col = []
weight = []

# pmi as weights

num_window = len(windows)

for key in word_pair_count:
    temp = key.split(',')
    i = int(temp[0])
    j = int(temp[1])
    count = word_pair_count[key]
    word_freq_i = word_window_freq[vocab[i]]
    word_freq_j = word_window_freq[vocab[j]]
    pmi = log((1.0 * count / num_window) / (1.0 * word_freq_i * word_freq_j/(num_window * num_window)))
    if pmi <= 0:
        continue
    row.append(train_size + i)
    col.append(train_size + j)
    weight.append(pmi)

In [59]:
len(weight)

10942494

In [60]:
# doc word frequency
doc_word_freq = {}

for doc_id in range(len(data['text'])):
    doc_words = data['text'][doc_id]
    doc_words = str(doc_words)
    words = doc_words.split()
    for word in words:
        word_id = word_id_map[word]
        doc_word_str = str(doc_id) + ',' + str(word_id)
        if doc_word_str in doc_word_freq:
            doc_word_freq[doc_word_str] += 1
        else:
            doc_word_freq[doc_word_str] = 1

for i in range(len(data['text'])):
    doc_words = data['text'][i]
    doc_words = str(doc_words)
    words = doc_words.split()
    doc_word_set = set()
    for word in words:
        if word in doc_word_set:
            continue
        j = word_id_map[word]
        key = str(i) + ',' + str(j)
        freq = doc_word_freq[key]
        if i < train_size:
            row.append(i)
        else:
            row.append(i + vocab_size)
        col.append(train_size + j)
        idf = log(1.0 * len(data['text']) / word_doc_freq[vocab[j]])
        weight.append(freq * idf)
        doc_word_set.add(word)

In [61]:
node_size = len(X_all) + vocab_size + len(X_test)
adj = sp.csr_matrix(
    (weight, (row, col)), shape=(node_size, node_size))

In [62]:
print(adj)

  (0, 25986)	2.4915825041514723
  (0, 44755)	3.1052326796435104
  (0, 48466)	1.2679461449167582
  (0, 74850)	6.834206590959325
  (0, 78657)	5.137757301535594
  (1, 7267)	4.175856058409387
  (1, 7909)	3.220834689353534
  (1, 10013)	2.1933183223691546
  (1, 19724)	2.4186572642467996
  (1, 20206)	8.53895468319775
  (1, 22399)	3.291930611037264
  (1, 23835)	13.66841318191865
  (1, 26846)	3.6117009980405452
  (1, 28014)	1.0119679066058358
  (1, 33552)	3.134027581591455
  (1, 41124)	3.379899383983221
  (1, 42426)	1.3400234425095763
  (1, 46488)	1.6585706010117451
  (1, 56208)	0.9914530003827831
  (1, 66630)	1.8235712968630693
  (1, 68746)	4.469927928959939
  (1, 75875)	1.381608682886555
  (2, 7489)	5.648582925301586
  (2, 7865)	2.949835116861689
  (2, 7866)	5.844367171062357
  :	:
  (82061, 69452)	3.0793691690535914
  (82061, 69706)	7.1526603220778595
  (82061, 69913)	5.012594158581589
  (82061, 70115)	9.232101863757695
  (82061, 70911)	4.813261255961097
  (82061, 70934)	4.067315889834181
  

In [63]:
features = sp.identity(adj.shape[0]) 

In [64]:
adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

In [65]:
class GraphConvolution(nn.Module):
    def __init__( self, input_dim, \
                        output_dim, \
                        support, \
                        act_func = None, \
                        featureless = False, \
                        dropout_rate = 0., \
                        bias=False):
        super(GraphConvolution, self).__init__()
        self.support = support
        self.featureless = featureless

        for i in range(len(self.support)):
            setattr(self, 'W{}'.format(i), nn.Parameter(torch.randn(input_dim, output_dim)))

        if bias:
            self.b = nn.Parameter(torch.zeros(1, output_dim))

        self.act_func = act_func
        self.dropout = nn.Dropout(dropout_rate)

        
    def forward(self, x):
        x = self.dropout(x)

        for i in range(len(self.support)):
            if self.featureless:
                pre_sup = getattr(self, 'W{}'.format(i))
            else:
                pre_sup = x.mm(getattr(self, 'W{}'.format(i)))
            
            if i == 0:
                out = self.support[i].mm(pre_sup)
            else:
                out += self.support[i].mm(pre_sup)

        if self.act_func is not None:
            out = self.act_func(out)

        self.embedding = out
        return out


In [66]:
class GCN(nn.Module):
    def __init__( self, input_dim, \
                        support,\
                        dropout_rate=0., \
                        num_classes=10):
        super(GCN, self).__init__()
        
        # GraphConvolution
        self.layer1 = GraphConvolution(input_dim, 200, support, act_func=nn.ReLU(), featureless=True, dropout_rate=dropout_rate)
        self.layer2 = GraphConvolution(200, num_classes, support, dropout_rate=dropout_rate)
        
    
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        return out

In [67]:
def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix."""
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()


def preprocess_adj(adj):
    """Preprocessing of adjacency matrix for simple GCN model and conversion to tuple representation."""
    adj_normalized = normalize_adj(adj + sp.eye(adj.shape[0]))
    # return sparse_to_tuple(adj_normalized)
    return adj_normalized.A

In [68]:
if cfg.model == 'gcn':
    support = [preprocess_adj(adj)]
    num_supports = 1
    model_func = GCN

In [83]:
def sample_mask(idx, l):
    """Create mask."""
    mask = np.zeros(l)
    mask[idx] = 1
    return np.array(mask, dtype=np.bool)

In [118]:
tm_train_mask.shape[0]

10220

In [85]:
idx_train = range(len(y))
labels = np.vstack((ally, ty))

train_mask = sample_mask(idx_train, labels.shape[0])

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return np.array(mask, dtype=np.bool)


In [94]:
t_features = torch.from_numpy(features.toarray())

In [117]:
t_y_train = torch.from_numpy(np.array(y_train)) #y
t_y_val = torch.from_numpy(np.array(y_val)) #ally - y - ty
t_y_test = torch.from_numpy(np.array(y_test)) #ty
t_train_mask = torch.from_numpy(train_mask.astype(np.float32))
tm_train_mask = torch.transpose(torch.unsqueeze(t_train_mask, 0), 1, 0).repeat(1, len(y_train))

t_support = []
for i in range(len(support)):
    t_support.append(torch.Tensor(support[i]))

In [70]:
adj.shape[0]

82063

In [73]:
type(y_train)

list

In [75]:
support

[array([[0.05041155, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.01631036, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.00067017, ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.0020086 , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.00124688,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.16336591]])]

In [74]:
model = model_func(input_dim=features.shape[0], support=t_support, num_classes=len(y_train))

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learning_rate)

In [88]:
def evaluate(features, labels, mask):
    t_test = time.time()
    # feed_dict_val = construct_feed_dict(
    #     features, support, labels, mask, placeholders)
    # outs_val = sess.run([model.loss, model.accuracy, model.pred, model.labels], feed_dict=feed_dict_val)
    model.eval()
    with torch.no_grad():
        logits = model(features)
        t_mask = torch.from_numpy(np.array(mask*1., dtype=np.float32))
        tm_mask = torch.transpose(torch.unsqueeze(t_mask, 0), 1, 0).repeat(1, labels.shape[1])
        loss = criterion(logits * tm_mask, torch.max(labels, 1)[1])
        pred = torch.max(logits, 1)[1]
        acc = ((pred == torch.max(labels, 1)[1]).float() * t_mask).sum().item() / t_mask.sum().item()
        
    return loss.numpy(), acc, pred.numpy(), labels.numpy(), (time.time() - t_test)


In [None]:
tm_train_mask_transp = torch.transpose(tm_train_mask, 0 , 1)

In [142]:
val_losses = []

# Train model
for epoch in range(15):
    
    # Forward pass
    logits = model(t_features)
    loss = criterion(logits * tm_train_mask, torch.max(t_y_train, 1)[1])    
    acc = ((torch.max(logits, 1)[1] == torch.max(t_y_train, 1)[1]).float() * t_train_mask).sum().item() / t_train_mask.sum().item()
        
    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Validation
    val_loss, val_acc, pred, labels, duration = evaluate(t_features, t_y_val, val_mask)
    val_losses.append(val_loss)

    print_log("Epoch: {:.0f}, train_loss= {:.5f}, train_acc= {:.5f}, val_loss= {:.5f}, val_acc= {:.5f}, time= {:.5f}"\
                .format(epoch + 1, loss, acc, val_loss, val_acc, time.time() - t))

    if epoch > cfg.early_stopping and val_losses[-1] > np.mean(val_losses[-(cfg.early_stopping+1):-1]):
        print_log("Early stopping...")
        break


print_log("Optimization Finished!")

RuntimeError: The size of tensor a (7154) must match the size of tensor b (10220) at non-singleton dimension 1

In [145]:
tm_train_mask.shape

torch.Size([10220, 7154])

In [146]:
logits.shape

torch.Size([82063, 7154])

In [147]:
tm_train_mask_transp.shape

torch.Size([7154, 10220])