<a href="https://colab.research.google.com/github/Ankan1998/Text-GCN/blob/main/text_gcn_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

## Importing dataset

In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Load the training split of the 20 Newsgroups corpus via scikit-learn.
newsgroups_train = fetch_20newsgroups(subset='train')

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
# Number of documents in the training split.
len(newsgroups_train.data)

11314

## Reducing Dataset

### The dataset is reduced to its first 100 documents because the full set overflows RAM
### (the runtime crashed every time with the full dataset)

In [None]:
# Keep only the first 100 documents; this overwrites the full list held on `newsgroups_train`.
newsgroups_train.data=newsgroups_train.data[:100]


In [None]:
# Class labels for the same first 100 documents.
y=newsgroups_train.target[:100]

In [None]:
# Distinct class labels present in the 100-document subset.
np.unique(y)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

## Cleaning Dataset

In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Matches http/ftp/https URLs: scheme, dotted host name, optional path/query tail.
re_url = re.compile(r'(?:http|ftp|https)://(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?')
# Matches e-mail addresses (dot-atom or quoted local part, host name or bracketed IP literal).
# Only lower-case letters are listed, so apply it to lower-cased text.
# The pattern MUST be a raw string: in a normal string Python expands "\x5d" to a
# literal "]" (closing the character class early) and "\\[" to "\[" (a literal bracket),
# which silently breaks the quoted-local-part and escaped-pair alternatives.
re_email = re.compile(r'(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])')

In [None]:
def clean_header(text):
    """Remove common newsgroup header lines from a raw post.

    Deletes every ``From:``, ``Subject:``, ``Archive-name:`` / ``archive-name:``
    (together with any run of letters, digits, hyphens or whitespace directly
    before it), ``Last-modified:`` and ``Version:`` field up to and including
    the end of its line. The patterns are not anchored to the start of a line,
    so the same text inside the body is removed as well.

    Args:
        text: Raw document text.

    Returns:
        The text with the matching header fields removed.
    """
    text = re.sub(r'(From:\s+[^\n]+\n)', '', text)
    text = re.sub(r'(Subject:[^\n]+\n)', '', text)
    # `[Aa]`, not `[A|a]`: inside a character class `|` is a literal pipe, not alternation.
    text = re.sub(r'(([\sA-Za-z0-9\-]+)?[Aa]rchive-name:[^\n]+\n)', '', text)
    text = re.sub(r'(Last-modified:[^\n]+\n)', '', text)
    text = re.sub(r'(Version:[^\n]+\n)', '', text)

    return text



In [None]:
# Strip header fields from every document, writing each result back into the same list slot.
for position, document in enumerate(newsgroups_train.data):
  newsgroups_train.data[position] = clean_header(document)

In [None]:
def clean_text(text):
    """Normalise a document to lower-case words separated by single spaces.

    Steps, in order: lower-case, drop URLs and e-mail addresses (using the
    module-level ``re_url`` / ``re_email`` patterns), delete ASCII punctuation,
    replace digit runs with a space, collapse whitespace runs to one space,
    and finally trim leading/trailing whitespace.

    Args:
        text: Document text.

    Returns:
        The cleaned text, without leading or trailing whitespace.
    """
    text = text.lower()
    text = re.sub(re_url, '', text)
    text = re.sub(re_email, '', text)
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = re.sub(r'(\d+)', ' ', text)
    text = re.sub(r'(\s+)', ' ', text)
    # Trim last: the substitutions above can leave a space at either end, which
    # would otherwise become an empty token when the text is split on " ".
    text = text.strip()

    return text



In [None]:
# Normalise every document, writing each result back into the same list slot.
for position, document in enumerate(newsgroups_train.data):
  newsgroups_train.data[position] = clean_text(document)

In [None]:
import nltk
# Download the NLTK stopword corpus, then load the English stopword list.
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def rem_stopwords(dataset,stop_words):
  """Remove stopwords from every document of ``dataset`` in place.

  Each document is split on single spaces, tokens found in ``stop_words`` are
  dropped, and the remaining tokens are re-joined with single spaces.

  Args:
    dataset: Mutable sequence of document strings; it is modified in place.
    stop_words: Iterable of words to remove.

  Returns:
    The same ``dataset`` object, for convenience.
  """
  # A set gives O(1) membership tests; a list is scanned once per token.
  stop_set = set(stop_words)
  for i, document in enumerate(dataset):
    dataset[i] = " ".join(word for word in document.split(" ") if word not in stop_set)
  return dataset

In [None]:
# Drop English stopwords from every document (rem_stopwords edits the list in place and returns it).
newsgroups_train.data=rem_stopwords(newsgroups_train.data,stop_words)

In [None]:
# Gather each document's distinct tokens, then de-duplicate across the whole corpus.
l = [token for document in newsgroups_train.data for token in set(document.split(" "))]
a = list(set(l))

In [None]:
# Number of distinct tokens in the cleaned corpus (sorting does not change the length).
len(sorted(a))

5589

# **GCN preprocessing**

## TFIDF Vectorizing

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Top 5k features are taken (max_features caps the vocabulary size at 5000 terms)
vectorizer = TfidfVectorizer(input="content",max_features=5000)
# Learn the vocabulary and build the document-by-term TF-IDF matrix.
vectors = vectorizer.fit_transform(newsgroups_train.data)

In [None]:
# Show the sparse TF-IDF matrix summary (shape and number of stored entries).
vectors

<100x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 10286 stored elements in Compressed Sparse Row format>

In [None]:
# Dense TF-IDF matrix: one row per document, one column per vocabulary term.
df_tfidf = vectors.toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it from 1.0 onwards. Fall back to the old name on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    top_words = vectorizer.get_feature_names_out()
else:
    top_words = vectorizer.get_feature_names()
# Force a fixed-width unicode dtype: get_feature_names_out() returns an object
# array, which np.save/np.load would only round-trip with pickling enabled.
top_words = np.asarray(top_words, dtype=str)
df_tfidf = pd.DataFrame(df_tfidf,columns=top_words)

In [None]:
# Display the TF-IDF table (rows are documents, columns are vocabulary terms).
df_tfidf

Unnamed: 0,aario,ab,abad,abandoned,abdomens,abilene,able,ables,aboard,abode,abp,abpsoft,abraham,abs,absolute,abstract,absurd,abuse,abuses,academy,accel,acceleration,acceptance,accepted,access,accessdigexnet,accessed,accessible,accessories,accessory,accident,accidental,accidentally,accidents,accommodation,accompanied,accompanying,accomplish,account,accounts,...,ye,yea,yeah,year,yearly,years,yehiam,yellow,yep,yerevan,yes,yesterday,yet,yhwh,yo,yoke,york,yoshiro,youd,youll,young,younger,youre,youve,yrs,ysebaert,zangezour,zangibasar,zazula,zealand,zeikheil,zenier,zero,zilkade,zod,zone,zoologists,zoom,zuma,zx
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.096823,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.059608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.08584,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.09526,0.0,0.084083,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.041431,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028770,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.169487,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.102480,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.145618,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Point-wise Mutual Information

In [None]:
from collections import OrderedDict
from tqdm import tqdm

In [None]:
### PMI between words
# Sliding-window size used when counting word co-occurrences for PMI.
window = 20
names = top_words
# Per-word window counter, initialised to zero, in vocabulary order.
ncount = OrderedDict.fromkeys(names, 0)
# Word -> row/column position in the co-occurrence matrix.
word_index = OrderedDict(zip(names, range(len(names))))


In [None]:
# Square word-by-word co-occurrence counter (int32), indexed via `word_index`.
occ = np.zeros( (len(names),len(names)) ,dtype=np.int32)

In [None]:
# Find the co-occurrences:
# This takes a long time on larger subsets, so the result is saved to co_mat_100.npy afterwards.
from itertools import combinations
no_windows = 0
# Build the vocabulary set once; rebuilding it for every window dominated the runtime.
vocabulary = set(names)
# taking each document from the dataset
for document in tqdm(newsgroups_train.data,total=len(newsgroups_train.data)):
    # Tokenise once per document instead of once per window.
    tokens = document.split(" ")
    # NOTE(review): range(len(tokens) - window) skips the last full window, and
    # documents with <= `window` tokens contribute nothing. Left unchanged so that
    # `no_windows` stays consistent with the counts recorded in this notebook.
    for start in range(len(tokens)-window):
        # Counting windows
        no_windows += 1
        # Unique in-vocabulary words within the window
        d = set(tokens[start:(start+window)]).intersection(vocabulary)
        # Number of windows each word appears in ("ncount")
        for w in d:
            ncount[w] += 1
        # Every unordered pair of distinct words in the window
        for w1,w2 in combinations(d,2):
            i1 = word_index[w1]
            i2 = word_index[w2]
            # Symmetric word-word co-occurrence count
            occ[i1][i2] += 1
            occ[i2][i1] += 1
    

100%|██████████| 100/100 [00:32<00:00,  3.09it/s]


In [None]:
# Window counts recorded from earlier runs:
#   10000 words, 1000 documents, window size 20 -> no_windows == 128003
#   5000 words, 100 documents, window size 20 -> no_windows == 12625
no_windows

12625

In [None]:
# Persist the co-occurrence matrix so the slow counting loop need not be repeated.
# NOTE(review): the path assumes Google Drive is mounted at /content/drive; no mount cell is visible in this notebook.
np.save("/content/drive/MyDrive/Jobs Assignment/Allied Media/Final Folder/co_mat_100.npy",occ)

In [None]:
# Reload the saved co-occurrence matrix (replaces the in-memory `occ`).
occ=np.load("/content/drive/MyDrive/Jobs Assignment/Allied Media/Final Folder/co_mat_100.npy")

In [None]:
# Persist the vocabulary array alongside the co-occurrence matrix it indexes.
np.save("/content/drive/MyDrive/Jobs Assignment/Allied Media/Final Folder/topwords_100.npy",names)

In [None]:
# Reload the saved vocabulary array (replaces the in-memory `names`).
names=np.load("/content/drive/MyDrive/Jobs Assignment/Allied Media/Final Folder/topwords_100.npy")

In [None]:
# Shape of the reloaded vocabulary array.
# NOTE(review): the stored output (10000,) does not match the 5000-term vocabulary built above — presumably left over from an earlier run; confirm.
names.shape

(10000,)

In [None]:
# convert to PMI
import math
# Use the window count produced by the co-occurrence loop. A hard-coded value
# (previously 12625) silently goes stale when the subset size, vocabulary or
# window changes. `ncount` comes from that same loop, so it has to be run anyway.
# Joint probability estimate: fraction of windows containing both words.
p_i_j = pd.DataFrame(occ, index = names,columns=names)/no_windows
# Marginal probability estimate: fraction of windows containing each word.
p_i = pd.Series(ncount, index=ncount.keys())/no_windows

In [None]:
# Per-word window frequency (count of windows containing the word / total windows).
p_i

aario         0.001584
ab            0.000000
abad          0.001584
abandoned     0.001584
abdomens      0.001584
                ...   
zone          0.003168
zoologists    0.001584
zoom          0.001584
zuma          0.000317
zx            0.000713
Length: 5000, dtype: float64

In [None]:
# PMI(i, j) = log( p(i, j) / (p(i) * p(j)) ), computed for the whole matrix at once
# instead of looping over 5000 columns and rows in Python.
# Divide each column by its word's marginal, then each row by its word's marginal
# (same order of operations as the original loops).
pmi_ratio = p_i_j.div(p_i, axis=1).div(p_i, axis=0)
# The small constant keeps log() finite for pairs that never co-occur.
# Words that appear in no window give 0/0 = NaN, which stays NaN after the log.
p_i_j = np.log(pmi_ratio + 1E-9)

In [None]:
from itertools import combinations
def word_word_edges(p_i_j):
    """Build weighted word-word edges from a square PMI table.

    Every unordered pair of distinct column labels whose PMI is strictly
    positive yields one edge; pairs with zero, negative or NaN PMI are skipped.

    Args:
        p_i_j: Square DataFrame of PMI values whose index and columns carry
            the same word labels.

    Returns:
        List of ``(word1, word2, {"weight": pmi})`` tuples, ordered as the
        upper triangle is read row by row. Labels are converted with ``str``.
    """
    labels = [str(w) for w in p_i_j.columns]
    # Positional access below assumes rows follow the column order; enforce it.
    if not p_i_j.index.equals(p_i_j.columns):
        p_i_j = p_i_j.reindex(index=p_i_j.columns)
    values = p_i_j.to_numpy()
    # One vectorised scan of the strict upper triangle replaces millions of
    # scalar `.loc` lookups; k=1 excludes the diagonal (no self-pairs).
    rows, cols = np.nonzero(np.triu(values > 0, k=1))
    return [(labels[r], labels[c], {"weight": values[r, c]}) for r, c in zip(rows, cols)]

In [None]:
import pickle
def load_pickle(filename):
    """Read and return the object pickled in ``filename``."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

def save_as_pickle(filename, data):
    """Pickle ``data`` and write it to ``filename``, replacing any existing file."""
    serialized = pickle.dumps(data)
    with open(filename, 'wb') as handle:
        handle.write(serialized)


In [None]:
import networkx as nx
### Build graph
G = nx.Graph()
# Documents are added first so that they occupy the first rows of the adjacency matrix.
G.add_nodes_from(df_tfidf.index) ## document nodes
G.add_nodes_from(top_words) ## word nodes
### build edges between document-word pairs
# Only link a document to words it actually contains. Adding an edge for every
# (document, word) pair creates hundreds of thousands of zero-weight edges that
# add nothing to the weighted adjacency matrix but inflate the unweighted node
# degrees used later for normalisation.
tfidf_values = df_tfidf.to_numpy()
doc_labels = list(df_tfidf.index)
word_labels = list(df_tfidf.columns)
doc_positions, word_positions = np.nonzero(tfidf_values)
document_word = [(doc_labels[d], word_labels[w], {"weight": tfidf_values[d, w]})
                 for d, w in zip(doc_positions, word_positions)]
G.add_edges_from(document_word)
### build edges between word-word pairs
word_word = word_word_edges(p_i_j)
G.add_edges_from(word_word)

12497500it [01:31, 136081.60it/s]


In [None]:
import pickle
# Persist the graph so it does not have to be rebuilt.
# NOTE(review): the path assumes Google Drive is mounted at /content/drive; no mount cell is visible in this notebook.
save_as_pickle("/content/drive/MyDrive/Jobs Assignment/Allied Media/Final Folder/doc_graph100.pkl", G)

In [None]:
import pickle
# Reload the saved graph (replaces the in-memory `G`).
G=load_pickle("/content/drive/MyDrive/Jobs Assignment/Allied Media/Final Folder/doc_graph100.pkl")

In [None]:
import networkx as nx
# Adjacency matrix of G, using each edge's "weight" attribute.
# to_numpy_matrix was removed in networkx 3.0; to_numpy_array is its replacement.
A = nx.to_numpy_array(G, weight="weight")
# A = A + I (add self-loops)
A = A + np.eye(G.number_of_nodes())
# Diagonal of D^(-1/2), built from unweighted node degrees.
degrees = []
# G.degree yields (node, degree) pairs; test the degree itself so that isolated
# nodes get 0 instead of raising on 0 ** -0.5.
for _, degree in G.degree(weight=None):
    if degree == 0:
        degrees.append(0)
    else:
        degrees.append(degree**(-0.5))
degrees = np.diag(degrees)
X = np.eye(G.number_of_nodes()) # Features are just the identity matrix (one-hot per node)
# Symmetrically normalised adjacency: D^(-1/2) (A + I) D^(-1/2)
A_hat = degrees@A@degrees
f = X


In [None]:
# One row/column per graph node (documents plus vocabulary words).
A_hat.shape

(5100, 5100)

In [None]:
# Draw the whole document-word graph with networkx's default layout and styling.
nx.draw(G)

## Pytorch Model

In [None]:
# Simplified GCN 
import torch
import torch.nn as nn
import torch.nn.functional as F
class GCNLayer(nn.Module):
    """Affine transform optionally followed by an in-place ReLU.

    The caller is expected to have multiplied the input by the adjacency
    matrix already; this layer only applies the weights.

    Args:
        in_dim: Size of each input feature vector.
        out_dim: Size of each output feature vector.
        acti: When True apply ReLU to the output; when False return the
            linear output unchanged.
    """

    def __init__(self, in_dim, out_dim, acti=True):
        super().__init__()
        # nn.Linear keeps its default bias term here.
        self.linear = nn.Linear(in_dim, out_dim)
        self.acti = nn.ReLU(inplace=True) if acti else None

    def forward(self, F):
        """Return ``linear(F)``, passed through ReLU when an activation is configured."""
        projected = self.linear(F)
        return projected if self.acti is None else self.acti(projected)


class GCN(nn.Module):
    """Two-layer graph convolutional network.

    Computes ``Linear2(A @ Dropout(ReLU(Linear1(A @ X))))`` and returns raw
    class scores (no softmax).

    Args:
        input_dim: Number of input features per node.
        hidden_dim: Width of the hidden layer.
        num_classes: Number of output scores per node.
        p: Dropout probability applied to the hidden representation.
    """

    def __init__(self, input_dim, hidden_dim, num_classes, p):
        super(GCN, self).__init__()
        self.gcn_layer1 = GCNLayer(input_dim, hidden_dim)
        self.gcn_layer2 = GCNLayer(hidden_dim, num_classes, acti=False)
        self.dropout = nn.Dropout(p)

    def forward(self, A, X):
        """Propagate node features through both layers.

        Args:
            A: Square adjacency matrix, as a NumPy array/matrix or a tensor.
            X: Node feature tensor with one row per node.

        Returns:
            Tensor of class scores, one row per node, on the model's device.
        """
        # Run everything on the device that holds the weights. Building A on the
        # CPU unconditionally breaks as soon as the model is moved to CUDA.
        device = self.gcn_layer1.linear.weight.device
        # as_tensor accepts arrays and tensors alike and skips the copy when A is
        # already a float32 tensor on the right device; detach() keeps the
        # adjacency out of the autograd graph, as before.
        A = torch.as_tensor(A, dtype=torch.float32, device=device).detach()
        X = X.to(device)
        hidden = torch.mm(A, X)
        hidden = self.gcn_layer1(hidden)
        hidden = self.dropout(hidden)
        hidden = torch.mm(A, hidden)
        output = self.gcn_layer2(hidden)
        return output

In [None]:
# Input width equals the number of graph nodes because X is an identity matrix;
# 100 hidden units, 20 output classes, dropout probability 0.1.
net1 = GCN(X.shape[1], hidden_dim=100,num_classes=20,p=0.1)

In [None]:
# Learning rate for Adam.
lr=0.01
# Cross-entropy over the raw class scores returned by the network.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net1.parameters(), lr=lr)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Move the model parameters to the selected device.
net1=net1.to(device)

In [None]:
# transforming from numpy array to tensor
f = torch.from_numpy(f).float()
y=torch.from_numpy(y).float()

In [None]:

num_epochs=100
losses_per_epoch=[]
# Number of leading nodes whose labels are used for the loss.
# Document nodes come first in the graph, so these are the first 70 documents.
n_train = 70
# Prepare the inputs once, on the same device as the model, instead of
# converting them again on every epoch.
features = f.to(device)
adjacency = torch.as_tensor(A_hat, dtype=torch.float32, device=device)
# CrossEntropyLoss needs integer class indices. The cast is done once, without
# torch.tensor(<tensor>), which emits a copy-construct warning.
labels = y.to(device=device, dtype=torch.long)
net1.train()
for epoch in range(num_epochs):
    output = net1(adjacency, features)
    loss = criterion(output[:n_train,:], labels[:n_train])
    losses_per_epoch.append(loss.item())
    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10 == 0:
        print (f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

  import sys


Epoch [10/100], Loss: 2.9723
Epoch [20/100], Loss: 2.9396
Epoch [30/100], Loss: 2.9043
Epoch [40/100], Loss: 2.8630
Epoch [50/100], Loss: 2.8146
Epoch [60/100], Loss: 2.7598
Epoch [70/100], Loss: 2.6972
Epoch [80/100], Loss: 2.6260
Epoch [90/100], Loss: 2.5457
Epoch [100/100], Loss: 2.4577


In [None]:
# Save only the model weights (state_dict), not the whole module object.
# NOTE(review): the path assumes Google Drive is mounted at /content/drive; no mount cell is visible in this notebook.
torch.save(net1.state_dict(), "/content/drive/MyDrive/Jobs Assignment/Allied Media/Final Folder/gcn_100_100.pth")

# The model is trained on only 100 data points for 100 epochs

# Citation


SEMI-SUPERVISED CLASSIFICATION WITH GRAPH CONVOLUTIONAL NETWORKS

Thomas N. Kipf, University of Amsterdam, T.N.Kipf@uva.nl
Max Welling, University of Amsterdam; Canadian Institute for Advanced Research (CIFAR), M.Welling@uva.nl
***********************************************************************
Graph Convolutional Networks for Text Classification

Liang Yao, Chengsheng Mao, Yuan Luo∗, Northwestern University, Chicago IL 60611, {liang.yao, chengsheng.mao, yuan.luo}@northwestern.edu

************************************************************************
