In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf

In [None]:
# Load the raw dataset from a path relative to the notebook's working directory.
# Later cells read its 'text' and 'class' columns.
DATA_PATH = 'Suicide_Detection1.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stemmer and the stopword collection once, instead of once per row
# (and, for the set, once per word) as the loop previously did.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# 'not' is taken out of the stopword list so it survives cleaning
# (presumably because negation carries signal for this task).
all_stopwords.remove('not')
stopword_set = set(all_stopwords)


def clean_text(raw_text):
    """Normalise one document into a space-joined string of stemmed tokens.

    Steps: replace every non-letter character with a space, lowercase,
    split on whitespace, drop stopwords (except 'not'), Porter-stem the rest.

    Parameters
    ----------
    raw_text : str
        The original document text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
    return ' '.join(ps.stem(word) for word in tokens if word not in stopword_set)


# Iterate over every row actually present in the frame rather than a
# hard-coded count, so the cell keeps working if the CSV size changes.
corpus = [clean_text(raw_text) for raw_text in data['text']]

data['clean_text'] = corpus


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
!pip install torch_geometric
!pip install tensorboardX

Collecting torch_geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.1 MB[0m [31m6.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.5.3
Collecting tensorboardX
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


In [None]:
pip install  dgl -f https://data.dgl.ai/wheels/torch-2.3/cu121/repo.html

Looking in links: https://data.dgl.ai/wheels/torch-2.3/cu121/repo.html


In [None]:
import dgl
import torch

# Node id layout: documents occupy [0, num_docs), word nodes follow directly after.
doc_nodes = list(data.index)
num_docs = len(doc_nodes)

# Vocabulary = every distinct token in the cleaned corpus.
# It is sorted so that each word gets the same node id on every run; plain
# list(set(...)) ordering changes between kernel sessions.
vocab = set()
for text in data['clean_text']:
    vocab.update(text.split())
vocab = sorted(vocab)
word_nodes = list(range(num_docs, num_docs + len(vocab)))
word_to_node_id = dict(zip(vocab, word_nodes))  # word -> graph node id

# One (document -> word) edge per token occurrence; a word repeated inside a
# document therefore contributes parallel edges.
src, dst = [], []
for doc_id, text in enumerate(data['clean_text']):
    for word in text.split():
        src.append(doc_id)
        dst.append(word_to_node_id[word])

# Build the graph in a single call through the public dgl.graph() factory.
# Edge order matches the previous incremental construction: all doc -> word
# edges first, then the reversed word -> doc edges.
num_nodes = num_docs + len(vocab)
graph = dgl.graph(
    (torch.tensor(src + dst, dtype=torch.int64),
     torch.tensor(dst + src, dtype=torch.int64)),
    num_nodes=num_nodes,
)
# Same node data as before: an all-zero 'node_feature' entry per node and a
# 'node_type' flag that is 0 for document nodes and 1 for word nodes.
graph.ndata['node_feature'] = torch.zeros(num_nodes)
graph.ndata['node_type'] = torch.cat([torch.zeros(num_docs), torch.ones(len(vocab))])

print('Number of nodes:', graph.number_of_nodes())
print('Number of edges:', graph.number_of_edges())
print('Node types:', graph.ndata['node_type'])



Number of nodes: 47290
Number of edges: 2369800
Node types: tensor([0., 0., 0.,  ..., 1., 1., 1.])


In [None]:
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch import GraphConv
class HeteroGraphNN(nn.Module):
    """Two stacked DGL ``GraphConv`` layers with a ReLU in between.

    Parameters
    ----------
    in_feats : int
        Size of each node's input feature vector.
    hidden_size : int
        Output size of the first convolution.
    num_classes : int
        Output size of the second convolution (one score per class).
    """

    def __init__(self, in_feats, hidden_size, num_classes):
        super().__init__()
        # Input features -> hidden representation.
        self.conv1 = GraphConv(in_feats, hidden_size)
        # Hidden representation -> per-class scores.
        self.conv2 = GraphConv(hidden_size, num_classes)

    def forward(self, g, features):
        """Run both convolutions over graph ``g`` and return the raw scores.

        No softmax is applied; the output of the second layer is returned as is.
        """
        hidden = self.conv1(g, features)
        hidden = F.relu(hidden)
        return self.conv2(g, hidden)
# Model hyperparameters: input feature width, hidden layer width, and the
# number of output classes.
in_feats, hidden_size, num_classes = 100, 128, 2
model = HeteroGraphNN(
    in_feats=in_feats,
    hidden_size=hidden_size,
    num_classes=num_classes,
)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words matrix over the cleaned corpus, limited to 1500 features.
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()

# Select the label column by name. `data.iloc[:, -1]` picked whatever column
# happened to be last, which is no longer the label once 'clean_text' (and
# later 'class_encoded') have been appended to the frame.
y = data['class'].values

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the document index with a fixed seed.
train_idx, test_idx = train_test_split(data.index, test_size=0.2, random_state=42)

# Boolean masks with one entry per document, all False to start with ...
train_mask, test_mask = (torch.zeros(len(data), dtype=torch.bool) for _ in range(2))
# ... then switched on at the positions belonging to each split.
test_mask[test_idx] = True
train_mask[train_idx] = True

In [None]:
from sklearn.preprocessing import LabelEncoder
import torch

# Map the string values in the 'class' column to integer ids and keep them
# alongside the original data.
label_encoder = LabelEncoder()
encoded_classes = label_encoder.fit_transform(data['class'])
data['class_encoded'] = encoded_classes

# Tensor of encoded class ids, one per document, for use as training targets.
labels = torch.tensor(data['class_encoded'].values)

In [None]:
# Cross-entropy over the raw class scores produced by the model.
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Give every node exactly one self-loop. dgl.add_self_loop() adds loops
# unconditionally, so existing ones are removed first; otherwise re-running
# this cell would stack duplicate self-loops onto the graph.
graph = dgl.add_self_loop(dgl.remove_self_loop(graph))

In [None]:
from sklearn.metrics import accuracy_score

# Random placeholder features for every node (documents and words alike).
# They are drawn from a dedicated, fixed-seed generator so that re-running the
# cell reproduces the same features instead of fresh noise, without touching
# the global RNG state.
feature_rng = torch.Generator().manual_seed(42)
features = torch.randn(graph.number_of_nodes(), in_feats, generator=feature_rng)

for epoch in range(200):
    model.train()
    logits = model(graph, features)
    # Only the first len(labels) nodes are documents and have labels.
    doc_logits = logits[:len(labels)]
    loss = loss_function(doc_logits[train_mask], labels[train_mask])
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        # Progress report on the held-out documents, using the predictions
        # computed before this epoch's optimizer step.
        _, indices = torch.max(doc_logits, dim=1)
        acc = accuracy_score(labels[test_mask].detach().numpy(),indices[test_mask].detach().numpy())
        print(f'Epoch {epoch}, Loss: {loss.item()}, Accuracy: {acc}')


Epoch 0, Loss: 1.6769903898239136, Accuracy: 0.4661165291322831
Epoch 10, Loss: 1.2263473272323608, Accuracy: 0.5981495373843461
Epoch 20, Loss: 0.9707304239273071, Accuracy: 0.6059014753688422
Epoch 30, Loss: 0.8132907152175903, Accuracy: 0.6289072268067016
Epoch 40, Loss: 0.6934655904769897, Accuracy: 0.6349087271817955
Epoch 50, Loss: 0.6034557223320007, Accuracy: 0.6431607901975493
Epoch 60, Loss: 0.5376977324485779, Accuracy: 0.657664416104026
Epoch 70, Loss: 0.48757871985435486, Accuracy: 0.6651662915728932
Epoch 80, Loss: 0.4485003650188446, Accuracy: 0.6694173543385846
Epoch 90, Loss: 0.4172058403491974, Accuracy: 0.6746686671667917
Epoch 100, Loss: 0.3915194869041443, Accuracy: 0.6801700425106276
Epoch 110, Loss: 0.36991795897483826, Accuracy: 0.6824206051512878
Epoch 120, Loss: 0.3516450524330139, Accuracy: 0.6889222305576395
Epoch 130, Loss: 0.33596330881118774, Accuracy: 0.6949237309327332
Epoch 140, Loss: 0.32221171259880066, Accuracy: 0.698174543635909
Epoch 150, Loss: 0.

In [None]:
# Final evaluation: score the held-out documents with the trained model.
model.eval()
with torch.no_grad():
    logits = model(graph, features)
    # Restrict to the document nodes and take the highest-scoring class for each.
    doc_logits = logits[:len(labels)]
    indices = torch.max(doc_logits, dim=1)[1]
    y_true = labels[test_mask].detach().numpy()
    y_pred = indices[test_mask].detach().numpy()
    acc = accuracy_score(y_true, y_pred)
    print(f'Test Accuracy: {acc}')

Test Accuracy: 0.7701925481370343
