In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/twitter-and-reddit-sentimental-analysis-dataset/Twitter_Data.csv
/kaggle/input/twitter-and-reddit-sentimental-analysis-dataset/Reddit_Data.csv


In [2]:
import torch
import torch.nn.functional as F
import re
from collections import Counter
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset,DataLoader
from torch import optim
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import tqdm as tqdm

In [3]:
df=pd.read_csv('/kaggle/input/twitter-and-reddit-sentimental-analysis-dataset/Reddit_Data.csv')

In [4]:
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


# Dataset
- The data set has already been cleaned.
- The outcomes are 0 -1 1. 0 is neutral, -1 is neg, 1 is positive.
  

In [5]:
df.shape

(37249, 2)

In [6]:
df.describe()

Unnamed: 0,category
count,37249.0
mean,0.202771
std,0.778515
min,-1.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [7]:
df.isnull().sum()

clean_comment    100
category           0
dtype: int64

In [8]:
df=df.dropna()

In [9]:
df.shape

(37149, 2)

In [10]:
df.category=df.category.replace(-1,2)

# Preparing features
- Create X,y
- Create train,val,test split
- Create tokens
- Create a vocabulary
- Create encoder function
- Create a torch dataset
- Create a dataloader

In [11]:
X=df['clean_comment']

In [12]:
y=df['category']

In [13]:
X=np.asarray(X)
y=np.asarray(y)

In [14]:
np.unique(y)

array([0, 1, 2])

In [15]:
X_train,X_temp,y_train,y_temp=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)
X_val,X_test,y_val,y_test=train_test_split(X_temp,y_temp,test_size=0.5,random_state=42,stratify=y_temp)

In [16]:
X_train.shape,X_val.shape

((29719,), (3715,))

In [17]:
y_train.shape,y_val.shape

((29719,), (3715,))

In [18]:
X_test.shape,y_test.shape

((3715,), (3715,))

In [19]:
print(X_train[:1])

[' doubt upa did not manage the finances properly but the inflation during upa1 was also partly due the prevailing market conditions where rbi had for omo purchases which effect led inflation rbi reportedly bought bonds close 6lac crores during the period 2008 2013 but the indiscretion showed upa further worsened ']


In [20]:
def tokenize(text):
    return re.findall(r'\b\w+\b',text.lower())

In [21]:
tokens=[token for text in X_train for token in tokenize(text)]

In [22]:
print(tokens[:10])

['doubt', 'upa', 'did', 'not', 'manage', 'the', 'finances', 'properly', 'but', 'the']


In [23]:
vocab={'<PAD>':0,'<UNK>':1}

In [24]:
vocab.update({word : i+2 for i,(word,_) in enumerate(Counter(tokens).most_common(20000))})

In [25]:
len(vocab)

20002

In [26]:
def encode(text,seq_length=128):
    tokens=[vocab.get(token,vocab["<UNK>"]) for token in tokenize(text)]
    tokens=tokens[:seq_length]
    if(len(tokens)<seq_length):
        tokens+=[vocab['<PAD>']]*(seq_length-len(tokens))
    return tokens

In [27]:
text="I hate the way logitech does its business"
encode(text)

[1,
 205,
 2,
 81,
 14372,
 96,
 73,
 432,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [28]:
class Reviews(Dataset):
    def __init__(self,texts,labels):
        super().__init__()
        self.texts=list(texts)
        self.labels=list(labels)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self,idx):
        return torch.tensor(encode(self.texts[idx],128)),torch.tensor(self.labels[idx])
        

In [29]:
def collate_fn(batch):
    texts,labels=zip(*batch)
    texts_padded=pad_sequence(texts,batch_first=True,padding_value=0)
    return texts_padded,torch.tensor(labels)

In [30]:
train_data=Reviews(X_train,y_train)
val_data=Reviews(X_val,y_val)
test_data=Reviews(X_test,y_test)

In [31]:
BATCH_SIZE=256

In [32]:
train_dataloader=DataLoader(train_data,shuffle=True,batch_size=BATCH_SIZE,collate_fn=collate_fn)
val_dataloader=DataLoader(val_data,batch_size=BATCH_SIZE,collate_fn=collate_fn,)
test_dataloader=DataLoader(test_data,batch_size=BATCH_SIZE,collate_fn=collate_fn)

In [33]:
for batch,(X,y) in enumerate(train_dataloader):
    print(f"The batch is {batch}")
    print(f"The embedding vector is {X}")
    print(f"The label is {y}")
    print(f"The shape of the embedding is {X.shape}") #batchsize, seq_length
    print(f"The shape of the label is {y.shape}")
    if(batch==2):
        break

The batch is 0
The embedding vector is tensor([[   11,    87,    53,  ...,     0,     0,     0],
        [   17,     2,   151,  ...,     0,     0,     0],
        [ 6016,     6,   122,  ...,     0,     0,     0],
        ...,
        [  110, 10952,     0,  ...,     0,     0,     0],
        [   18,    58,   440,  ...,     0,     0,     0],
        [   36,    13,     1,  ...,     0,     0,     0]])
The label is tensor([1, 2, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 2, 0, 0, 0,
        2, 0, 0, 0, 1, 1, 1, 0, 2, 1, 0, 1, 2, 1, 0, 2, 2, 2, 0, 2, 1, 0, 1, 0,
        1, 1, 1, 1, 0, 1, 1, 2, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 0, 1, 2, 0, 1, 2,
        1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 2, 0, 0, 1, 0, 0, 2,
        1, 1, 0, 1, 1, 1, 1, 2, 0, 0, 1, 0, 1, 1, 2, 1, 0, 0, 2, 0, 0, 2, 2, 0,
        1, 0, 2, 2, 2, 0, 2, 0, 0, 1, 1, 1, 0, 0, 2, 1, 0, 1, 1, 0, 2, 1, 2, 2,
        0, 0, 0, 0, 2, 2, 2, 1, 2, 1, 0, 1, 2, 1, 1, 1, 1, 2, 1, 0, 0, 0, 1, 2,
        2, 1, 0, 1, 1, 1, 

# Make a CNN-GRU Model
- Make the class
- Set up training , testing loops

In [34]:
class CNNGRU(nn.Module):
    def __init__(self,vocab_size,embedding_dim,hidden_dim,output_dim,num_layers,pad_idx):
        super().__init__()
        self.embedding=nn.Embedding(vocab_size,embedding_dim,padding_idx=pad_idx)
        self.conv1=nn.Conv1d(in_channels=embedding_dim,out_channels=64,kernel_size=3,padding=1)
        self.batch_norm1=nn.BatchNorm1d(64)
        self.conv2=nn.Conv1d(in_channels=64,out_channels=32,kernel_size=3,padding=1)
        self.batch_norm2=nn.BatchNorm1d(32)
        self.gru=nn.GRU(input_size=32,hidden_size=hidden_dim,num_layers=num_layers,
                        dropout=0.3,batch_first=True)
        self.fc=nn.Linear(hidden_dim,output_dim)

    def forward(self,x):
        #[batch_size, seq_length]
        embeds=self.embedding(x) #(batch_size,seq_elengh,embed_dim)
        embeds=embeds.permute(0,2,1) #batch_size,embed_dim,seq_length
        x=F.relu(self.conv1(embeds))
        x=self.batch_norm1(x)
        x=F.relu(self.conv2(x))
        x = self.batch_norm2(x)
        x = x.permute(0, 2, 1)
        output, hidden = self.gru(x)
        last_hidden = hidden[-1]
        out = self.fc(last_hidden)
        return out


In [35]:
device='cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [36]:
VOCAB_SIZE=len(vocab)
EMBED_DIM=400
HIDDEN_DIM=64
OUTPUT_DIM=3
PAD_IDX=0

In [96]:
model=CNNGRU(VOCAB_SIZE,EMBED_DIM,HIDDEN_DIM,OUTPUT_DIM,num_layers=3,pad_idx=PAD_IDX)

In [97]:
model=model.to(device)

In [98]:
model.state_dict

<bound method Module.state_dict of CNNGRU(
  (embedding): Embedding(20002, 400, padding_idx=0)
  (conv1): Conv1d(400, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (batch_norm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv1d(64, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  (batch_norm2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (gru): GRU(32, 64, num_layers=3, batch_first=True, dropout=0.3)
  (fc): Linear(in_features=64, out_features=3, bias=True)
)>

In [99]:
from torch.optim import Adam

In [100]:
loss_fn=nn.CrossEntropyLoss().to(device)
optim=Adam(params=model.parameters(),lr=1e-2)

In [54]:
def acc_fn(pred,label):
    batch_size,_=pred.shape
    predicted_classes=pred.argmax(dim=-1)
    correct=predicted_classes.eq(label).sum()
    acc=correct/batch_size
    return acc

In [55]:
def train(model,dataloader,loss_fn,optim,device):
    train_loss=[]
    train_acc=[]
    model.train()
    for X,y in tqdm.tqdm(dataloader,desc='TRAINING'):
        X=X.to(device)
        y=y.to(device)
        y_pred=model(X)
        loss=loss_fn(y_pred,y)
        train_loss.append(loss.item())
        acc=acc_fn(y_pred,y)
        train_acc.append(acc.item())
        optim.zero_grad()
        loss.backward()
        optim.step()
    return np.mean(train_loss),np.mean(train_acc)

In [56]:
def test(model,dataloader,loss_fn,device):
    test_loss=[]
    test_acc=[]
    model.eval()
    with torch.inference_mode():
        for X,y in tqdm.tqdm(dataloader,desc='EVALUATING'):
            X=X.to(device)
            y=y.to(device)
            pred=model(X)
            loss=loss_fn(pred,y)
            test_loss.append(loss.item())
            acc=acc_fn(pred,y)
            test_acc.append(acc.item())

    return np.mean(test_loss),np.mean(test_acc)
            
        

In [45]:
epochs=20
for i in range(epochs):
    
    train_loss,train_acc=train(model,train_dataloader,loss_fn,optim,device)
    
    test_loss,test_acc=test(model,val_dataloader,loss_fn,device)
    print(f"For epoch :{i}")
    print(f"Train loss : {train_loss}, Train acc : {train_acc}")
    print(f"Val loss : {test_loss}, Val acc : {test_acc}")

TRAINING: 100%|██████████| 117/117 [00:04<00:00, 29.24it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 56.66it/s]


For epoch :0
Train loss : 1.024981383075062, Train acc : 0.46796758867736554
Val loss : 0.9235113104184468, Val acc : 0.5780474702517192


TRAINING: 100%|██████████| 117/117 [00:02<00:00, 46.83it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 60.00it/s]


For epoch :1
Train loss : 0.6167926951351329, Train acc : 0.741840568363157
Val loss : 0.4537093500296275, Val acc : 0.8432311534881591


TRAINING: 100%|██████████| 117/117 [00:02<00:00, 46.36it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 59.92it/s]


For epoch :2
Train loss : 0.3300126146556985, Train acc : 0.8773109439091805
Val loss : 0.3568248947461446, Val acc : 0.8759243806203206


TRAINING: 100%|██████████| 117/117 [00:02<00:00, 44.35it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 59.69it/s]


For epoch :3
Train loss : 0.24161573085520002, Train acc : 0.9140915320469782
Val loss : 0.3589925189812978, Val acc : 0.892851463953654


TRAINING: 100%|██████████| 117/117 [00:02<00:00, 46.60it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 58.98it/s]


For epoch :4
Train loss : 0.16969640562549615, Train acc : 0.9428752670940171
Val loss : 0.30962363878885907, Val acc : 0.9019302646319072


TRAINING: 100%|██████████| 117/117 [00:02<00:00, 46.60it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 54.12it/s]


For epoch :5
Train loss : 0.12649314064118597, Train acc : 0.9601942820426745
Val loss : 0.32485611041386925, Val acc : 0.8993260979652404


TRAINING: 100%|██████████| 117/117 [00:02<00:00, 46.80it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 60.99it/s]


For epoch :6
Train loss : 0.09634465450405055, Train acc : 0.971312070504213
Val loss : 0.4278204600016276, Val acc : 0.8909927646319071


TRAINING: 100%|██████████| 117/117 [00:02<00:00, 47.55it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 60.58it/s]


For epoch :7
Train loss : 0.08326734948712282, Train acc : 0.9749265491452992
Val loss : 0.3546858588854472, Val acc : 0.9091742197672527


TRAINING: 100%|██████████| 117/117 [00:02<00:00, 44.18it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 61.21it/s]


For epoch :8
Train loss : 0.053746381097942844, Train acc : 0.9850093482905983
Val loss : 0.38166950941085814, Val acc : 0.9094704190889994


TRAINING: 100%|██████████| 117/117 [00:02<00:00, 46.43it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 60.12it/s]


For epoch :9
Train loss : 0.04581337863515712, Train acc : 0.9871417457221919
Val loss : 0.3952002783616384, Val acc : 0.9061088601748148


TRAINING: 100%|██████████| 117/117 [00:02<00:00, 47.18it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 59.68it/s]


For epoch :10
Train loss : 0.04014209071054864, Train acc : 0.9890447905939869
Val loss : 0.3859871099392573, Val acc : 0.9073870857556661


TRAINING: 100%|██████████| 117/117 [00:02<00:00, 47.51it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 60.79it/s]


For epoch :11
Train loss : 0.03343141161534203, Train acc : 0.9906517094017094
Val loss : 0.4024312913417816, Val acc : 0.8970062017440796


TRAINING: 100%|██████████| 117/117 [00:02<00:00, 46.69it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 37.00it/s]


For epoch :12
Train loss : 0.03464703990393279, Train acc : 0.9905515491452992
Val loss : 0.4524066557486852, Val acc : 0.9047948479652405


TRAINING: 100%|██████████| 117/117 [00:02<00:00, 46.52it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 60.84it/s]


For epoch :13
Train loss : 0.03516628217692368, Train acc : 0.9901421987093412
Val loss : 0.40912293990453086, Val acc : 0.9107844312985738


TRAINING: 100%|██████████| 117/117 [00:02<00:00, 46.54it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 59.24it/s]


For epoch :14
Train loss : 0.03483876397705868, Train acc : 0.9900507478632479
Val loss : 0.4614772955576579, Val acc : 0.8972427646319071


TRAINING: 100%|██████████| 117/117 [00:02<00:00, 47.03it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 60.21it/s]


For epoch :15
Train loss : 0.034690476118181, Train acc : 0.9906807414486877
Val loss : 0.464696991443634, Val acc : 0.9011847972869873


TRAINING: 100%|██████████| 117/117 [00:02<00:00, 46.26it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 59.68it/s]


For epoch :16
Train loss : 0.03952682211310563, Train acc : 0.988477215807662
Val loss : 0.4295754909515381, Val acc : 0.9009005268414815


TRAINING: 100%|██████████| 117/117 [00:02<00:00, 46.87it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 60.15it/s]


For epoch :17
Train loss : 0.03313745299760157, Train acc : 0.9906183226495726
Val loss : 0.4511947840452194, Val acc : 0.9006520350774129


TRAINING: 100%|██████████| 117/117 [00:02<00:00, 47.00it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 59.45it/s]


For epoch :18
Train loss : 0.02802488631091248, Train acc : 0.9920539529914529
Val loss : 0.4852957288424174, Val acc : 0.9060850024223328


TRAINING: 100%|██████████| 117/117 [00:02<00:00, 45.00it/s]
EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 60.17it/s]

For epoch :19
Train loss : 0.03077683126569813, Train acc : 0.9912192841880342
Val loss : 0.44602895180384317, Val acc : 0.9016817768414815





In [46]:
test_loss,test_acc=test(model,test_dataloader,loss_fn,device)
test_loss,test_acc

EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 59.79it/s]


(0.41314339836438496, 0.9092815677324931)

# Using Early Stopping

In [47]:
!pip install pytorch-ignite

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=1.3->pytorch-ignite)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=1.3->pytorch-ignite)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=1.3->pytorch-ignite)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=1.3->pytorch-ignite)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=1.3->pytorch-ignite)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch<3,>=1.3->pytorch-ignite)
  Downloading nvidia_cufft_cu12-11.2.1.3-py

In [101]:
def score_function(engine):
    val_loss = engine.state.metrics['loss']
    return -val_loss  # Ignite expects higher scores as 'better'


In [102]:
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Accuracy,Loss

trainer = create_supervised_trainer(model, optim, loss_fn, device=device)
evaluator = create_supervised_evaluator(model, metrics={'loss': Loss(loss_fn),'acc' : Accuracy()}, device=device)


In [103]:
from ignite.handlers import EarlyStopping

handler = EarlyStopping(
    patience=5,                   # epochs to wait for improvement
    score_function=score_function,
    trainer=trainer
)

# Attach handler to the evaluator; runs after each validation epoch
evaluator.add_event_handler(Events.COMPLETED, handler)


<ignite.engine.events.RemovableEventHandle at 0x7a97bd703dd0>

In [104]:
@trainer.on(Events.EPOCH_COMPLETED)
def run_validation(engine):
    evaluator.run(val_dataloader)

trainer.run(train_dataloader, max_epochs=100)


2025-08-06 09:38:14,382 ignite.handlers.early_stopping.EarlyStopping INFO: EarlyStopping: Stop training


State:
	iteration: 1170
	epoch: 10
	epoch_length: 117
	max_epochs: 100
	output: 0.009516171179711819
	batch: <class 'tuple'>
	metrics: <class 'dict'>
	dataloader: <class 'torch.utils.data.dataloader.DataLoader'>
	seed: <class 'NoneType'>
	times: <class 'dict'>

In [105]:
metrics = evaluator.state.metrics
print(metrics)
print(metrics['acc'])
print(metrics['loss'])     

{'loss': 0.39530458103760097, 'acc': 0.9004037685060565}
0.9004037685060565
0.39530458103760097


In [74]:
torch.save(model.state_dict(), 'cnn_gru_best.pth')

In [106]:
test_loss,test_acc=test(model,test_dataloader,loss_fn,device)
test_loss,test_acc

EVALUATING: 100%|██████████| 15/15 [00:00<00:00, 58.75it/s]


(0.35991743008295696, 0.9118857343991598)