In [1]:
from torch.nn import BCELoss

paragraph = """Abhishek
Waghchaure
Sai Vrundavan, flat no-703, Sr No-56/5B,
Behind Abhiruchi Mall, near Madhuli,
Sinhgad road, Vadgaon Bk., PUNE,
Maharashtra - 411041
Email: abhisw28@gmail.com
Mobile: 8668566528
Professional Summary
Data Scientist with a strong
foundation in data science, machine
learning, and analytics, reinforced by
hands-on experience in Java
development. Possessing an M.Tech
in Data Science and Analytics, skilled
in Python, data analysis, and
creating machine learning models.
Adept at translating complex data
sets into actionable insights and
committed to continuous learning
and development in the data science
field.
Technical Skills
Programming Languages: Python,
Java
Data Science & Analytics: Machine
Learning, Data Analytics, Data
Science, Natural Language
Processing, Computer Vision,
Business Analytics, Business
Intelligence, Pandas, NumPy
Web Development: HTML, CSS,
JavaScript, React JS
Frameworks & Tools: Spring Boot,
Hibernate, RESTful API, J2EE,
OpenCV, TensorFlow
Languages: Marathi, English, Hindi
Experience
Java Developer | Vinz Global
Jul 2022 - Dec 2023
Outsourced to NICE, developed backend using
Spring Microservices.
Upgraded legacy systems from Java Servlets
and JSPs to Spring and AngularJS.
Migrated SQL queries to HQL in the DAO layer.
Java Developer | Aventior Digital Pvt Ltd
Oct 2021 - Jun 2022
Developed and improved APIs for a healthcare
project.
Participated in the full SDLC including analysis,
design, implementation, testing, and
maintenance.
Utilized Java, Spring Boot, Hibernate, REST, JSP,
JavaScript, and JQuery.
Intern Java Developer | Coding Bit
Jun 2021 - Jul 2021
Worked on a variety of small projects to
enhance coding and development skills.
Education
M.Tech in Computer Science & Engineering - Data
Science and Analytics
MIT World Peace University, Pune, 2024
PG Diploma. CDAC
Institute for Advanced Computing and Software
Development, Akurdi, Pune, 2022
B.Tech in Computer Science & Engineering -
MIT AOE, Alandi
Projects
Skin Cancer Detection and Classification using Deep
Learning:
Developed a novel approach using CNN architectures
(DenseNet201, VGG16, Xception).
Achieved high accuracy in classifying skin lesions,
contributing to early diagnosis.
Automatic Number Plate Detection using OpenCV
and OCR:
Implemented a detection system using the
Haarcascade model and EasyOCR.
Converted number plate images into text format.
Diamond Price Prediction using Linear Regression:
Predicted diamond prices based on various features
using a linear regression model.
Conducted comprehensive data analysis and model
training."""

In [2]:
import nltk
from nltk.corpus import stopwords
import torch
import torch.nn as nn
import re
from torch.utils.data import DataLoader, Dataset, TensorDataset
from tqdm import tqdm
import torch.optim as optim

In [3]:
device = torch.device = 'mps'

In [4]:
class text_data(Dataset):
    def __init__(self,data):
        super(text_data,self).__init__()
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]

In [5]:
def preprocessing(paragraph):
    sentences = nltk.sent_tokenize(paragraph)
    corpus = []
    for sent in sentences:
        review = re.sub("[^a-zA-z]"," ", sent)
        review = review.lower()
        words = nltk.word_tokenize(review)
        words = [word for word in words if word not in set(stopwords.words('english'))]
        preprocessed_text = " ".join(words)
        corpus.append(preprocessed_text)
    return corpus

In [6]:
corpus = preprocessing(paragraph)
print(corpus)

['abhishek waghchaure sai vrundavan flat sr b behind abhiruchi mall near madhuli sinhgad road vadgaon bk pune maharashtra email abhisw gmail com mobile professional summary data scientist strong foundation data science machine learning analytics reinforced hands experience java development', 'possessing tech data science analytics skilled python data analysis creating machine learning models', 'adept translating complex data sets actionable insights committed continuous learning development data science field', 'technical skills programming languages python java data science analytics machine learning data analytics data science natural language processing computer vision business analytics business intelligence pandas numpy web development html css javascript react js frameworks tools spring boot hibernate restful api j ee opencv tensorflow languages marathi english hindi experience java developer vinz global jul dec outsourced nice developed backend using spring microservices', 'upgr

In [7]:
def embedding_generator(input_corpus, embedding_dim = 10, max_len = 20):
    all_words = list(set(word for sent in input_corpus for word in nltk.word_tokenize(sent)))
    # all_words = [words for sent in input_corpus for words in nltk.word_tokenize(sent)]
    words_to_idx = {word : idx for idx, word in enumerate(all_words)}
    
    # indexed_sentences = [[words_to_idx[word] for word in nltk.word_tokenize(sent) ]for sent in input_corpus]
    indexed_sentences = [[words_to_idx[word] for word in nltk.word_tokenize(sent) if word in words_to_idx] for sent in input_corpus]
    
    padded_sentences = [sent[:max_len] + [0] * (max_len - len(sent)) if len(sent) < max_len else sent[:max_len] for sent in indexed_sentences]
    
    
    embedding_layer = nn.Embedding( num_embeddings = len(words_to_idx), embedding_dim = embedding_dim)
    
    embedded_corpus = []
    for sentence in padded_sentences:
        inputs = torch.tensor(sentence, dtype =torch.long)
        outputs = embedding_layer(inputs)
        embedded_corpus.append(outputs)
        
    return embedded_corpus, words_to_idx
    
    # print(all_words)
    # print(words_to_idx)
    # print(indexed_sentences)
    # print(len(embedded_corpus[0]))
    # print(len(indexed_sentences[0]))

In [8]:
embedded_corpus, word_to_idx = embedding_generator(corpus)

In [9]:
vocab_size = len(word_to_idx)

In [10]:
input_size = 10
num_layers = 2
hidden_size = 32
batch_size = 6
sequence_length = 20
epochs = 10
lr = 0.001

import torch
import torch.nn as nn

class Generator(nn.Module):
    def __init__(self, input_size, hidden_size, vocab_size, sequence_length):
        super(Generator, self).__init__()
        self.hidden_size = hidden_size
        self.sequence_length = sequence_length
        # RNN layer for generating sequences
        self.rnn = nn.GRU(input_size, hidden_size, batch_first=True)
        # Output layer to transform hidden states to vocabulary space
        self.output_layer = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), self.hidden_size).to(x.device)  # Initial hidden state
        out, _ = self.rnn(x, h0)
        out = self.output_layer(out)
        return out  # Shape: [batch_size, seq_length, vocab_size]


class Discriminator(nn.Module):
    def __init__(self, input_size, hidden_size, sequence_length):
        super(Discriminator, self).__init__()
        self.hidden_size = hidden_size
        self.sequence_length = sequence_length
        # RNN layer for sequence processing
        self.rnn = nn.GRU(input_size, hidden_size, batch_first=True)
        # Output layer for binary classification
        self.output_layer = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), self.hidden_size).to(x.device)  # Initial hidden state
        out, _ = self.rnn(x, h0)
        # Take the last hidden state for binary classification
        out = self.output_layer(out[:, -1, :])  # Shape: [batch_size, 1]
        return self.sigmoid(out)

In [11]:
data_loader = DataLoader(embedded_corpus, batch_size = batch_size)

In [12]:
for data in data_loader:
    print(data.size())

torch.Size([6, 20, 10])
torch.Size([6, 20, 10])
torch.Size([5, 20, 10])


In [13]:
## Training Loop for discriminator
def train_gan(generator, discriminator, dataloader, num_epochs = epochs, lr = lr):
    criterion = BCELoss()
    g_optimizer = optim.Adam(generator.parameters(), lr = lr)
    d_optimizer = optim.Adam(discriminator.parameters(), lr = lr)
    
    for epoch in range(num_epochs):
        epoch_d_loss =0.0
        epoch_g_loss=0.0
        total_batches =len(data_loader)
        
        pbar = tqdm(total =total_batches, unit = 'batch')
        for real_data in data_loader:
            real_data = real_data.to(device= device)
            
            # Train Descriminator
            d_optimizer.zero_grad()
            real_labels = torch.ones(real_data.size(0), 1).to(device=device)
            d_real_loss = criterion(discriminator(real_data), real_labels)
            
            # Fake Data Generation
            noise = torch.randn(real_data.size(0), generator.sequence_length, generator.hidden_size).to(device=device)
            fake_data = generator(noise)
            fake_labels = torch.zeros(fake_data.size(0),1).to(device=device)
            d_fake_loss =  criterion(discriminator(fake_data),fake_labels)
            
            #Combining and back propogating the descriminator loss
            d_loss = d_real_loss + d_fake_loss
            d_loss.backward()
            d_optimizer.step()
            
            #Train Generator
            g_optimizer.zero_grad()
            g_loss = criterion (discriminator(fake_data), real_labels)
            g_loss.backward()
            g_optimizer.step()
            
            epoch_d_loss = epoch_d_loss + d_loss.items()
            epoch_g_loss = epoch_g_loss + g_loss.items()
            
            pbar.desc(f'Epoch {epoch + 1}/{num_epochs}')
            pbar.set_postfix({'D Loss': d_loss.item(), 'G Loss': g_loss.item()})  # Display current losses
            pbar.update(1)  
            
        avg_d_loss = epoch_d_loss / total_batches  # Average discriminator loss
        avg_g_loss = epoch_g_loss / total_batches  # Average generator loss

        print(f"Epoch [{epoch+1}/{num_epochs}], Avg D Loss: {avg_d_loss:.4f}, Avg G Loss: {avg_g_loss:.4f}")  # Print average losses
        
        pbar.close()  # Close the progress bar
            
            


In [14]:
generator = Generator(input_size, num_layers,hidden_size, sequence_length ).to(device = device)
discriminator = Discriminator(input_size, hidden_size, num_layers).to(device = device)

In [15]:
train_gan(generator, discriminator, dataloader = data_loader,num_epochs = epochs, lr = lr)

  0%|          | 0/3 [00:00<?, ?batch/s]

RuntimeError: input.size(-1) must be equal to input_size. Expected 10, got 2