In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim
import gensim.downloader
from sklearn.metrics import accuracy_score


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Text preprocessing

In [None]:
class Textpreprocessing:
  """Cleaning pipeline for a pandas Series of raw text messages.

  ``preprocess`` applies these steps, in this order, to every element:
  keep the first ``max_words`` whitespace-separated tokens, lower-case and
  drop English stop words, strip URLs, strip punctuation, strip digits,
  lemmatize. Truncation happens first, so a cleaned message can contain
  fewer than ``max_words`` tokens.

  Args:
    text: pandas Series of strings (anything exposing ``.apply``).
    max_words: number of leading tokens kept by ``words`` (default 4).
  """

  # Class-level default so ``words`` also works on an instance whose
  # ``__init__`` was bypassed.
  max_words=4
  # Built lazily on first use and shared by all instances, so the NLTK
  # stop-word corpus is read once instead of once per message.
  _stop_words=None
  _lemmatizer=None

  def __init__(self,text,max_words=4):
    self.text=text
    self.max_words=max_words

  def words(self,str1):
    """Return the first ``max_words`` tokens of ``str1`` joined by single spaces."""
    return ' '.join(str1.split()[:self.max_words])

  def stopword_remove(self,str1):
    """Lower-case ``str1`` and drop NLTK English stop words.

    Returns the surviving tokens joined by single spaces (no trailing space).
    """
    if Textpreprocessing._stop_words is None:
      # A set gives O(1) membership tests; the NLTK list is scanned linearly.
      Textpreprocessing._stop_words=frozenset(stopwords.words('english'))
    stop_words=Textpreprocessing._stop_words
    return ' '.join(word for word in str1.lower().split() if word not in stop_words)

  def url_remove(self,str1):
    """Delete every run of non-space characters starting with 'http' or 'www'."""
    # Two separate passes, in this order, exactly as before.
    str1=re.sub(r'http\S+', '', str1)
    str1=re.sub(r'www\S+', '', str1)
    return str1

  def clean_punctuation(self,str1):
    """Delete every character that is neither a word character nor whitespace."""
    # \w includes the underscore, so underscores are kept.
    return re.sub(r'[^\w\s]','',str1)

  def cleaningdigits(self,str1):
    """Delete all digits from ``str1``."""
    # The previous pattern '[\d+]' was a character class that also deleted
    # literal '+' characters; '\d+' removes digit runs only.
    return re.sub(r'\d+','',str1)

  def lemmatization(self,str1):
    """Lemmatize each whitespace-separated token with WordNet (default POS).

    Returns the lemmas joined by single spaces (no trailing space).
    """
    if Textpreprocessing._lemmatizer is None:
      Textpreprocessing._lemmatizer=WordNetLemmatizer()
    lemma=Textpreprocessing._lemmatizer
    return ' '.join(lemma.lemmatize(word) for word in str1.split())

  def preprocess(self):
    """Run every cleaning step over ``self.text`` and return the cleaned Series.

    ``self.text`` is overwritten after each step, so calling this twice
    re-processes already cleaned text.
    """
    pipeline=(
      self.words,
      self.stopword_remove,
      self.url_remove,
      self.clean_punctuation,
      self.cleaningdigits,
      self.lemmatization,
    )
    for step in pipeline:
      self.text=self.text.apply(step)

    return self.text

In [None]:
def convert_to_number(label):
  """Encode a category label as an integer: 1 for 'spam', 0 for anything else."""
  return 1 if label=='spam' else 0

In [None]:
# Turn the Google Drive share link into a direct-download URL: the file id is
# the second-to-last '/'-separated segment of the share link.
url='https://drive.google.com/file/d/1xEyhcHjrjEo62k84kKu6mI9vttGtc-jV/view?usp=share_link'
url='https://drive.google.com/uc?id=' + url.split('/')[-2]
df = pd.read_csv(url)
# Drop exact duplicate rows, then rows missing either column used below.
df=df.drop_duplicates()
df=df[df['Message'].notnull()]
df=df[df['Category'].notnull()]

# head is a method and must be called; printing the bare attribute shows the
# bound-method repr instead of the first five rows.
print(df.head())

<bound method NDFrame.head of      Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5157 rows x 2 columns]>


In [None]:
# Clean every message with the Textpreprocessing pipeline and write the
# cleaned text back into the frame.
text_preprocess = Textpreprocessing(df['Message'])
df['Message'] = text_preprocess.preprocess()
# Number of rows in the frame.
document_size = df.shape[0]



In [None]:
# Numeric target column: 1 for 'spam', 0 otherwise (see convert_to_number).
df['Numeric_label'] = df['Category'].map(convert_to_number)


In [None]:
# Pre-trained word vectors loaded through gensim's downloader
# ('glove-twitter-25'; the model code below uses input_dim=25 to match).
new_model=gensim.downloader.load('glove-twitter-25')
# Embedding dimensionality, used to size the per-message feature matrix.
# Read from the model itself rather than from the length of one looked-up word.
wv_size=new_model.vector_size
# Sanity check of the loaded vectors. Kept as the cell's last expression so
# the notebook displays the result instead of discarding it.
new_model.most_similar("bad")

In [None]:
def create_word2vec_for_setence(text,max_word,embeddings=None,vector_size=None):
  """Embed a sentence as a fixed-size (max_word, vector_size) float32 tensor.

  Row ``j`` holds the embedding of the ``j``-th whitespace-separated token.
  Tokens beyond ``max_word`` are ignored; rows for out-of-vocabulary tokens
  and for positions past the end of the sentence stay all-zero.

  Args:
    text: sentence to embed.
    max_word: number of rows (token positions) in the result.
    embeddings: mapping from token to vector that raises ``KeyError`` for
      unknown tokens. Defaults to the notebook-level ``new_model``.
    vector_size: length of each embedding vector. Defaults to the
      notebook-level ``wv_size``.

  Returns:
    torch.Tensor of shape (max_word, vector_size), dtype float32.
  """
  if embeddings is None:
    embeddings=new_model
  if vector_size is None:
    vector_size=wv_size

  results=np.zeros(shape=(max_word,vector_size))
  # Truncate first: writing to row j >= max_word would raise IndexError.
  for j,considered_word in enumerate(text.split()[:max_word]):
    try:
      results[j,:]=embeddings[considered_word]
    except KeyError:
      # Out-of-vocabulary token: leave the zero row in place.
      pass
  return torch.tensor(results,dtype=torch.float32)

In [None]:
class MyDataset(Dataset):
  """Dataset of (sentence embedding matrix, label) pairs built from a DataFrame.

  Reads the 'Message' and 'Numeric_label' columns of ``df``. Each item is
  embedded on access with ``create_word2vec_for_setence``.

  Args:
    df: DataFrame with 'Message' (str) and 'Numeric_label' columns.
    max_word: number of token positions per embedded message (default 4,
      the value that was previously hard-coded in ``__getitem__``).
  """
  def __init__(self,df,max_word=4):
    super().__init__()
    self.texts=df['Message'].values
    self.labels=df['Numeric_label'].values
    self.max_word=max_word

  def __len__(self):
    """Number of messages in the dataset."""
    return len(self.texts)

  def __getitem__(self,idx):
    """Return ``(word2vec, label)`` for row ``idx``; the label is a float32 scalar tensor."""
    word2vec=create_word2vec_for_setence(self.texts[idx],self.max_word)
    label=torch.tensor(self.labels[idx],dtype=torch.float32)

    return word2vec,label

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable.
train_dataset, test_dataset = train_test_split(df, test_size=0.2, random_state=42)

# Wrap each split in a Dataset, then batch it. Only the training loader shuffles.
train_my_dataset, test_my_dataset = MyDataset(train_dataset), MyDataset(test_dataset)
train_dataloader = DataLoader(dataset=train_my_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(dataset=test_my_dataset, batch_size=32, shuffle=False)

In [None]:
class MyModel2(nn.Module):
  """Binary classifier: a vanilla RNN followed by a three-layer sigmoid MLP.

  Args:
    input_dim: size of each input embedding vector.
    hidden_dim1: RNN hidden size (and input size of the first linear layer).
    hidden_dim2: output size of the first linear layer.
    hidden_dim3: output size of the second linear layer.
    num_layers: number of stacked RNN layers.

  ``forward`` takes a float tensor of shape (batch, seq_len, input_dim)
  (the RNN is built with ``batch_first=True``) and returns a (batch, 1)
  tensor of sigmoid outputs in (0, 1), the form ``nn.BCELoss`` expects.
  """
  def __init__(self,input_dim=25,hidden_dim1=64,hidden_dim2=128,hidden_dim3=64,num_layers=1):
    super().__init__()
    self.input_dim=input_dim
    self.hidden_dim1=hidden_dim1
    self.hidden_dim2=hidden_dim2
    self.hidden_dim3=hidden_dim3
    # Honour the constructor argument; it used to be overwritten with 1.
    self.num_layers=num_layers
    self.output_dim=1

    self.rnn = nn.RNN(self.input_dim, self.hidden_dim1, self.num_layers, batch_first=True)

    self.linear_layer1=nn.Linear(self.hidden_dim1,self.hidden_dim2)
    self.linear_layer2=nn.Linear(self.hidden_dim2,self.hidden_dim3)
    self.final_layer=nn.Linear(self.hidden_dim3,self.output_dim)

    self.sigmoid=nn.Sigmoid()

  def forward(self,x):
    """Return a (batch, 1) tensor of sigmoid outputs for a batch of embedded sentences."""
    # No explicit h0: nn.RNN then starts from an all-zero hidden state, so the
    # same input always yields the same output. A fresh random h0 per call
    # made predictions non-deterministic.
    _, hn = self.rnn(x)
    # hn is (num_layers, batch, hidden_dim1). Use the top layer's final state;
    # for a single layer this equals the previous mean over the layer axis.
    x=hn[-1]

    x=self.sigmoid(self.linear_layer1(x))
    x=self.sigmoid(self.linear_layer2(x))
    x=self.sigmoid(self.final_layer(x))

    return x




In [None]:
# Seed torch's global RNG before building the model so the random weight
# initialisation is the same on every Restart & Run All.
torch.manual_seed(42)
model2=MyModel2()

In [None]:
# Adam over all of model2's parameters, learning rate 0.01.
optimizer = torch.optim.Adam(params=model2.parameters(), lr=0.01)

In [None]:
# Binary cross-entropy on probabilities, averaged over the batch.
criterion = nn.BCELoss(reduction='mean')

In [None]:
for epoch in range(10):
  # Running count of correctly classified training samples in this epoch.
  overall_accuracy = 0
  for x, y in train_dataloader:
    batch_size = x.shape[0]
    # Reshape targets to (batch, 1) so they line up with the model output.
    y = y.view(batch_size, 1)

    predicted_y = model2(x)
    loss = criterion(predicted_y, y)

    # Standard update: clear old gradients, backpropagate, step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Threshold the outputs at 0.5 to get hard class predictions.
    y_pred = predicted_y.detach().numpy() > 0.5
    y_true = y.detach().numpy()

    accuracy = accuracy_score(y_true, y_pred)
    # accuracy is a per-batch fraction; weight it by the batch size so the
    # epoch figure is right even when the last batch is smaller.
    overall_accuracy += accuracy * batch_size

  print(f'Epoch: {epoch} --> Accuracy {overall_accuracy/len(train_my_dataset)}')



torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 

In [None]:
# Evaluation only. The held-out set must never drive a weight update, so there
# is no loss.backward() or optimizer.step() here, and a single pass suffices.
model2.eval()
overall_accuracy=0
with torch.no_grad():
  for x,y in test_dataloader:
    batch_size=x.shape[0]

    predicted_y=model2(x)
    # Reshape targets to (batch, 1) so they line up with the model output.
    y=y.view(batch_size,1)

    y_true=y.numpy()
    # Threshold the outputs at 0.5 to get hard class predictions.
    y_pred=predicted_y.numpy() >0.5

    accuracy= accuracy_score(y_true,y_pred)
    # Weight the per-batch fraction by batch size (the last batch is smaller).
    overall_accuracy +=accuracy*batch_size

print(f'Test --> Accuracy {overall_accuracy/len(test_my_dataset)}')

torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([8, 1, 64])
Epoch: 0 --> Accuracy 0.8895348837209303
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
