In [1]:
import torch
from torch import  nn
import numpy as np
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset,DataLoader
import pandas as pd
from tensorflow.keras.layers import Embedding,Dense, Activation
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot,Tokenizer
from collections import Counter
from keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/MyDrive/balanced dataset.csv')
inputs = df['url']
labels = df['type']
labels

0         0
1         0
2         1
3         1
4         1
         ..
426051    1
426052    1
426053    0
426054    0
426055    0
Name: type, Length: 426056, dtype: int64

In [8]:
tokenizer = Tokenizer(num_words=1000000,oov_token='<OOV>')
tokenizer.fit_on_texts(inputs)
word_index = tokenizer.word_index
word_index['facebook']

76

In [9]:
sequences = tokenizer.texts_to_sequences(inputs)
len(word_index)


388968

In [10]:
lengths = [len(sublist) for sublist in sequences]

max_length = max(lengths)
min_length=min(lengths)
print(max_length)
print(min_length)

244
1


In [11]:
features = pad_sequences(sequences,maxlen=244)
features = features.astype(float)

In [12]:
split_frac = 0.8
split_idx = int(len(features)*split_frac)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = labels[:split_idx], labels[split_idx:]

test_idx = int(len(remaining_x)*0.1)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

train_x=train_x[:-44]
train_y=train_y[:-44]
val_x=val_x[:-21]
val_y=val_y[:-21]
test_x=test_x[:-91]
test_y=test_y[:-91]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(340800, 244) 
Validation set: 	(8500, 244) 
Test set: 		(76600, 244)


In [13]:
train_data = TensorDataset(torch.from_numpy(np.asarray(train_x)), torch.from_numpy(np.asarray(train_y)))
valid_data = TensorDataset(torch.from_numpy(np.asarray(val_x)), torch.from_numpy(np.asarray(val_y)))
test_data = TensorDataset(torch.from_numpy(np.asarray(test_x)), torch.from_numpy(np.asarray(test_y)))


train_loader = DataLoader(train_data, shuffle=True, batch_size=100)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=100)
test_loader = DataLoader(test_data, shuffle=True, batch_size=100)

In [14]:
Features,Labels=next(iter(train_loader))

In [15]:
train_on_gpu=torch.cuda.is_available()

In [16]:
class malicious(nn.Module):
  def __init__(self,vocab_size,output_size,embedding_dim,hidden_dim,n_layers,drop_prob=0.5):
    super(malicious,self).__init__()
    self.output_size=output_size
    self.n_layers=n_layers
    self.hidden_dim=hidden_dim
    self.embedding=nn.Embedding(vocab_size,embedding_dim)
    self.lstm=nn.LSTM(embedding_dim,hidden_dim,n_layers,dropout=drop_prob,batch_first=True)
    self.dropout = nn.Dropout(0.5)
    self.fc=nn.Linear(hidden_dim,output_size)

  def forward(self,x,hidden):
    batch_size=x.size(0)
    x=x.long()
    embeds=self.embedding(x)
    lstm_out,hidden=self.lstm(embeds,hidden)
    lstm_out=lstm_out[:, -1, :] # getting the last time step output
    output=self.dropout(lstm_out)
    output=F.sigmoid(self.fc(output))
    return output,hidden

  def init_hidden(self, batch_size):
    weight = next(self.parameters()).data
    hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
    return hidden

In [17]:
vocab_size=len(word_index)+1 # +1 for the 0 padding + our word tokens
output_size=1
embedding_dim=512
hidden_dim=256
n_layers=2

model=malicious(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)


print(model)

malicious(
  (embedding): Embedding(388969, 512)
  (lstm): LSTM(512, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)


In [18]:
criterion=nn.BCELoss()
optimizer= Adam(model.parameters(),lr=0.001)
batch_size= 100

In [20]:
resume = torch.load('/content/drive/MyDrive/controlled3.pth')

In [21]:
model.load_state_dict(resume['state_dict'])

<All keys matched successfully>

In [58]:
import string
def add_spaces_around_punctuation(input_list):
    translation_table = str.maketrans({key: f' {key} ' for key in string.punctuation})
    result_list = [s.translate(translation_table) for s in input_list]
    result_list = [s.split() for s in result_list]

    return result_list

def pad_features(reviews_ints, seq_length):
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)
    for i, row in enumerate(reviews_ints):
        features[i, -len(row):] = np.array(row)[:seq_length]
    return features

def predict(feature_tensor):
  with torch.no_grad():
    model.cuda()
    h = model.init_hidden(feature_tensor.size(0))
    model.eval()
    h = tuple([each.data for each in h])
    if(train_on_gpu):
      feature_tensor= feature_tensor.cuda()
    output,h = model(feature_tensor,h)
    pred = torch.round(output.squeeze())
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))
    if(pred.item()==0):
      print("Not Malicious")
    else:
      print("Malicious")

url=[input("Enter Url: ")]
output_list = add_spaces_around_punctuation(url)

reviews_ints = []
for review in output_list:
    reviews_ints.append([word_index.get(word, 0) for word in review])

seq_length = 326
features = pad_features(reviews_ints, seq_length=seq_length)
assert len(features) == len(reviews_ints), "Your features should have as many rows as reviews."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."
feature_tensor=torch.from_numpy(features)
predict(feature_tensor)

Enter Url: linkedin.com/in/abiral-bhattarai-a76247256/
Prediction value, pre-rounding: 0.412903
Not Malicious
