In [2]:
import numpy as np
import pandas as pd
import string
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
df_surface = pd.read_csv("surface_web_Market.csv")

df_dark = pd.read_csv("DarkWeb_Covid_Market.csv")

df_surface['target'] = [0]*len(df_surface)
df_dark['target'] = [1]*len(df_dark)

list_df = [df_surface, df_dark]
df = pd.concat(list_df)


In [4]:
df.head()

Unnamed: 0,Text,Main_Class,target
0,"Lucira Check It Single-Use COVID-19 Test, The ...",Market,0
1,Lucira Check It Single-Use COVID-19 Test. 3.9 ...,Market,0
2,Lucira Check. 556 ratings Currently unavailable.,Market,0
3,Lucira Check It Single-Use — Ages 18+.,Market,0
4,CLINITEST Rapid Covid-19 Antigen Self-Test: Co...,Market,0


In [5]:
#df["Main_Class"].unique()
df["target"].unique()

array([0, 1])

In [6]:
#df.groupby("Main_Class").count()
df.groupby("target").count()

Unnamed: 0_level_0,Text,Main_Class
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,630,630
1,630,630


In [7]:
df.index=range(len(df))

**WORD-TOKENIZATION**

In [8]:
nltk.download('punkt')

df['tokenized_sents'] = df.apply(lambda row: nltk.word_tokenize(row["Text"]), axis=1)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**POS-TAGGIN(PART-OF-SPEECH TAGGING)**

In [9]:

nltk.download('averaged_perceptron_tagger')

df['tokenized_sents'] = df['tokenized_sents'].apply(lambda x: pos_tag(x))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [10]:
df['tokenized_sents']

0       [(Lucira, NNP), (Check, NNP), (It, PRP), (Sing...
1       [(Lucira, NNP), (Check, NNP), (It, PRP), (Sing...
2       [(Lucira, NNP), (Check, NNP), (., .), (556, CD...
3       [(Lucira, NNP), (Check, NNP), (It, PRP), (Sing...
4       [(CLINITEST, NNP), (Rapid, NNP), (Covid-19, NN...
                              ...                        
1255    [(-covid19-, NN), (CERTIFICATES, NNS), (ARE, V...
1256    [(CERTIFICATES, NNS), (ARE, VBP), (ISSUED, NNP...
1257    [(Covid-19, JJ), (Vaccine, NNP), (Cards, NNP),...
1258    [(COVID-19, NNP), (,, ,), (Sputnik, NNP), (V, ...
1259    [(COVID, NNP), (VACCINE, NNP), (CARDS, NNP), (...
Name: tokenized_sents, Length: 1260, dtype: object

**TO-LOWER-CASE**

In [11]:
def listOfLists(L):
  newL=[]
  for t in L:
    newL.append(list(t))
  return newL

def toLowerCase(L):
  for l in L:
    l[0]=l[0].lower()
  return L

df['tokenized_sents']=df['tokenized_sents'].apply(lambda x: listOfLists(x))
df['tokenized_sents']=df['tokenized_sents'].apply(lambda x: toLowerCase(x))

In [12]:
df["tokenized_sents"]

0       [[lucira, NNP], [check, NNP], [it, PRP], [sing...
1       [[lucira, NNP], [check, NNP], [it, PRP], [sing...
2       [[lucira, NNP], [check, NNP], [., .], [556, CD...
3       [[lucira, NNP], [check, NNP], [it, PRP], [sing...
4       [[clinitest, NNP], [rapid, NNP], [covid-19, NN...
                              ...                        
1255    [[-covid19-, NN], [certificates, NNS], [are, V...
1256    [[certificates, NNS], [are, VBP], [issued, NNP...
1257    [[covid-19, JJ], [vaccine, NNP], [cards, NNP],...
1258    [[covid-19, NNP], [,, ,], [sputnik, NNP], [v, ...
1259    [[covid, NNP], [vaccine, NNP], [cards, NNP], [...
Name: tokenized_sents, Length: 1260, dtype: object

**STOPWORDS-REMOVAL**

In [13]:
nltk.download('stopwords')

import nltk
nltk.download('omw-1.4')

stop_list = stopwords.words('english')+list(string.punctuation)+[" "]+[""] #noise removal:insieme alle stopwords viene eliminata la punteggiatura

print(stop_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [14]:

def removeSublist(a,b):
  for el in b:
    a.remove(el)

#df['tokenized_sents'].apply(lambda x: removeSublist(x,[couple for couple in x if not(set(couple[0]).isdisjoint(stop_list))]))

**NOISE-REMOVAL**

In [15]:
def removeSublist(a,b):
  for el in b:
    a.remove(el)

noises_list=list(string.punctuation)+[" "]+[""]

df['tokenized_sents'].apply(lambda x: removeSublist(x,[couple for couple in x if not(set(couple[0]).isdisjoint(noises_list))]))

0       None
1       None
2       None
3       None
4       None
        ... 
1255    None
1256    None
1257    None
1258    None
1259    None
Name: tokenized_sents, Length: 1260, dtype: object

In [16]:
df["tokenized_sents"]

0       [[lucira, NNP], [check, NNP], [it, PRP], [test...
1       [[lucira, NNP], [check, NNP], [it, PRP], [test...
2       [[lucira, NNP], [check, NNP], [556, CD], [rati...
3       [[lucira, NNP], [check, NNP], [it, PRP], [—, N...
4       [[clinitest, NNP], [rapid, NNP], [antigen, NNP...
                              ...                        
1255    [[certificates, NNS], [are, VBP], [issued, NNP...
1256    [[certificates, NNS], [are, VBP], [issued, NNP...
1257    [[vaccine, NNP], [cards, NNP], [european, NNP]...
1258    [[sputnik, NNP], [v, NNP], [vaccine, NN], [150...
1259    [[covid, NNP], [vaccine, NNP], [cards, NNP], [...
Name: tokenized_sents, Length: 1260, dtype: object

**LEMMATIZATION**

In [17]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import word_tokenize

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return 0

In [18]:
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import word_tokenize

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return 0

lemmatizer = WordNetLemmatizer()

def lemmatizeToken(L):
  for i in range(len(L)):
    if(get_wordnet_pos(L[i][1])!=0):
      L[i][0]=lemmatizer.lemmatize(L[i][0],pos=get_wordnet_pos(L[i][1]))
  return L

df["tokenized_sents"]=df['tokenized_sents'].apply(lambda x: lemmatizeToken(x))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
lemmatizer.lemmatize("going",pos=get_wordnet_pos("VBZ"))

'go'

In [20]:
df["tokenized_sents"]

0       [[lucira, NNP], [check, NNP], [it, PRP], [test...
1       [[lucira, NNP], [check, NNP], [it, PRP], [test...
2       [[lucira, NNP], [check, NNP], [556, CD], [rati...
3       [[lucira, NNP], [check, NNP], [it, PRP], [—, N...
4       [[clinitest, NNP], [rapid, NNP], [antigen, NNP...
                              ...                        
1255    [[certificate, NNS], [be, VBP], [issued, NNP],...
1256    [[certificate, NNS], [be, VBP], [issued, NNP],...
1257    [[vaccine, NNP], [card, NNP], [european, NNP],...
1258    [[sputnik, NNP], [v, NNP], [vaccine, NN], [150...
1259    [[covid, NNP], [vaccine, NNP], [card, NNP], [s...
Name: tokenized_sents, Length: 1260, dtype: object

CONCATENAZIONE DELLE COPPIE TOKEN-TAG OPPURE TOKEN-ENTITY

In [21]:

df["tokenized_sents"]=df["tokenized_sents"].apply(lambda x: ' '.join([el[0]+el[1] for el in x]))

In [22]:
df.index=range(len(df))

In [23]:
df.loc[970,"tokenized_sents"]

'compliesNNS withIN dataNNS protectionNN requirementNNS andCC workVBZ withIN theDT fhirNNP standardNN'

In [24]:
import numpy as np


from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
#
# Create sample set of documents
#
docs = np.array(df["tokenized_sents"])

#
# Fit the bag-of-words model
#
bag = vectorizer.fit_transform(docs)
#
# Get unique words / tokens found in all the documents. The unique words / tokens represents
# the features
#
#print(vectorizer.get_feature_names())
#
# Associate the indices with each unique word
#
#print(vectorizer.vocabulary_)
#
# Print the numerical feature vector
#
#print(bag.toarray())

In [25]:
X=bag.toarray()
print(X.shape)

(1260, 2531)


In [26]:
Y=np.empty((len(df),1))
Y

array([[6.93007348e-310],
       [6.93007348e-310],
       [2.63107980e-316],
       ...,
       [6.92944663e-310],
       [6.92944661e-310],
       [6.92944661e-310]])

In [27]:

authors=df["target"].unique()

targets=np.array(df["target"])


for i in range (len(df)):
  Y[i]=np.where(authors == targets[i])[0][0]

Y=Y.astype("int")
Y

array([[0],
       [0],
       [0],
       ...,
       [1],
       [1],
       [1]])

In [28]:
X.shape

(1260, 2531)

In [29]:
Y.shape

(1260, 1)

In [30]:
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test = train_test_split(X, Y, test_size=0.4)

In [31]:
print(X_train.shape,X_test.shape)
print(Y_train.shape,Y_test.shape)

(756, 2531) (504, 2531)
(756, 1) (504, 1)


In [32]:
print("Xtrain",type(X_train))
print("Ytrain",type(Y_train))
print("Xtest",type(X_test))
print("Ytest",type(Y_test))
print("Xtrain",X_train.dtype)
print("Ytrain",Y_train.dtype)
print("Xtest",X_test.dtype)
print("Ytest",Y_test.dtype)


Xtrain <class 'numpy.ndarray'>
Ytrain <class 'numpy.ndarray'>
Xtest <class 'numpy.ndarray'>
Ytest <class 'numpy.ndarray'>
Xtrain int64
Ytrain int64
Xtest int64
Ytest int64


**Trasformazione degli array numpy in tensori pytorch**

In [33]:
import torch

X_train = torch.from_numpy(X_train).float()
Y_train = torch.from_numpy(Y_train).float()

X_test = torch.from_numpy(X_test).float()
Y_test = torch.from_numpy(Y_test).float()

In [34]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

torch.Size([756, 2531])
torch.Size([756, 1])
torch.Size([504, 2531])
torch.Size([504, 1])


In [35]:
len(X_train[0])

input_layer_neurons=len(X_train[0])
output_layer_neurons=3

print(input_layer_neurons)
print(output_layer_neurons)

2531
3


In [36]:
import torch.nn as nn
import torch.nn.functional as fn


class Net(nn.Module):
  def __init__(self): 
    super().__init__()  

    self.fc1=nn.Linear(input_layer_neurons,124)  
    self.fc2=nn.Linear(124,64)     #Hidden Layer 1
    #self.fc3=nn.Linear(64,64)     #Hidden Layer 2
    self.fc4=nn.Linear(64,output_layer_neurons)     #Output Layer
  
  def forward(self,x):
    x=fn.relu(self.fc1(x))
    x=fn.relu(self.fc2(x))
    #x=fn.relu(self.fc3(x))
    x=self.fc4(x)

    #return x
    return fn.log_softmax(x,dim=1)  

class Leo(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_dim, 8000)
        self.fc2 = torch.nn.Linear(8000, 4000)
        self.fc3 = torch.nn.Linear(4000, 2000)
        self.synth_sem_linear = nn.Linear(2000, output_dim)


    def forward(self, x):        
        x = F.dropout(F.relu(self.fc1(x)), p=0.1)
        x = F.dropout(F.relu(self.fc2(x)), p=0.1)
        x = F.dropout(F.relu(self.fc3(x)), p=0.1)
        x_tot = self.synth_sem_linear(x)
        return x_tot   

In [37]:
net=Net() #istanziazione della rete neurale
net.cuda()

Net(
  (fc1): Linear(in_features=2531, out_features=124, bias=True)
  (fc2): Linear(in_features=124, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=3, bias=True)
)

In [38]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 10

train_data = TensorDataset(X_train,Y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(X_test,Y_test)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# FASE DI TRAINING

In [39]:
import torch

if torch.cuda.is_available():
  device = torch.device("cuda")
  print("gpu available")
else:
  device = torch.device("cpu")

gpu available


In [40]:
torch.cuda.device(0)

<torch.cuda.device at 0x7f8e59827c18>

In [41]:
import torch.optim as optim

loss_function=nn.CrossEntropyLoss()

optimizer=optim.Adam(net.parameters(),lr=0.001)

for epoch in range(10):
  for data in train_data:
    X, y = data 
    net.zero_grad()
    output=net(X.view(-1,input_layer_neurons).cuda())  
    loss=fn.nll_loss(output,y.long().cuda())  
    loss.backward() 
    optimizer.step() 
  print(loss)

tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)


In [42]:
correct=0 
total=0 

with torch.no_grad(): 
  for data in test_data:
    X, y = data
    output=net(X.view(-1,input_layer_neurons).cuda())
    #print(y)
    for idx, i in enumerate(output):
      #print(torch.argmax(i),y[idx])
      if(torch.argmax(i)==y[idx]):
        correct+=1
      total+=1

print("ACCURACY: ",round(correct/total,3)*100,"%")

ACCURACY:  98.6 %


10- 72.5%
3- 77.5