In [7]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
#from nltk.corpus import stopwords
import nltk
import re
from tabulate import tabulate
from tqdm import trange
import os

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup

In [1]:
import platform

os = platform.system()

match os.lower():
    case "darwin":
        path = "/Users/antoniobaio/Desktop/Progetti/ProgettiDS/config.json"
    case "linux":
        path = "/home/antonet/vscode/ProgettiDS/config.json"
    case "windows":
        path = "AGGIUNGI PATH"
        
print("Actual path: " + path)

Actual path: /Users/antoniobaio/Desktop/Progetti/ProgettiDS/config.json


In [73]:
#verificare la disponibilità di cuda
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

In [74]:
df = pd.read_csv(path)
df.head(15)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [75]:
# definisco una funzione lambda che assegna il valore 0 se sentiment è negativo, 1 altrimenti
df['class_index'] = df['sentiment'].apply(lambda x: 0 if x == 'negative' else 1)
df.head(15)

Unnamed: 0,review,sentiment,class_index
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
5,"Probably my all-time favorite movie, a story o...",positive,1
6,I sure would like to see a resurrection of a u...,positive,1
7,"This show was an amazing, fresh & innovative i...",negative,0
8,Encouraged by the positive comments about this...,negative,0
9,If you like original gut wrenching laughter yo...,positive,1


In [76]:
#df = df.dropna()
df['class_index'] = df['class_index'].astype(int)

In [80]:
def shuffle(df, n=3, axis=0):     
    df = df.copy()
    random_states = [2,42,4]
    for i in range(n):
        df = df.sample(frac=1,random_state=random_states[i]) # mischio il dataframe 
    return df

df = shuffle(df)
df

Unnamed: 0,review,sentiment,class_index
36457,Come on Tina Fey you can do better then this. ...,negative,0
48521,This is a very beautiful and almost meditative...,positive,1
5586,What an embarassment...This doesnt do justice ...,negative,0
40152,"To begin with, I really love Lucy. Her TV show...",negative,0
9437,I haven't seen this film in years so my knowle...,positive,1
...,...,...,...
16180,I can understand how fans of filmmaker Roman P...,positive,1
46643,Rita Hayworth lights up the screen in this fun...,positive,1
43178,The operative rule in the making of this film ...,negative,0
17408,I expected this to be a lot better. I love Tim...,negative,0


In [82]:
from nltk.corpus import stopwords
nltk.download('stopwords')
sw = stopwords.words('english')

def clean_text(text):
    
    text = text.lower()
    
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) #Comando che permette di cambiare tutti i caratteri tranne :a-z, A-Z, ".", "?", "!", "," con uno spazio 

    text = re.sub(r"http\S+", "",text) #Rimozione dei link 
    
    html=re.compile(r'<.*?>') 
    
    text = html.sub(r'',text) #Rimozione dei tag HTML
    
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'') #Comando che permette di rimuovere i segni definiti sopra come punteggiatura
        
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    
    text = " ".join(text) #Rimozione di tutte le stopword
    
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F" 
                           u"\U0001F300-\U0001F5FF"  
                           u"\U0001F680-\U0001F6FF"  
                           u"\U0001F1E0-\U0001F1FF"  
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) #Comando che rimuove quelle definite sopra come emojis
    
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/antoniobaio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [83]:
df['review'] = df['review'].apply(lambda x: clean_text(x))

In [85]:
df.head()

Unnamed: 0,review,sentiment,class_index
36457,come tina fey better soon movie started knew w...,negative,0
48521,beautiful almost meditative film hardly dialog...,positive,1
5586,embarassmentthis doesnt justice original awful...,negative,0
40152,"begin with, really love lucy tv show still mak...",negative,0
9437,seen film years knowledge little rusty remembe...,positive,1


In [86]:
# definisco la percentuale di campioni da includere nel train_set
train_size = 0.75

# calcolo il numero di campioni da includere nel train_set e nel test_set
train_len = int(len(df) * train_size)
test_len = len(df) - train_len

# estraggo in modo casuale i campioni da includere nel train_set
train_df = df.sample(n=train_len, random_state=42)

# escludo dal test_set i campioni presenti nel train_set
test_df = df.drop(train_df.index)

print('train lenght:',len(train_df))
print(train_df.groupby(['class_index'])['review'].count())
print('test lenght:',len(test_df))
print(test_df.groupby(['class_index'])['review'].count())

print("Train set:")
print(train_df.head())
print("Test set:")
print(test_df.head())


train lenght: 37500
class_index
0    18872
1    18628
Name: review, dtype: int64
test lenght: 12500
class_index
0    6128
1    6372
Name: review, dtype: int64
Train set:
                                                  review sentiment  \
19398  disgusting joke supposed moviefrom poster look...  negative   
21772  extremely bad one long train wreck last episod...  negative   
45259  tamara anderson family moving again, itinerant...  negative   
25972  number things correct, although important sinc...  positive   
28799  seems sensei seagal getting moralising less ac...  negative   

       class_index  
19398            0  
21772            0  
45259            0  
25972            1  
28799            0  
Test set:
                                                  review sentiment  \
5586   embarassmentthis doesnt justice original awful...  negative   
1433   end review cache, wrote intrigued haneke film ...  positive   
10119  name cult movie often given films continue scr...  negat

In [87]:
sentences = train_df.review
labels = list(train_df.class_index)

In [90]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [91]:
index=0
print(' Original: ', list(sentences)[index])
table = np.array([tokenizer.tokenize(list(sentences)[index]), 
                    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(list(sentences)[index]))]).T
print(tabulate(table,headers = ['Tokens', 'Token IDs'],tablefmt = 'fancy_grid'))

 Original:  disgusting joke supposed moviefrom poster looked like cute movie disappointment heck male lead looks like old retarded retired reject cop cop tell man act go back copno screen presence show bare ass so, mel gibson,, hell put filmmaker business guy business making movie seriously doubt women gay men find attractive whoever cast film talent hack cast talent hacks lead great us white guys alway getting asian women ugly white guy dean cain brad pitt white boyfriend asian women like ugly white guys black guys see get must low self esteem br br hot girl act movie kate hollidaywhy one hot white chick among rest ugly asian chicks think hot act br br two actors movie host poetry end movie one hot white chick massage house tl young kate holliday leads movie br br asia character ridiculous looked like trying hard kind ghetto sexy black girlkey word trying br br gina act hot enough physically kind role need play character roles humble self presentation br br think actress gina hirazumi

In [92]:
max_len = 0
l=[]
for sent in list(sentences):

    #Tokenizza il testo e aggiunge i tokens `[CLS]` e `[SEP]`
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    #Aggiorna la lunghezza massima delle frasi presenti nel dataset
    l.append(len(input_ids))
    max_len = max(max_len, len(input_ids))

if max_len>512:
  max_len=512
avg_len=int(sum(l)/len(l))

print('Max sentence length: ', max_len)
print('Avg sentence length: ', avg_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (537 > 512). Running this sequence through the model will result in indexing errors


Max sentence length:  512
Avg sentence length:  152


In [96]:
MAX_LEN = 128
#Tokenizza tutte le frasi e mappa i tokens con i loro IDs
input_ids = []
attention_masks = []

for sent in sentences:
    # Quello che `encode_plus` farà:
    #   1. Tokenizza la frase
    #   2. Aggiunge il token `[CLS]` all'inizio della frase
    #   3. Aggiunge il token `[SEP]` alla fine della frase
    #   4. Mappa il token con il loro ID
    #   5. Esegue il padding o tronca la frase affinche la sua lunghezza sia pari a `max_length`
    #   6. Crea le attention masks per il token [PAD]
    encoded_dict = tokenizer.encode_plus(
                        sent,                      
                        add_special_tokens = True,      #Aggiunge i tokens '[CLS]' e '[SEP]'
                        max_length = MAX_LEN,           #Setta la lunghezza massima
                        pad_to_max_length = True,       #Se necessaio esegue il padding
                        return_attention_mask = True,   #Costruisce le attn. masks
                        return_tensors = 'pt',          #Restituisce un tensore di pytorch
                   )
    
    #Aggiunge la frase codificata alla lista degli input   
    input_ids.append(encoded_dict['input_ids'])
    
    #E aggiunge le attention mask alla lista (semplice distinzione tra padding o meno)
    attention_masks.append(encoded_dict['attention_mask'])

#Converte la lista in un tensore
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

ImportError: Unable to convert output to PyTorch tensors format, PyTorch is not installed.