In [8]:
import pandas as pd
from transformers import BertTokenizerFast, TFBertForSequenceClassification
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
import os

In [11]:
def load_data(smells_path="../dataset/smells.xlsx", projects_root="../../projects"):
    """Build the training table: Java source text plus its six smell labels.

    The spreadsheet at ``smells_path`` is read, its ``Project``, ``Version``
    and ``Smell`` columns are dropped, and every ``ComponentName`` (a dotted
    class name) is turned into a relative ``.java`` path. For every project
    directory under ``projects_root`` and every component, the candidate
    source roots are tried in order and the first existing file is read.

    Args:
        smells_path: Path of the Excel file holding the component names and
            the ``CDSBP``, ``CC``, ``LC``, ``LZC``, ``RB`` and ``SC`` columns.
        projects_root: Directory whose sub-directories are the projects.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Component`` (file content)
        followed by the six label columns, one row per (project, component)
        pair whose source file was found. The frame is empty, but still has
        these columns, when nothing matches.
    """
    label_columns = ['CDSBP', 'CC', 'LC', 'LZC', 'RB', 'SC']
    # Tried in this order; the first source root containing the file wins.
    possible_subfolders = ["src/java", "src/main", "src/main/java", "src"]

    component = pd.read_excel(smells_path)
    # NOTE(review): 'Project' and 'Version' are discarded before matching, so a
    # component is looked up in *every* project folder and may be paired with a
    # same-named class of another project/version -- confirm this is intended.
    component = component.drop(['Project', 'Version', 'Smell'], axis=1)

    # Keep only directories; sorted so the row order (and therefore any seeded
    # split made downstream) does not depend on the file-system listing order.
    projects = sorted(
        item for item in os.listdir(projects_root)
        if os.path.isdir(os.path.join(projects_root, item))
    )
    print(projects)

    # Loop-invariant work hoisted out of the per-project loop.
    relative_paths = [
        name.strip().replace(".", "/") + ".java"
        for name in component['ComponentName']
    ]
    label_rows = component[label_columns].to_dict('records')

    # Rows are collected in a list and turned into a frame once; concatenating
    # a DataFrame on every hit is quadratic in the number of rows.
    records = []
    for project in projects:
        print(project)
        for relative_path, labels in zip(relative_paths, label_rows):
            for subfolder in possible_subfolders:
                full_path = os.path.join(projects_root, project, subfolder, relative_path)
                if os.path.isfile(full_path):
                    # Explicit encoding: source files are not guaranteed to be
                    # valid in the platform default codec, and one bad byte
                    # must not abort the whole load.
                    with open(full_path, "r", encoding="utf-8", errors="replace") as f:
                        source_text = f.read()
                    records.append({'Component': source_text, **labels})
                    break  # stop at the first source root that has the file

    return pd.DataFrame(records, columns=['Component'] + label_columns)

In [None]:
# Build the dataset: one row per Java source file found on disk, holding the
# file content and its six smell labels (see load_data).
dataframe = load_data()

In [14]:
# Show the assembled dataset as the cell output.
dataframe

Unnamed: 0,Component,CDSBP,CC,LC,LZC,RB,SC
0,/*\n * Licensed to the Apache Software Founda...,1,0,0,0,0,0
1,/*\n * Licensed to the Apache Software Founda...,0,0,0,0,0,1
2,/*\n * Licensed to the Apache Software Founda...,0,0,0,0,0,1
3,/*\n * Licensed to the Apache Software Founda...,0,0,0,0,0,1
4,/*\n * Licensed to the Apache Software Founda...,0,0,0,0,0,1
...,...,...,...,...,...,...,...
2512,"/*\n * The Apache Software License, Version 1....",0,0,0,0,0,1
2513,"/*\n * The Apache Software License, Version 1....",0,0,0,0,0,1
2514,"/*\n * The Apache Software License, Version 1....",0,0,0,0,0,1
2515,"/*\n * The Apache Software License, Version 1....",0,0,0,0,0,1


In [15]:
# Tokenizer and classifier both come from the same pretrained checkpoint;
# the classifier gets a six-output head (one output per label column).
tokenizer = BertTokenizerFast.from_pretrained(
    'bert-base-uncased',
)
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=6,
)


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Separate the labels (y) from the remaining columns (dataframe2).
label_columns = ['CDSBP', 'CC', 'LC', 'LZC', 'RB', 'SC']
y = dataframe[label_columns]
dataframe2 = dataframe.drop(columns=label_columns)

In [17]:
# Split into train / validation / test: 40% is held out first, then that
# hold-out is cut in half, giving a 60/20/20 partition.
X_train, X_val_test, Y_train, Y_val_test = train_test_split(
    dataframe2,
    y,
    test_size=0.4,
    random_state=1,
)
X_val, X_test, Y_val, Y_test = train_test_split(
    X_val_test,
    Y_val_test,
    test_size=0.5,
    random_state=1,
)

In [18]:
# Plain Python lists of strings (train, validation, test) for the tokenizer.
l1, l2, l3 = (
    list(map(str, split['Component'].tolist()))
    for split in (X_train, X_val, X_test)
)

In [19]:
# Same tokenizer settings for every split: pad, truncate to at most 512
# tokens, and return TensorFlow tensors.
tokenizer_options = dict(
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='tf',
)
X_train_tokenized = tokenizer(l1, **tokenizer_options)
X_val_tokenized = tokenizer(l2, **tokenizer_options)
X_test_tokenized = tokenizer(l3, **tokenizer_options)
print("FATTA TOKENIZZAZIONE")

FATTA TOKENIZZAZIONE


In [20]:
# Pull the token-id sequences and the attention masks out of each encoding.
X_train_input_ids, X_train_attention_mask = (
    X_train_tokenized['input_ids'],
    X_train_tokenized['attention_mask'],
)
X_val_input_ids, X_val_attention_mask = (
    X_val_tokenized['input_ids'],
    X_val_tokenized['attention_mask'],
)
X_test_input_ids, X_test_attention_mask = (
    X_test_tokenized['input_ids'],
    X_test_tokenized['attention_mask'],
)

In [21]:
# Multi-label setup: six independent binary targets per sample.
# - The classification head returns raw logits, so the loss must be told
#   (from_logits=True); the plain 'binary_crossentropy' string assumes
#   probabilities and would optimise the wrong quantity.
# - Accuracy is thresholded at logit 0, i.e. probability 0.5; it keeps the
#   name 'accuracy' so the history keys stay the same.
# - Adam's default learning rate (1e-3) is far too large for fine-tuning a
#   pretrained BERT; 2e-5 is in the range recommended for fine-tuning.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)

In [22]:
# Pass the attention masks together with the token ids: without the mask the
# model attends to the padding tokens added by the tokenizer. Labels are
# handed over as float32 arrays rather than integer DataFrames.
# NOTE(review): with epochs=1 the EarlyStopping(patience=3) callback can never
# trigger; raise `epochs` if early stopping is meant to be active.
history = model.fit(
    x={
        'input_ids': X_train_input_ids,
        'attention_mask': X_train_attention_mask,
    },
    y=Y_train.to_numpy(dtype='float32'),
    batch_size=32,
    epochs=1,
    validation_data=(
        {
            'input_ids': X_val_input_ids,
            'attention_mask': X_val_attention_mask,
        },
        Y_val.to_numpy(dtype='float32'),
    ),
    verbose=1,
    callbacks=[EarlyStopping(patience=3)],
)

: 

: 

In [None]:
# The model is not a Sequential model, so it has no `.add()` method, and the
# six-output classification head created with num_labels=6 already is the
# output layer. To obtain per-label probabilities, apply a sigmoid to the
# logits instead (sigmoid, not softmax: the six labels are independent and a
# sample may carry several of them).
test_logits = model.predict(
    {
        'input_ids': X_test_input_ids,
        'attention_mask': X_test_attention_mask,
    },
    batch_size=32,
)['logits']
test_probabilities = tf.sigmoid(test_logits)