In [1]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neural_network import MLPClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
import os

def load_data():
    component = pd.read_csv("../dataset/complete_dataset.csv")
    #component = component.drop(['Project','Version','Smell'],axis=1)
    df = pd.DataFrame()

    path = "../../projects"
    projects = []

    # Itera su tutti i file e le cartelle nella cartella specificata
    for item in os.listdir(path):
        # Se l'elemento nella cartella è una cartella, aggiungi il nome alla lista
        if os.path.isdir(os.path.join(path, item)):
            projects.append(item)
    # Stampa la lista di nomi dei progetti
    print(projects)

    possible_subfolders = ["src/java", "src/main", "src/main/java", "src"]
    #classes = []

    for k in range(len(projects)):
        print(projects[k])
        for i in range(len(component)):
            line = component.loc[i,'ComponentName'].strip()
            line = line.replace(".","/") + ".java"
            for subfolder in possible_subfolders:
                full_path = os.path.join(path, projects[k], subfolder, line)
                if os.path.exists(full_path):
                    with open(full_path, "r") as f:
                        contenuto = f.read()
                    # classes.append(contenuto)
                    temp_df = pd.DataFrame({
                        'Project_name': projects[k], 
                        'Component': [contenuto],
                        'CDSBP' : [component.loc[i, 'CDSBP']],
                        'CC': [component.loc[i,'CC']],
                        'LC': [component.loc[i,'LC']],
                        'LZC': [component.loc[i,'LZC']],
                        'RB': [component.loc[i,'RB']],
                        'SC': [component.loc[i,'SC']]
                    })
                    df = pd.concat([df, temp_df], ignore_index=True)
                    break # Esci dal ciclo for se hai trovato il file
    return df

In [3]:
path = "../../projects"
projects = []
code_smells = ['CDSBP', 'CC', 'LC', 'LC', 'LZC', 'RB', 'SC']
# Itera su tutti i file e le cartelle nella cartella specificata
for item in os.listdir(path):
    # Se l'elemento nella cartella è una cartella, aggiungi il nome alla lista
    if os.path.isdir(os.path.join(path, item)):
       projects.append(item)
    # Stampa la lista di nomi dei progetti
print(projects)

['ant-ivy-2.0.0-alpha2', 'ant-rel-1.8.3', 'cassandra-cassandra-1.0.0', 'elasticsearch-v0.19.0', 'hadoop-release-0.6.0', 'hive-release-0.9.0', 'hsqldb-2.2.8', 'karaf-karaf-2.3.0', 'lucene-releases-lucene-solr-3.6.0', 'manifold-cf-release-0.6', 'nutch-release-1.4', 'pig-release-0.8.0', 'qpid-0.14', 'struts-STRUTS_2_3_4', 'xerces2-j-Xerces-J_2_3_0']


In [4]:
df = load_data()

['ant-ivy-2.0.0-alpha2', 'ant-rel-1.8.3', 'cassandra-cassandra-1.0.0', 'elasticsearch-v0.19.0', 'hadoop-release-0.6.0', 'hive-release-0.9.0', 'hsqldb-2.2.8', 'karaf-karaf-2.3.0', 'lucene-releases-lucene-solr-3.6.0', 'manifold-cf-release-0.6', 'nutch-release-1.4', 'pig-release-0.8.0', 'qpid-0.14', 'struts-STRUTS_2_3_4', 'xerces2-j-Xerces-J_2_3_0']
ant-ivy-2.0.0-alpha2
ant-rel-1.8.3
cassandra-cassandra-1.0.0
elasticsearch-v0.19.0
hadoop-release-0.6.0
hive-release-0.9.0
hsqldb-2.2.8
karaf-karaf-2.3.0
lucene-releases-lucene-solr-3.6.0
manifold-cf-release-0.6
nutch-release-1.4
pig-release-0.8.0
qpid-0.14
struts-STRUTS_2_3_4
xerces2-j-Xerces-J_2_3_0


In [5]:
df

Unnamed: 0,Project_name,Component,CDSBP,CC,LC,LZC,RB,SC
0,ant-ivy-2.0.0-alpha2,/*\n * Licensed to the Apache Software Founda...,0,0,0,0,0,0
1,ant-ivy-2.0.0-alpha2,/*\n * Licensed to the Apache Software Founda...,0,0,0,0,0,0
2,ant-ivy-2.0.0-alpha2,/*\n * Licensed to the Apache Software Founda...,0,0,0,0,0,0
3,ant-ivy-2.0.0-alpha2,/*\n * Licensed to the Apache Software Founda...,0,0,0,0,0,0
4,ant-ivy-2.0.0-alpha2,/*\n * Licensed to the Apache Software Founda...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
8139,xerces2-j-Xerces-J_2_3_0,"/*\n * The Apache Software License, Version 1....",0,0,0,0,0,0
8140,xerces2-j-Xerces-J_2_3_0,// XMLFilterImpl.java - base SAX2 filter imple...,0,0,0,0,0,0
8141,xerces2-j-Xerces-J_2_3_0,// XMLReaderAdapter.java - adapt an SAX2 XMLRe...,0,0,0,0,0,0
8142,xerces2-j-Xerces-J_2_3_0,// XMLReaderFactory.java - factory for creatin...,0,0,0,0,0,1


In [6]:
# Dizionario per salvare le predizioni
predictions = {}

# Dizionario per salvare le metriche
metrics = {}

# Loop attraverso i progetti
for project in projects:
    print("Current project:", project)

    # Caricamento dei dati specifici del progetto
    project_df = df[df['Project_name'] == project]

    # Inizializza il dizionario per il progetto corrente
    predictions[project] = {}
    metrics[project] = {}

    # Loop attraverso i code smell
    for smell in code_smells:
        print("Current code smell:", smell)

        X = project_df["Component"]
        y = project_df[str(smell)]

        if len(project_df) == 0:
            continue

        # Effettua lo split in train e test set
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

        # Definisci il classificatore
        classifier = MLPClassifier(verbose=False, hidden_layer_sizes=[10, 20, 20],activation='logistic')

        # Crea il pipeline
        pipeline = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('classifier', classifier),
        ], verbose=False)

        # Addestra il modello
        pipeline.fit(X_train, y_train)

        # Valuta il modello
        y_pred = pipeline.predict(X_test)

        # Salva le predizioni per lo smell corrente nel progetto corrente
        predictions[project][smell] = y_pred

        precision = precision_score(y_test, y_pred, average="macro")
        accuracy = accuracy_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred, average="macro")
        f1 = f1_score(y_test, y_pred, average="macro")
        mcc = matthews_corrcoef(y_test, y_pred)

        # Salva le metriche per lo smell corrente nel progetto corrente
        metrics[project][smell] = {
            "Precision": precision,
            "Accuracy": accuracy,
            "Recall": recall,
            "F1": f1,   # Aggiungi l'F1 score al dizionario delle metriche
            "MCC": mcc  # Aggiungi l'MCC al dizionario delle metriche
        }

        print("Accuracy:", accuracy)
        print("Recall:", recall)
        print("Precision:", precision)
        print("F1:", f1)
        print("MCC:", mcc)
        print("--------------")

Current project: ant-ivy-2.0.0-alpha2
Current code smell: CDSBP


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.963302752293578
Recall: 0.5
Precision: 0.481651376146789
F1: 0.49065420560747663
MCC: 0.0
--------------
Current code smell: CC




Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0
MCC: 0.0
--------------
Current code smell: LC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9908256880733946
Recall: 0.5
Precision: 0.4954128440366973
F1: 0.4976958525345622
MCC: 0.0
--------------
Current code smell: LC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9908256880733946
Recall: 0.5
Precision: 0.4954128440366973
F1: 0.4976958525345622
MCC: 0.0
--------------
Current code smell: LZC
Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0
MCC: 0.0
--------------
Current code smell: RB




Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0
MCC: 0.0
--------------
Current code smell: SC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.8715596330275229
Recall: 0.5
Precision: 0.43577981651376146
F1: 0.46568627450980393
MCC: 0.0
--------------
Current project: ant-rel-1.8.3
Current code smell: CDSBP




Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0
MCC: 1.0
--------------
Current code smell: CC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9847328244274809
Recall: 0.5
Precision: 0.49236641221374045
F1: 0.49615384615384617
MCC: 0.0
--------------
Current code smell: LC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9618320610687023
Recall: 0.5
Precision: 0.48091603053435117
F1: 0.49027237354085607
MCC: 0.0
--------------
Current code smell: LC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9618320610687023
Recall: 0.5
Precision: 0.48091603053435117
F1: 0.49027237354085607
MCC: 0.0
--------------
Current code smell: LZC
Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0
MCC: 0.0
--------------
Current code smell: RB




Accuracy: 0.9961832061068703
Recall: 0.9782608695652174
Precision: 0.9979166666666667
F1: 0.987845047552772
MCC: 0.9759796267771463
--------------
Current code smell: SC




Accuracy: 0.9274809160305344
Recall: 0.9216768064005334
Precision: 0.9142202210397052
F1: 0.9177788054968288
MCC: 0.8358637686957031
--------------
Current project: cassandra-cassandra-1.0.0
Current code smell: CDSBP


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.8051282051282052
Recall: 0.5
Precision: 0.4025641025641026
F1: 0.4460227272727273
MCC: 0.0
--------------
Current code smell: CC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9948717948717949
Recall: 0.5
Precision: 0.49743589743589745
F1: 0.4987146529562982
MCC: 0.0
--------------
Current code smell: LC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9846153846153847
Recall: 0.5
Precision: 0.49230769230769234
F1: 0.49612403100775193
MCC: 0.0
--------------
Current code smell: LC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9846153846153847
Recall: 0.5
Precision: 0.49230769230769234
F1: 0.49612403100775193
MCC: 0.0
--------------
Current code smell: LZC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9794871794871794
Recall: 0.5
Precision: 0.4897435897435897
F1: 0.4948186528497409
MCC: 0.0
--------------
Current code smell: RB




Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0
MCC: 0.0
--------------
Current code smell: SC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9692307692307692
Recall: 0.5
Precision: 0.4846153846153846
F1: 0.49218749999999994
MCC: 0.0
--------------
Current project: elasticsearch-v0.19.0
Current code smell: CDSBP


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9886845827439887
Recall: 0.5
Precision: 0.49434229137199437
F1: 0.4971550497866287
MCC: 0.0
--------------
Current code smell: CC
Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0
MCC: 0.0
--------------
Current code smell: LC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9971711456859972
Recall: 0.5
Precision: 0.4985855728429986
F1: 0.49929178470254953
MCC: 0.0
--------------
Current code smell: LC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9971711456859972
Recall: 0.5
Precision: 0.4985855728429986
F1: 0.49929178470254953
MCC: 0.0
--------------
Current code smell: LZC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9900990099009901
Recall: 0.5
Precision: 0.49504950495049505
F1: 0.4975124378109453
MCC: 0.0
--------------
Current code smell: RB
Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0
MCC: 0.0
--------------
Current code smell: SC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9801980198019802
Recall: 0.5
Precision: 0.4900990099009901
F1: 0.495
MCC: 0.0
--------------
Current project: hadoop-release-0.6.0
Current code smell: CDSBP


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9029126213592233
Recall: 0.5
Precision: 0.45145631067961167
F1: 0.4744897959183674
MCC: 0.0
--------------
Current code smell: CC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9902912621359223
Recall: 0.5
Precision: 0.49514563106796117
F1: 0.4975609756097561
MCC: 0.0
--------------
Current code smell: LC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9805825242718447
Recall: 0.5
Precision: 0.49029126213592233
F1: 0.4950980392156863
MCC: 0.0
--------------
Current code smell: LC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9805825242718447
Recall: 0.5
Precision: 0.49029126213592233
F1: 0.4950980392156863
MCC: 0.0
--------------
Current code smell: LZC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9320388349514563
Recall: 0.5
Precision: 0.46601941747572817
F1: 0.4824120603015076
MCC: 0.0
--------------
Current code smell: RB




Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0
MCC: 0.0
--------------
Current code smell: SC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.8155339805825242
Recall: 0.5
Precision: 0.4077669902912621
F1: 0.4491978609625668
MCC: 0.0
--------------
Current project: hive-release-0.9.0
Current code smell: CDSBP
Current code smell: CC
Current code smell: LC
Current code smell: LC
Current code smell: LZC
Current code smell: RB
Current code smell: SC
Current project: hsqldb-2.2.8
Current code smell: CDSBP


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9361702127659575
Recall: 0.5
Precision: 0.46808510638297873
F1: 0.4835164835164835
MCC: 0.0
--------------
Current code smell: CC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9547872340425532
Recall: 0.5
Precision: 0.4773936170212766
F1: 0.4884353741496599
MCC: 0.0
--------------
Current code smell: LC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9281914893617021
Recall: 0.5
Precision: 0.4640957446808511
F1: 0.48137931034482756
MCC: 0.0
--------------
Current code smell: LC




Accuracy: 0.9122340425531915
Recall: 0.781863525416534
Precision: 0.6976190476190476
F1: 0.7296437520427062
MCC: 0.47202373436666367
--------------
Current code smell: LZC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9521276595744681
Recall: 0.5
Precision: 0.47606382978723405
F1: 0.4877384196185286
MCC: 0.0
--------------
Current code smell: RB


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.8856382978723404
Recall: 0.5
Precision: 0.4428191489361702
F1: 0.4696755994358251
MCC: 0.0
--------------
Current code smell: SC




Accuracy: 0.925531914893617
Recall: 0.8839857651245552
Precision: 0.9144127754194462
F1: 0.8977466977466977
MCC: 0.7978185423937614
--------------
Current project: karaf-karaf-2.3.0
Current code smell: CDSBP
Current code smell: CC
Current code smell: LC
Current code smell: LC
Current code smell: LZC
Current code smell: RB
Current code smell: SC
Current project: lucene-releases-lucene-solr-3.6.0
Current code smell: CDSBP
Current code smell: CC
Current code smell: LC
Current code smell: LC
Current code smell: LZC
Current code smell: RB
Current code smell: SC
Current project: manifold-cf-release-0.6
Current code smell: CDSBP
Current code smell: CC
Current code smell: LC
Current code smell: LC
Current code smell: LZC
Current code smell: RB
Current code smell: SC
Current project: nutch-release-1.4
Current code smell: CDSBP


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.7808219178082192
Recall: 0.5
Precision: 0.3904109589041096
F1: 0.43846153846153846
MCC: 0.0
--------------
Current code smell: CC




Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0
MCC: 0.0
--------------
Current code smell: LC




Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0
MCC: 0.0
--------------
Current code smell: LC




Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0
MCC: 0.0
--------------
Current code smell: LZC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9178082191780822
Recall: 0.5
Precision: 0.4589041095890411
F1: 0.47857142857142854
MCC: 0.0
--------------
Current code smell: RB


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9863013698630136
Recall: 0.5
Precision: 0.4931506849315068
F1: 0.496551724137931
MCC: 0.0
--------------
Current code smell: SC




Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0
MCC: 0.0
--------------
Current project: pig-release-0.8.0
Current code smell: CDSBP


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9420849420849421
Recall: 0.5
Precision: 0.47104247104247104
F1: 0.485089463220676
MCC: 0.0
--------------
Current code smell: CC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9691119691119691
Recall: 0.5
Precision: 0.48455598455598453
F1: 0.492156862745098
MCC: 0.0
--------------
Current code smell: LC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9961389961389961
Recall: 0.5
Precision: 0.4980694980694981
F1: 0.4990328820116054
MCC: 0.0
--------------
Current code smell: LC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9961389961389961
Recall: 0.5
Precision: 0.4980694980694981
F1: 0.4990328820116054
MCC: 0.0
--------------
Current code smell: LZC
Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0
MCC: 0.0
--------------
Current code smell: RB


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.972972972972973
Recall: 0.5
Precision: 0.4864864864864865
F1: 0.4931506849315069
MCC: 0.0
--------------
Current code smell: SC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9498069498069498
Recall: 0.5
Precision: 0.4749034749034749
F1: 0.4871287128712872
MCC: 0.0
--------------
Current project: qpid-0.14
Current code smell: CDSBP
Current code smell: CC
Current code smell: LC
Current code smell: LC
Current code smell: LZC
Current code smell: RB
Current code smell: SC
Current project: struts-STRUTS_2_3_4
Current code smell: CDSBP
Current code smell: CC
Current code smell: LC
Current code smell: LC
Current code smell: LZC
Current code smell: RB
Current code smell: SC
Current project: xerces2-j-Xerces-J_2_3_0
Current code smell: CDSBP




Accuracy: 0.9942196531791907
Recall: 0.9964285714285714
Precision: 0.9852941176470589
F1: 0.9907451987374953
MCC: 0.9816595449466552
--------------
Current code smell: CC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.930635838150289
Recall: 0.5
Precision: 0.4653179190751445
F1: 0.4820359281437126
MCC: 0.0
--------------
Current code smell: LC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.976878612716763
Recall: 0.5
Precision: 0.4884393063583815
F1: 0.49415204678362573
MCC: 0.0
--------------
Current code smell: LC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.976878612716763
Recall: 0.5
Precision: 0.4884393063583815
F1: 0.49415204678362573
MCC: 0.0
--------------
Current code smell: LZC


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9913294797687862
Recall: 0.5
Precision: 0.4956647398843931
F1: 0.49782293178519593
MCC: 0.0
--------------
Current code smell: RB


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9393063583815029
Recall: 0.5
Precision: 0.46965317919075145
F1: 0.4843517138599106
MCC: 0.0
--------------
Current code smell: SC




Accuracy: 0.9624277456647399
Recall: 0.5
Precision: 0.48121387283236994
F1: 0.49042709867452133
MCC: 0.0
--------------


  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# Stampa le predizioni per ogni progetto e smell
for project, smell_predictions in predictions.items():
    print("Project:", project)
    for smell, pred_list in smell_predictions.items():
        print("Code Smell:", smell)
        print(pred_list)
        print("--------------")


Project: ant-ivy-2.0.0-alpha2
Code Smell: CDSBP
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
--------------
Code Smell: CC
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
--------------
Code Smell: LC
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
--------------
Code Smell: LZC
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [None]:
# Stampa le metriche per ogni progetto e smell
for project, smell_metrics in metrics.items():
    print("Project:", project)
    for smell, metric_values in smell_metrics.items():
        print("Code Smell:", smell)
        print(metric_values)
        print("--------------")

In [14]:
metrics['pig-release-0.8.0']

{'CDSBP': {'Precision': 0.47104247104247104,
  'Accuracy': 0.9420849420849421,
  'Recall': 0.5,
  'F1': 0.485089463220676,
  'MCC': 0.0},
 'CC': {'Precision': 0.48455598455598453,
  'Accuracy': 0.9691119691119691,
  'Recall': 0.5,
  'F1': 0.492156862745098,
  'MCC': 0.0},
 'LC': {'Precision': 0.4980694980694981,
  'Accuracy': 0.9961389961389961,
  'Recall': 0.5,
  'F1': 0.4990328820116054,
  'MCC': 0.0},
 'LZC': {'Precision': 1.0,
  'Accuracy': 1.0,
  'Recall': 1.0,
  'F1': 1.0,
  'MCC': 0.0},
 'RB': {'Precision': 0.4864864864864865,
  'Accuracy': 0.972972972972973,
  'Recall': 0.5,
  'F1': 0.4931506849315069,
  'MCC': 0.0},
 'SC': {'Precision': 0.4749034749034749,
  'Accuracy': 0.9498069498069498,
  'Recall': 0.5,
  'F1': 0.4871287128712872,
  'MCC': 0.0}}

#### Il CountVectorizer genera una rappresentazione vettoriale in cui ogni parola unica diventa una feature e la frequenza di ciascuna parola viene conteggiata. Il TfidfTransformer, invece, calcola la frequenza termine-frequenza inversa del documento (TF-IDF) delle feature estratte dal CountVectorizer. Il TF-IDF è una misura statistica che tiene conto sia della frequenza della parola in un documento (TF) che della sua rarità nell'intero corpus (IDF). Questo aiuta a pesare le parole più significative rispetto a quelle comuni. Il TfidfTransformer moltiplica la matrice delle frequenze delle parole per il peso TF-IDF per ottenere una rappresentazione vettoriale ponderata.