In [7]:
import pandas as pd
import numpy as np
import ast
import re
import chardet

%run "./support_functions.ipynb"

file_path = "./BPMN 2.csv"
output_file_path = "./BPMNcleaned.csv"

# Look up the encoding of the input file (get_file_encoding is presumably
# provided by support_functions.ipynb, loaded via %run above)
input_file_encoding = get_file_encoding(file_path)
print(f"L'encoding del file '{file_path}' è: {input_file_encoding}")

# Load the ';'-separated input file into a DataFrame
df = pd.read_csv(file_path, sep=';', engine='python', encoding=input_file_encoding)

# Keep only the first and the second-to-last columns
# (reported as CollectionName and Labels by df.info() below)
df = df.iloc[:, [0,-2]]

# Remove duplicated rows
df = df.drop_duplicates()

# Replace missing values with empty strings, then split every string cell on
# the "^^^" separator so that each cell holds a list of values
df = df.fillna('')
df = df.applymap(lambda x: x.split("^^^") if isinstance(x, str) else x)

# Write the result exactly once, after all cleaning steps, with the same ';'
# separator the next cell uses to read it back. (An earlier intermediate write
# with the default ',' separator was redundant and could leave a file in the
# wrong format behind if a later step failed.)
df.to_csv(output_file_path, index=False, sep=';')

df.info()
df.head(15)
 

L'encoding del file './BPMN 2.csv' è: utf-8
<class 'pandas.core.frame.DataFrame'>
Index: 18247 entries, 0 to 22575
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   CollectionName  18247 non-null  object
 1   Labels          18247 non-null  object
dtypes: object(2)
memory usage: 427.7+ KB


Unnamed: 0,CollectionName,Labels
0,[BIT],"[s00001177, s00001177, s00001430, s00001430, s..."
1,[BIT],"[s00000262, s00000266, s00000267, s00000268, s..."
2,[BIT],"[s00000309, s00000310, s00000311, s00000312, s..."
3,[BIT],"[s00000336, s00000337, s00000338, s00000339]"
4,[BIT],"[s00000256, s00000307, s00000335, s00000358, s..."
5,[BIT],"[s00000843, s00000843, s00001196, s00001196, s..."
6,[BIT],"[s00000520, s00000527]"
7,[BIT],"[s00001294, s00001294, s00001355, s00001355, s..."
8,[BIT],"[s00000409, s00000598, s00000599, s00000600, s..."
9,[BIT],"[s00000527, s00000621, s00000623, s00000624, s..."


In [8]:
import ast
import chardet
import pandas as pd
from ast import literal_eval
from langdetect import detect

%run "./support_functions.ipynb"

input_file_path = './BPMNcleaned.csv'
output_file_path = './BPMNcleanedlanguages.csv'

# langdetect is non-deterministic by default: fix its seed before the first
# detection so the same rows are kept/discarded on every run
from langdetect import DetectorFactory
DetectorFactory.seed = 0

# Look up the encoding of the input file and report it
input_file_encoding = get_file_encoding(input_file_path)
print(f"L'encoding del file '{input_file_path}' è: {input_file_encoding}.")

# Read the ';'-separated CSV file into a DataFrame
df = pd.read_csv(input_file_path, sep=';', engine='python', encoding=input_file_encoding)

# Tell whether a string is written in the language we want to keep
def is_desired_language(text, target_language='en'):
    """Return True when langdetect identifies ``text`` as ``target_language``.

    Args:
        text: The string to analyse.
        target_language: Language code the detection result is compared with.
            Defaults to 'en', the value that used to be hard-coded.

    Returns:
        bool: True if the detected language equals ``target_language``; False
        otherwise, including when detection raises an error.
    """
    try:
        return detect(text) == target_language
    except Exception:
        # Best-effort: text that cannot be classified counts as "not the
        # desired language". Unlike a bare ``except:``, this does not swallow
        # KeyboardInterrupt/SystemExit, so a long run can still be interrupted.
        return False
    
# Check the labels one by one
def is_desired_language_list(labels):
    """Return True as soon as one label, converted to ``str``, passes
    ``is_desired_language``; return False when none does or ``labels`` is empty."""
    for label in labels:
        if is_desired_language(str(label)):
            return True
    return False

# Parse the labels of every row, keep the rows in the desired language and
# report the rows that were discarded
def filter_dataframe(df):
    """Keep only the rows with at least one label in the desired language.

    Every value of the 'Labels' column is parsed with ``literal_eval`` (so it
    must be the string form of a Python literal); the returned frame holds the
    parsed values. The number of discarded rows and the rows themselves are
    printed.

    Args:
        df: DataFrame with a 'Labels' column. It is not modified.

    Returns:
        DataFrame: the rows for which ``is_desired_language_list`` is True.
    """
    # Work on a copy: the caller's DataFrame stays untouched, so the function
    # can be called again on the same input (literal_eval would fail on
    # already-parsed values).
    parsed_df = df.copy()
    parsed_df['Labels'] = parsed_df['Labels'].apply(literal_eval)
    parsed_df['is_desired_language'] = parsed_df['Labels'].apply(is_desired_language_list)

    discarded_rows = parsed_df[~parsed_df['is_desired_language']]
    filtered_df = parsed_df[parsed_df['is_desired_language']].drop(columns=['is_desired_language'])

    # Report the discarded rows ("\n" is a real newline; the former "/n" was
    # printed literally)
    print(f"Sono state eliminate {len(discarded_rows)} righe.")
    print(f"Righe eliminate: \n {discarded_rows}")

    return filtered_df

# Run the language filter on the loaded DataFrame
filtered_df = filter_dataframe(df)

# Store the remaining rows in a new ';'-separated CSV file, without the index
filtered_df.to_csv(
    output_file_path,
    sep=';',
    index=False,
)


L'encoding del file './BPMNcleaned.csv' è: utf-8
Sono state eliminate 5796
Righe eliminate: /n       CollectionName                                             Labels  \
777        ['BPMAI']  [ArztuntersuchtnachVollst?хndigkeit, Gewebewir...   
781        ['BPMAI']  [ANO, ANO, DefaultInputSet, DefaultInputSet, D...   
782        ['BPMAI']  [Datenbereitsvorhanden?, Ja, Ja, Ja, Kontendat...   
783        ['BPMAI']       [A, B, B, DefaultInputSet, DefaultOutputSet]   
788        ['BPMAI']                             [Writefinancialreport]   
...              ...                                                ...   
18241        ['eCH']  [52PatentpflichtAnlassabkl?хren, AusnahmenzurA...   
18243        ['eCH']  [Antragauf?ҐnderungVernehmlassungsweglangstatt...   
18244        ['eCH']  [Abteilungsleiter, allesi.O., allesi.O., Anpas...   
18245        ['eCH']  [Abschlagszahlungm??glich, Abschlagszahlungm??...   
18246        ['eCH']  [BesprechungmitBeteiligten, Bewilligungerteile...   

    

In [20]:
import ast

import chardet
import numpy as np
import pandas as pd

%run "./support_functions.ipynb"

input_file_path = './BPMNcleanedlanguages.csv'
output_training_file = './BPMNtraining.csv'
output_testing_file = './BPMNtesting.csv'

# Look up the encoding of the input file and report it
input_file_encoding = get_file_encoding(input_file_path)
print(f"L'encoding del file '{input_file_path}' è: {input_file_encoding}")

# Read the ';'-separated CSV file into a DataFrame
df = pd.read_csv(
    input_file_path,
    sep=';',
    engine='python',
    encoding=input_file_encoding,
)

# Split a DataFrame into a training part and a testing part
def split_train_test(data, test_ratio, seed=None):
    """Randomly split ``data`` into ``(train, test)`` according to ``test_ratio``.

    Args:
        data: DataFrame to split.
        test_ratio: Fraction of rows, between 0 and 1, assigned to the test
            set; the test size is ``int(len(data) * test_ratio)``.
        seed: Optional seed for a reproducible shuffle. When None (default),
            NumPy's global random state is used, exactly as before.

    Returns:
        tuple: ``(train, test)`` DataFrames holding disjoint rows of ``data``.

    Raises:
        ValueError: If ``test_ratio`` is outside the [0, 1] interval.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    # Both the np.random module and a Generator expose permutation(); a
    # dedicated Generator is only created when the caller asks for a seed.
    rng = np.random if seed is None else np.random.default_rng(seed)
    shuffled_indices = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

# Split a DataFrame into one DataFrame per CollectionName value
def split_by_collectionname(data):
    """Return a dict mapping each distinct 'CollectionName' value to a copy of
    the rows of ``data`` that carry that value."""
    names = data['CollectionName']
    return {name: data[names == name].copy() for name in names.unique()}

# Split every CollectionName group into training and testing rows
def creating_train_test_dataframe(data, test_ratio):
    """Build training and testing DataFrames split separately per CollectionName.

    ``data`` is divided with ``split_by_collectionname``; each group is split
    with ``split_train_test`` and the per-group parts are concatenated.

    Args:
        data: DataFrame with a 'CollectionName' column.
        test_ratio: Fraction of each group assigned to the testing set.

    Returns:
        tuple: ``(training_data, testing_data)`` DataFrames. Both are empty
        (with the columns of ``data``) when ``data`` contains no group.
    """
    train_parts = []
    test_parts = []
    for group in split_by_collectionname(data).values():
        train_part, test_part = split_train_test(group, test_ratio)
        train_parts.append(train_part)
        test_parts.append(test_part)

    # pd.concat rejects an empty list, so return empty frames explicitly
    if not train_parts:
        return data.iloc[0:0].copy(), data.iloc[0:0].copy()

    # Concatenate once, after the loop, instead of on every iteration
    return pd.concat(train_parts), pd.concat(test_parts)
        
# Fix NumPy's global random seed so the shuffle, and therefore the training
# and testing files, is identical on every run
SEED = 42
np.random.seed(SEED)

# Randomly assign 30% of the rows of every CollectionName to testing and the
# rest to training
training_data, testing_data = creating_train_test_dataframe(df, 0.3)


# Print the number of rows per domain in the testing DataFrame and its first rows
print("Numero di diversi domini nel testing_data:")
print(testing_data['CollectionName'].value_counts())
print(testing_data.head())

# Print the number of rows per domain in the training DataFrame and its first rows
print("\nNumero di diversi domini nel training_data:")
print(training_data['CollectionName'].value_counts())
print(training_data.head())

# Save the training and testing sets to their respective files
training_data.to_csv(output_training_file, index=False, sep=';')
testing_data.to_csv(output_testing_file, index=False, sep=';')

L'encoding del file './BPMNcleanedlanguages.csv' è: utf-8
Numero di diversi domini nel testing_data:
CollectionName
['BPMAI']      3015
['Camunda']     481
['BIT']         233
['eCH']           6
Name: count, dtype: int64
    CollectionName                                             Labels
552        ['BIT']  ['s00001950', 's00001950', 's00004213', 's0000...
331        ['BIT']  ['s00002058', 's00002058', 's00002058', 's0000...
161        ['BIT']  ['s00001789', 's00001789', 's00001789', 's0000...
60         ['BIT']  ['s00001130', 's00001130', 's00001130', 's0000...
477        ['BIT']  ['s00001397', 's00001397', 's00001397', 's0000...

Numero di diversi domini nel training_data:
CollectionName
['BPMAI']      7035
['Camunda']    1123
['BIT']         544
['eCH']          14
Name: count, dtype: int64
    CollectionName                                             Labels
520        ['BIT']  ['s00003750', 's00003750', 's00003750', 's0000...
400        ['BIT']                         ['s000025