In [2]:
import pandas as pd
import numpy as np
import ast
import re
import chardet

%run "./support_functions.ipynb"

file_path = "./BPMN 2.csv"
output_file_path = "./BPMNcleaned.csv"

# Detect the encoding of the raw file; get_file_encoding is expected to come
# from support_functions.ipynb, which is loaded with %run before this point.
input_file_encoding = get_file_encoding(file_path)
print(f"L'encoding del file '{file_path}' è: {input_file_encoding}")

# Load the raw ';'-separated CSV into a DataFrame.
df = pd.read_csv(file_path, sep=';', engine='python', encoding=input_file_encoding)

# Keep only the first and the second-to-last column; all others are discarded.
df = df.iloc[:, [0, -2]]

# No intermediate save here: the output file is written exactly once, at the
# end, with the ';' separator that the later cells read it back with.

# Remove duplicate rows. This has to happen before the split below, because
# list-valued cells are unhashable and cannot be de-duplicated.
df = df.drop_duplicates()

# Replace missing values with '' and split every string cell on the "^^^"
# separator, so each cell becomes a list of its parts.
df = df.fillna('')
df = df.applymap(lambda x: x.split("^^^") if isinstance(x, str) else x)

# Write the cleaned data as a ';'-separated CSV without the index column.
df.to_csv(output_file_path, index=False, sep=';')

df.info()
df.head(15)
 

L'encoding del file './BPMN 2.csv' è: utf-8
<class 'pandas.core.frame.DataFrame'>
Index: 18247 entries, 0 to 22575
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   CollectionName  18247 non-null  object
 1   Labels          18247 non-null  object
dtypes: object(2)
memory usage: 427.7+ KB


Unnamed: 0,CollectionName,Labels
0,[BIT],"[s00001177, s00001177, s00001430, s00001430, s..."
1,[BIT],"[s00000262, s00000266, s00000267, s00000268, s..."
2,[BIT],"[s00000309, s00000310, s00000311, s00000312, s..."
3,[BIT],"[s00000336, s00000337, s00000338, s00000339]"
4,[BIT],"[s00000256, s00000307, s00000335, s00000358, s..."
5,[BIT],"[s00000843, s00000843, s00001196, s00001196, s..."
6,[BIT],"[s00000520, s00000527]"
7,[BIT],"[s00001294, s00001294, s00001355, s00001355, s..."
8,[BIT],"[s00000409, s00000598, s00000599, s00000600, s..."
9,[BIT],"[s00000527, s00000621, s00000623, s00000624, s..."


In [3]:
import ast
import chardet
import pandas as pd
from ast import literal_eval
from langdetect import detect

%run "./support_functions.ipynb"

input_file_path = './BPMNcleaned.csv'
output_file_path = './BPMNcleanedlanguages.csv'

# Detect the encoding of the cleaned CSV before reading it.
input_file_encoding = get_file_encoding(input_file_path)
print(f"L'encoding del file '{input_file_path}' è: {input_file_encoding}")

# Load the ';'-separated CSV into a DataFrame using the detected encoding.
df = pd.read_csv(
    input_file_path,
    encoding=input_file_encoding,
    engine='python',
    sep=';',
)

def is_desired_language(text, target_language='en'):
    """Tell whether `detect` classifies `text` as `target_language`.

    Args:
        text: The string to classify.
        target_language: Language code compared against the value returned
            by `detect`. Defaults to 'en', the language this notebook keeps.

    Returns:
        bool: True when the detected code equals `target_language`; False
        when it differs or when detection raised an exception.
    """
    try:
        return detect(text) == target_language
    except Exception:
        # Best-effort: a label that cannot be classified counts as "not the
        # desired language". Only Exception is caught (not a bare except) so
        # KeyboardInterrupt / SystemExit still stop a long-running filter.
        return False
    
def is_desired_language_list(labels):
    """Return True as soon as one label is in the desired language.

    Each element of `labels` is converted with `str()` and passed to
    `is_desired_language`; iteration stops at the first match. An empty
    iterable yields False.
    """
    for entry in labels:
        if is_desired_language(str(entry)):
            return True
    return False

def filter_dataframe(df):
    """Keep only the rows whose labels pass the language check.

    The 'Labels' column is expected to hold the string representation of a
    Python list (as written to CSV by an earlier step); string cells are
    parsed with `literal_eval`, and cells that are not strings are left
    as they are. Each row is then tested with `is_desired_language_list`.

    The input frame is not modified: all work happens on a copy, so the
    function can safely be called again on the same DataFrame.

    Args:
        df: DataFrame with a 'Labels' column.

    Returns:
        A new DataFrame with the rows that passed the check, the parsed
        'Labels' column, and no helper 'is_desired_language' column.

    Side effects:
        Prints the discarded rows (including the helper flag column).
    """
    working = df.copy()

    # Parse only string cells; already-parsed values pass through untouched.
    working['Labels'] = working['Labels'].apply(
        lambda labels: literal_eval(labels) if isinstance(labels, str) else labels
    )

    # astype(bool) keeps the mask boolean even for an empty frame, where
    # apply() would otherwise not yield a bool-dtype Series.
    keep_mask = working['Labels'].apply(is_desired_language_list).astype(bool)
    working['is_desired_language'] = keep_mask

    discarded_rows = working[~keep_mask]
    filtered_df = working[keep_mask].drop(columns=['is_desired_language'])

    # Show the rows that were removed.
    print("Righe eliminate:")
    print(discarded_rows)

    return filtered_df

# Run the language filter over the loaded DataFrame.
filtered_df = filter_dataframe(df)

# Persist the surviving rows as a ';'-separated CSV, without the index column.
filtered_df.to_csv(
    output_file_path,
    sep=';',
    index=False,
)


L'encoding del file './BPMNcleaned.csv' è: utf-8
Righe eliminate:
      CollectionName                                             Labels  \
777        ['BPMAI']  [ArztuntersuchtnachVollst?хndigkeit, Gewebewir...   
781        ['BPMAI']  [ANO, ANO, DefaultInputSet, DefaultInputSet, D...   
782        ['BPMAI']  [Datenbereitsvorhanden?, Ja, Ja, Ja, Kontendat...   
783        ['BPMAI']       [A, B, B, DefaultInputSet, DefaultOutputSet]   
788        ['BPMAI']                             [Writefinancialreport]   
...              ...                                                ...   
18240        ['eCH']  [52PatentpflichtAnlassabkl?хren, Bewilligunger...   
18241        ['eCH']  [52PatentpflichtAnlassabkl?хren, AusnahmenzurA...   
18243        ['eCH']  [Antragauf?ҐnderungVernehmlassungsweglangstatt...   
18244        ['eCH']  [Abteilungsleiter, allesi.O., allesi.O., Anpas...   
18246        ['eCH']  [BesprechungmitBeteiligten, Bewilligungerteile...   

       is_desired_language  
777 

In [4]:
import ast
import chardet
import pandas as pd

%run "./support_functions.ipynb"

input_file_path = './BPMNcleanedlanguages.csv'
output_training_file = './BPMNtraining.csv'
output_testing_file = './BPMNtesting.csv'

# Detect the encoding of the language-filtered CSV before reading it.
input_file_encoding = get_file_encoding(input_file_path)
print(f"L'encoding del file '{input_file_path}' è: {input_file_encoding}")

# Load the ';'-separated CSV into a DataFrame using the detected encoding.
df = pd.read_csv(
    input_file_path,
    encoding=input_file_encoding,
    engine='python',
    sep=';',
)

def split_train_test(data, test_ratio, random_state=None):
    """Randomly split `data` into a training part and a testing part.

    Relies on the module-level name `np` (numpy) being in scope.

    Args:
        data: Object supporting `len()` and positional `.iloc[...]`
            indexing (a pandas DataFrame in this notebook).
        test_ratio: Fraction of the rows assigned to the testing part;
            must lie in [0, 1]. The testing size is
            `int(len(data) * test_ratio)`, i.e. rounded down.
        random_state: Optional seed. When None (the default) the shuffle
            uses NumPy's global random state, so it can be controlled with
            `np.random.seed`. Otherwise a dedicated generator seeded with
            this value is used, giving the same split on every call.

    Returns:
        tuple: `(training_rows, testing_rows)`.

    Raises:
        ValueError: If `test_ratio` is outside [0, 1] (including NaN).
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    n_rows = len(data)
    if random_state is None:
        shuffled_indices = np.random.permutation(n_rows)
    else:
        shuffled_indices = np.random.default_rng(random_state).permutation(n_rows)

    # The first `test_set_size` shuffled positions form the testing part,
    # the remainder forms the training part.
    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

# Seed NumPy's global random state so the shuffle inside split_train_test
# (which calls np.random.permutation) gives the same split, and therefore the
# same output files, on every run.
SEED = 42
np.random.seed(SEED)

# Randomly assign this fraction of the rows to testing, the rest to training.
TEST_RATIO = 0.3
training_data, testing_data = split_train_test(df, TEST_RATIO)

# Preview the first rows of each split (head() shows 5 rows by default).
print(testing_data.head())
print(training_data.head())

# Save each split to its own ';'-separated CSV, without the index column.
training_data.to_csv(output_training_file, index=False, sep=';')
testing_data.to_csv(output_testing_file, index=False, sep=';')

L'encoding del file './BPMNcleanedlanguages.csv' è: utf-8
      CollectionName                                             Labels
11088    ['Camunda']  ['Bookpayment', 'Caseclosed', 'Caseclosed', 'C...
4953       ['BPMAI']  ['Cloturerledossier', 'demandedemodification',...
2161       ['BPMAI']  ['A', 'A', 'B', 'B', 'C', 'D', 'E', 'F', 'G', ...
4397       ['BPMAI']  ['Assemblemeals', 'AssignWorkEffort', 'Collect...
3863       ['BPMAI']  ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', ...
     CollectionName                                             Labels
7385      ['BPMAI']  ['Allgemeinuntersuchung', 'AngeboteinesTermins...
9477      ['BPMAI']  ['AddLarozepam0,5mgpo/sublinguallyPRN', 'Alter...
8526      ['BPMAI']  ['A', 'AA', 'AB', 'AB', 'AC', 'AC', 'AD', 'AD'...
1594      ['BPMAI']  ['A', 'AA', 'AB', 'AD', 'AE', 'AF', 'AF', 'AF'...
693         ['BIT']  ['s00001147', 's00001147', 's00001147', 's0000...
