In [1]:
from pandas import read_csv, to_datetime

from sberpm import DataHolder
from sberpm.ml.factor_analysis import FactorAnalysis
from sberpm.ml.factor_analysis._wrapper_factor_analysis import WrapperFactorAnalysis

#### FactorAnalysis from tutorial

In [2]:
# Tutorial event log; a bare file name, so it is resolved against the
# notebook's working directory.
path = "InternationalDeclarations.csv"

# Wrap the log in a DataHolder, naming the columns that carry the case id,
# the activity label and the start timestamp, and giving the timestamp format.
data_holder = DataHolder(
    data=path,
    id_column="id",
    activity_column="concept:name",
    start_timestamp_column="time:timestamp",
    time_format="%Y-%m-%d %H:%M:%S",
    utc=True,
)

In [3]:
# Configure the factor analysis on the tutorial log: a numeric target column
# and the candidate factor columns, grouped by kind.
fa_inst = FactorAnalysis(
    data_holder=data_holder,
    target_column="case:AdjustedAmount",
    type_of_target="number",
    categorical_cols=[
        "org:resource",
        "case:Permit ActivityNumber",
        "org:role",
    ],
    numeric_cols=[
        "case:Amount",
        "case:RequestedAmount",
        "case:Permit RequestedBudget",
    ],
    date_cols=["time:timestamp"],
    extended_search=True,
    count_others=True,
)

In [4]:
%%time

fa_inst.apply()

Too many unique values in case:Permit ActivityNumber
CPU times: user 3.7 s, sys: 3.02 s, total: 6.72 s
Wall time: 6.32 s


Unnamed: 0,org:resource,case:Permit ActivityNumber,org:role,case:Amount,case:RequestedAmount,case:Permit RequestedBudget,time:timestamp,Прочее
results_extended,0.000152,0.032259,0.12237,60.048425,37.767834,2.010445,0.018238,0.000277


#### WrapperFactorAnalysis with the same parameters and data as the tutorial

In [5]:
# Load the same file the DataHolder cell reads (`path`), so both analyses
# run on the same input instead of relying on a duplicated file-name literal.
df = read_csv(path)

time_column = "time:timestamp"
time_format = "%Y-%m-%d %H:%M:%S"
# Parse the timestamps up front with an explicit format; errors="raise" makes
# a malformed value fail in this cell rather than later in the analysis.
df[time_column] = to_datetime(df[time_column], format=time_format, errors="raise", utc=True)

df.head(2)

Unnamed: 0,id,org:resource,concept:name,time:timestamp,org:role,case:Permit travel permit number,case:DeclarationNumber,case:Amount,case:RequestedAmount,case:Permit TaskNumber,...,case:concept:name,case:Permit OrganizationalEntity,case:travel permit number,case:Permit RequestedBudget,case:id,case:Permit ID,case:Permit id,case:BudgetNumber,case:Permit ActivityNumber,case:AdjustedAmount
0,rv_travel permit 76455_6,STAFF MEMBER,Start trip,2016-10-04 22:00:00+00:00,EMPLOYEE,travel permit number 76456,declaration number 76458,39.664561,39.664561,UNKNOWN,...,declaration 76457,organizational unit 65458,travel permit number 76456,41.613445,declaration 76457,travel permit 76455,travel permit 76455,budget 144133,activity 46005,39.664561
1,rv_travel permit 76455_7,STAFF MEMBER,End trip,2016-10-04 22:00:00+00:00,EMPLOYEE,travel permit number 76456,declaration number 76458,39.664561,39.664561,UNKNOWN,...,declaration 76457,organizational unit 65458,travel permit number 76456,41.613445,declaration 76457,travel permit 76455,travel permit 76455,budget 144133,activity 46005,39.664561


In [6]:
# Same analysis through the wrapper: it is given the DataFrame plus two
# parameter groups -- the column notation and the settings that mirror the
# FactorAnalysis arguments used earlier in the notebook.
# NOTE(review): running this cell prints DataHolder warnings that
# 'time_format' is not set, even though the timestamp column was already
# parsed in the previous cell -- confirm whether the notation should carry a
# time format.
factors = WrapperFactorAnalysis(
    data=df,
    model_params={
        "notation_params": {
            "id_col": "id",
            "status_col": "concept:name",
            "date_col": "time:timestamp",
            "date_end_col": None,
        },
        "model_params": {
            "target_column": "case:AdjustedAmount",
            "type_of_target": "number",
            "categorical_cols": [
                "org:resource",
                "case:Permit ActivityNumber",
                "org:role",
            ],
            "numeric_cols": [
                "case:Amount",
                "case:RequestedAmount",
                "case:Permit RequestedBudget",
            ],
            "date_cols": ["time:timestamp"],
            "extended_search": True,
            "count_others": True,
        },
    },
)

DataHolder: 'time_format' is not set, recommended to specify it for correct time conversion, e.g., time_format='%d-%m-%Y %H:%M:%S'
DataHolder: timestamp auto conversion will be done. 'dayfirst' is not set, in ambiguous cases it will be considered as False.
DataHolder: timestamp auto conversion will be done. 'yearfirst' is not set, in ambiguous cases it will be considered as False.


In [7]:
%%time

# Run the wrapped analysis and keep the result; it is displayed in the next cell.
factors_output = factors.run_model()

Too many unique values in case:Permit ActivityNumber
CPU times: user 3.84 s, sys: 2.8 s, total: 6.64 s
Wall time: 4.83 s


In [8]:
# Display the wrapper's result table.
# NOTE(review): in the saved output these values appear as integers (e.g. 152)
# where FactorAnalysis.apply() reported floats (0.000152), and the
# case:Amount / case:RequestedAmount entries differ slightly between the two
# runs -- confirm whether the formatting and the run-to-run variation are expected.
factors_output

Unnamed: 0,org:resource,case:Permit ActivityNumber,org:role,case:Amount,case:RequestedAmount,case:Permit RequestedBudget,time:timestamp,Прочее
results_extended,152,32259,12237,60041421,37774838,2010445,18238,277
