# Activity Type Model (ATM)

This is the first part of the Activity Scheduler model. In our analysis, we will be using two input files: `activity.csv` and `person.csv`. The original dataset we are using is California household travel survey 2012.

## activity.csv
This file will have at least the following two columns: 

- **activity type**

- **activity end time**

## person.csv
This file will have at least the following two columns:

- **employment**

- **X_student**

Our goal is to use the information from these files to build a decision tree model that can predict the next activity type based on the previous activity type, end time, and personal attributes such as employment status and whether the person is a student.


In [13]:
# ATM implementation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast

In [14]:
act = pd.read_csv('seq_more_than_one.csv') 
df = pd.read_csv('Train_Test_sampleID.csv')
person = pd.read_csv('survey_person.csv')
train_id = df['Train_id']
test_id = df['Test_id']

In [15]:
test_id = test_id.dropna()
df_test = act[act['id'].isin(test_id)]


In [16]:
df_test

Unnamed: 0,id,purpose,start_time,end_time
0,10319851,"[1, 25, 1]","[12, 36, 52]","[32, 51, 68]"
1,10319852,"[1, 21, 1]","[12, 37, 43]","[37, 42, 50]"
5,10320364,"[1, 22, 1]","[12, 34, 63]","[33, 62, 11]"
7,10320531,"[1, 17, 19, 15, 1, 24, 1]","[12, 39, 41, 45, 49, 73, 81]","[38, 40, 44, 49, 69, 80, 11]"
10,10320534,"[1, 24, 1]","[12, 73, 81]","[72, 80, 11]"
...,...,...,...,...
60731,72116904,"[1, 2, 1]","[12, 27, 65]","[26, 62, 11]"
60733,72117981,"[1, 25, 1]","[12, 66, 78]","[60, 72, 11]"
60738,72118624,"[1, 23, 1]","[12, 67, 80]","[62, 77, 0]"
60739,72119121,"[1, 23, 1]","[12, 39, 46]","[38, 44, 70]"


In [17]:
import ast

# 假设原始的DataFrame叫做df
df = df_test
# 使用 `ast.literal_eval` 将字符串形式的列表转换回真正的列表
df['purpose'] = df['purpose'].apply(ast.literal_eval)
df['start_time'] = df['start_time'].apply(ast.literal_eval)
df['end_time'] = df['end_time'].apply(ast.literal_eval)

# 对于每一行，将 'purpose', 'start_time', 'end_time' 合并成一个列表
df['activity'] = df.apply(lambda row: [[purpose, start_time, end_time] for purpose, start_time, end_time in zip(row['purpose'], row['start_time'], row['end_time'])], axis=1)

def adjust_dep_time(activity_list):
    return [[purpose, start_time, end_time+96] if end_time < start_time else [purpose, start_time, end_time] for purpose, start_time, end_time in activity_list]

# Apply function to each row in the DataFrame
df['activity'] = df['activity'].apply(adjust_dep_time)

# 删除原始的 'purpose', 'start_time', 'end_time' 列
df = df.drop(['purpose', 'start_time', 'end_time'], axis=1)

# 打印出新的DataFrame
print(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['purpose'] = df['purpose'].apply(ast.literal_eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['start_time'] = df['start_time'].apply(ast.literal_eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['end_time'] = df['end_time'].apply(ast.literal_eval)


             id                                           activity
0      10319851           [[1, 12, 32], [25, 36, 51], [1, 52, 68]]
1      10319852           [[1, 12, 37], [21, 37, 42], [1, 43, 50]]
5      10320364          [[1, 12, 33], [22, 34, 62], [1, 63, 107]]
7      10320531  [[1, 12, 38], [17, 39, 40], [19, 41, 44], [15,...
10     10320534          [[1, 12, 72], [24, 73, 80], [1, 81, 107]]
...         ...                                                ...
60731  72116904           [[1, 12, 26], [2, 27, 62], [1, 65, 107]]
60733  72117981          [[1, 12, 60], [25, 66, 72], [1, 78, 107]]
60738  72118624           [[1, 12, 62], [23, 67, 77], [1, 80, 96]]
60739  72119121           [[1, 12, 38], [23, 39, 44], [1, 46, 70]]
60746  72120271          [[1, 26, 34], [20, 35, 60], [1, 62, 107]]

[11489 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['activity'] = df.apply(lambda row: [[purpose, start_time, end_time] for purpose, start_time, end_time in zip(row['purpose'], row['start_time'], row['end_time'])], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['activity'] = df['activity'].apply(adjust_dep_time)


In [6]:
df.to_csv('ground_truth.csv',index = False)

In [19]:
print(df_test['purpose'].head())

0                    [1, 25, 1]
1                    [1, 21, 1]
5                    [1, 22, 1]
7     [1, 17, 19, 15, 1, 24, 1]
10                   [1, 24, 1]
Name: purpose, dtype: object


In [21]:
def adjust_dep_time(row):
    adjusted_end_time = []
    for end, start in zip(row['end_time'], row['start_time']):
        if end < start:
            adjusted_end_time.append(end + 96)
        else:
            adjusted_end_time.append(end)
    return adjusted_end_time

# Apply function to each row in the DataFrame
df['end_time'] = df.apply(adjust_dep_time, axis=1)

# 对于每一行，将 'purpose', 'start_time', 'end_time' 合并成一个列表
df['activity'] = df.apply(lambda row: [[purpose, start_time, end_time] for purpose, start_time, end_time in zip(row['purpose'], row['start_time'], row['end_time'])], axis=1)

# 删除原始的 'purpose', 'start_time', 'end_time' 列
df = df.drop(['purpose', 'start_time', 'end_time'], axis=1)

# 打印出新的DataFrame
print(df)

             id                                           activity
0      10319851           [[1, 12, 32], [25, 36, 51], [1, 52, 68]]
1      10319852           [[1, 12, 37], [21, 37, 42], [1, 43, 50]]
5      10320364          [[1, 12, 33], [22, 34, 62], [1, 63, 107]]
7      10320531  [[1, 12, 38], [17, 39, 40], [19, 41, 44], [15,...
10     10320534          [[1, 12, 72], [24, 73, 80], [1, 81, 107]]
...         ...                                                ...
60731  72116904           [[1, 12, 26], [2, 27, 62], [1, 65, 107]]
60733  72117981          [[1, 12, 60], [25, 66, 72], [1, 78, 107]]
60738  72118624           [[1, 12, 62], [23, 67, 77], [1, 80, 96]]
60739  72119121           [[1, 12, 38], [23, 39, 44], [1, 46, 70]]
60746  72120271          [[1, 26, 34], [20, 35, 60], [1, 62, 107]]

[11489 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['end_time'] = df.apply(adjust_dep_time, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['activity'] = df.apply(lambda row: [[purpose, start_time, end_time] for purpose, start_time, end_time in zip(row['purpose'], row['start_time'], row['end_time'])], axis=1)


In [22]:
test_id = test_id.dropna()
train_id = train_id.dropna()

# convert string to list
act['purpose'] = act['purpose'].apply(ast.literal_eval)
act['start_time'] = act['start_time'].apply(ast.literal_eval)
act['end_time'] = act['end_time'].apply(ast.literal_eval)

# reset index
act.reset_index(drop=True, inplace=True)

# explode all lists at once and reset the index again
act = act.explode(['purpose', 'start_time', 'end_time']).reset_index(drop=True)
df_test = act[act['id'].isin(test_id)]
df_train = act[act['id'].isin(train_id)]

In [23]:
df_test = pd.merge(df_test, person, on='id', how='left')
df_train = pd.merge(df_train, person, on='id', how='left')
df_test = df_test.rename(columns={'start_time': 'arr_time', 'end_time': 'dep_time'})
df_train = df_train.rename(columns={'start_time': 'arr_time', 'end_time': 'dep_time'})

In [24]:
def adjust_dep_time(row):
    if row['dep_time'] < row['arr_time']:
        return row['dep_time'] + 96
    else:
        return row['dep_time']

# Apply function to each row in the DataFrame
df_test['dep_time'] = df_test.apply(adjust_dep_time, axis=1)
df_train['dep_time'] = df_train.apply(adjust_dep_time, axis=1)

# Decision tree parameter and classification learning

In [25]:
# Sort the data
df_next = df_train.sort_values(['id', 'dep_time'])

# Shift the purpose column to get the next activity type
df_next['next_purpose'] = df_next.groupby('id')['purpose'].shift(-1)

# Fill NaN values with a special category indicating no further activity
df_next['next_purpose'].fillna('None', inplace=True)
df_next['next_purpose'] = df_next['next_purpose'].apply(lambda x: int(x) if x != 'None' else x)

def convert_next_purpose(value):
    if value == 'None':  # pandas uses `pd.isnull` to check for NaN values
        return 'None'
    elif value == 1:
        return 1
    elif value in [2, 3]:
        return '2&3'
    else:
        return 'others'

df_next['new_next_purpose'] = df_next['next_purpose'].apply(convert_next_purpose)

In [26]:
from scipy.stats import entropy

def calculate_entropy(df):
    categories = ['None', 1, '2&3', 'others']
    dist = df['new_next_purpose'].value_counts(normalize=True).reindex(categories, fill_value=1e-10)
    return entropy(dist)

def information_gain(df, T):
    # 计算原始信息熵
    original_entropy = calculate_entropy(df)
    
    # 数据划分
    group1 = df[df['dep_time'] <= T]
    group2 = df[df['dep_time'] > T]

    # 计算划分后的信息熵
    entropy1 = calculate_entropy(group1)
    entropy2 = calculate_entropy(group2)

    # 计算信息增益
    entropy_after = len(group1) / len(df) * entropy1 + len(group2) / len(df) * entropy2
    return original_entropy - entropy_after

In [27]:
def find_best_T(df, Ts, threshold=0.03):
    # 初始化最大信息增益和最佳T
    max_info_gain = -np.inf
    best_T = None

    # 对于每个可能的T，计算信息增益
    for T in Ts:
        info_gain = information_gain(df, T)
        if info_gain > max_info_gain:
            max_info_gain = info_gain
            best_T = T

    # 如果最大信息增益大于阈值，则继续划分数据
    if max_info_gain > threshold:
        print(f"Best T: {best_T}, Max Information Gain: {max_info_gain}")

        # 对数据进行划分
        df_1 = df[df['dep_time']<=best_T]
        df_2 = df[df['dep_time']>best_T]
        Ts_1 = Ts[Ts<=best_T]
        Ts_2 = Ts[Ts>best_T]

        # 递归地寻找下一层的最佳T
        find_best_T(df_1, Ts_1, threshold)
        find_best_T(df_2, Ts_2, threshold)

# 定义可能的T列表
Ts = 4 * np.arange(0, 24, 1)
t_end = 76

case1 = df_next[(df_next['dep_time'] < t_end) & (df_next['employment'] == 1) & (df_next['purpose'] == 1)]
case2 = df_next[(df_next['dep_time'] < t_end) & (df_next['employment'] == 1) & (df_next['purpose'] != 1)]
case3 = df_next[(df_next['dep_time'] < t_end) & (df_next['employment'] != 1) & (df_next['X_student'] == 1)& (df_next['purpose'] == 1)]
case4 = df_next[(df_next['dep_time'] < t_end) & (df_next['employment'] != 1) & (df_next['X_student'] == 1)& (df_next['purpose'] == 1)]
case5 = df_next[(df_next['dep_time'] < t_end) & (df_next['employment'] != 1) & (df_next['X_student'] != 1)& (df_next['purpose'] == 1)]
case6 = df_next[(df_next['dep_time'] < t_end) & (df_next['employment'] != 1) & (df_next['X_student'] != 1)& (df_next['purpose'] == 1)]

print('case1:')
find_best_T(case1, Ts)
print('case2:')
find_best_T(case2, Ts)
print('case3:')
find_best_T(case3, Ts)
print('case4:')
find_best_T(case4, Ts)
print('case5:')
find_best_T(case5, Ts)
print('case6:')
find_best_T(case6, Ts)

case1:
Best T: 36, Max Information Gain: 0.13454035470021541
Best T: 60, Max Information Gain: 0.07064041294625012
case2:
Best T: 56, Max Information Gain: 0.030190402368426628
case3:
Best T: 36, Max Information Gain: 0.18657311275945965
Best T: 64, Max Information Gain: 0.055847861721615266
case4:
Best T: 36, Max Information Gain: 0.18657311275945965
Best T: 64, Max Information Gain: 0.055847861721615266
case5:
Best T: 36, Max Information Gain: 0.11083419115476523
Best T: 32, Max Information Gain: 0.05132716001504545
Best T: 64, Max Information Gain: 0.05677498189283986
case6:
Best T: 36, Max Information Gain: 0.11083419115476523
Best T: 32, Max Information Gain: 0.05132716001504545
Best T: 64, Max Information Gain: 0.05677498189283986


In [28]:
# Define the groups
case1 = df_next[(df_next['dep_time'] < t_end) & (df_next['employment'] == 1) & (df_next['purpose'] == 1)]
group1 = case1[(case1['dep_time']<=60)&(case1['dep_time']<=36)]
group2 = case1[(case1['dep_time']<=60)&(case1['dep_time']>36)]
group3 = case1[(case1['dep_time']>60)&(case1['dep_time']<=68)]
group4 = case1[(case1['dep_time']>60)&(case1['dep_time']>68)]

case2 = df_next[(df_next['dep_time'] < t_end) & (df_next['employment'] == 1) & (df_next['purpose'] != 1)]
group5 = case2[(case2['dep_time']<=56)]
group6 = case2[(case2['dep_time']<=56)]

case3 = df_next[(df_next['dep_time'] < t_end) & (df_next['employment'] != 1) & (df_next['X_student'] == 1)& (df_next['purpose'] == 1)]
group7 = case3[(case3['dep_time']<=32)]
group8 = case3[(case3['dep_time']>32)&(case3['dep_time']<=56)]
group9 = case3[(case3['dep_time']>32)&(case3['dep_time']>56)]

case4 = df_next[(df_next['dep_time'] < t_end) & (df_next['employment'] != 1) & (df_next['X_student'] == 1)& (df_next['purpose'] == 1)]
group10 = case3[(case3['dep_time']<=32)]
group11 = case4[(case4['dep_time']>32)&(case4['dep_time']<=56)]
group12 = case4[(case4['dep_time']>32)&(case4['dep_time']>56)]

case5 = df_next[(df_next['dep_time'] < t_end) & (df_next['employment'] != 1) & (df_next['X_student'] != 1)& (df_next['purpose'] == 1)]
group13 = case5[(case5['dep_time']<=36)&(case5['dep_time']<=32)]
group14 = case5[(case5['dep_time']<=36)&(case5['dep_time']>32)]
group15 = case5[(case5['dep_time']>36)&(case5['dep_time']<=60)]
group16 = case5[(case5['dep_time']>36)&(case5['dep_time']>60)]

case6 = df_next[(df_next['dep_time'] < t_end) & (df_next['employment'] != 1) & (df_next['X_student'] != 1)& (df_next['purpose'] == 1)]
group17 = case6[(case6['dep_time']<=36)&(case6['dep_time']<=32)]
group18 = case6[(case6['dep_time']<=36)&(case6['dep_time']>32)]
group19 = case6[(case6['dep_time']>36)&(case6['dep_time']<=60)]
group20 = case6[(case6['dep_time']>36)&(case6['dep_time']>60)]

print("Group 1 next purpose distribution:\n", group1['next_purpose'].value_counts(normalize=True),"sample:",len(group1))
print("\nGroup 2 next purpose distribution:\n", group2['next_purpose'].value_counts(normalize=True),"sample:",len(group2))
print("\nGroup 3 next purpose distribution:\n", group3['next_purpose'].value_counts(normalize=True),"sample:",len(group3))
print("\nGroup 4 next purpose distribution:\n", group4['next_purpose'].value_counts(normalize=True),"sample:",len(group4))
print("\nGroup 5 next purpose distribution:\n", group5['next_purpose'].value_counts(normalize=True),"sample:",len(group5))
print("\nGroup 6 next purpose distribution:\n", group6['next_purpose'].value_counts(normalize=True),"sample:",len(group6))
print("\nGroup 7 next purpose distribution:\n", group7['next_purpose'].value_counts(normalize=True),"sample:",len(group7))
print("\nGroup 8 next purpose distribution:\n", group8['next_purpose'].value_counts(normalize=True),"sample:",len(group8))
print("\nGroup 9 next purpose distribution:\n", group9['next_purpose'].value_counts(normalize=True),"sample:",len(group9))
print("\nGroup 10 next purpose distribution:\n", group10['next_purpose'].value_counts(normalize=True),"sample:",len(group10))
print("\nGroup 11 next purpose distribution:\n", group11['next_purpose'].value_counts(normalize=True),"sample:",len(group11))
print("\nGroup 12 next purpose distribution:\n", group12['next_purpose'].value_counts(normalize=True),"sample:",len(group12))
print("\nGroup 13 next purpose distribution:\n", group13['next_purpose'].value_counts(normalize=True),"sample:",len(group13))
print("\nGroup 14 next purpose distribution:\n", group14['next_purpose'].value_counts(normalize=True),"sample:",len(group14))
print("\nGroup 15 next purpose distribution:\n", group15['next_purpose'].value_counts(normalize=True),"sample:",len(group15))
print("\nGroup 16 next purpose distribution:\n", group16['next_purpose'].value_counts(normalize=True),"sample:",len(group16))
print("\nGroup 17 next purpose distribution:\n", group17['next_purpose'].value_counts(normalize=True),"sample:",len(group17))
print("\nGroup 18 next purpose distribution:\n", group18['next_purpose'].value_counts(normalize=True),"sample:",len(group18))
print("\nGroup 19 next purpose distribution:\n", group19['next_purpose'].value_counts(normalize=True),"sample:",len(group19))

Group 1 next purpose distribution:
 2       0.577248
1       0.153058
15      0.034326
21      0.027736
22      0.025999
25      0.025939
20      0.024561
19      0.020488
23      0.018810
3       0.014857
18      0.013658
14      0.011801
17      0.011622
12      0.010963
24      0.010304
26      0.006410
16      0.003834
13      0.002097
5       0.001498
6       0.001378
8       0.001078
9       0.000719
11      0.000659
None    0.000479
7       0.000180
10      0.000180
4       0.000120
Name: next_purpose, dtype: float64 sample: 16693

Group 2 next purpose distribution:
 2       0.219247
15      0.172216
25      0.092679
19      0.076376
1       0.060765
17      0.047229
20      0.044857
24      0.039818
22      0.038731
21      0.034680
18      0.034285
14      0.024306
16      0.023713
23      0.020453
None    0.019267
12      0.014228
26      0.011857
3       0.011560
13      0.005829
11      0.002371
8       0.002174
5       0.000889
6       0.000692
7       0.000593
9       0.0

## Decision tree

In [29]:
group_probs = {}

# List of all groups
groups = [group1, group2, group3, group4, group5, group6, group7, group8, group9, group10, group11, group12, group13,
         group13, group14, group15, group16, group17, group18, group19]

for i, group in enumerate(groups, 1):
    # Compute the probability distribution of 'next_purpose' in this group
    probs = group['next_purpose'].value_counts(normalize=True)
    
    # Store the distribution in the dictionary
    group_probs[f'group{i}'] = probs

In [30]:
def predict_next_purpose(dep_time, purpose, employment, X_student):
    t_end = 76
    if dep_time > t_end:
        if purpose == 1:
            return 'None'
        else:
            return 1
    elif employment == 1:
        if purpose == 1:
            if dep_time <= 36:
                probs = group_probs['group1']
            elif dep_time <= 60:
                probs = group_probs['group2']
            elif dep_time <= 68:
                probs = group_probs['group3']
            else:
                probs = group_probs['group4']
        else: # purpose != 1
            if dep_time <= 56:
                probs = group_probs['group5']
            else:
                probs = group_probs['group6']
    elif X_student == 1:
        if purpose == 1:
            if dep_time <= 32:
                probs = group_probs['group7']
            elif dep_time <= 56:
                probs = group_probs['group8']
            else:
                probs = group_probs['group9']
        else: # purpose != 1
            if dep_time <= 32:
                probs = group_probs['group10']
            elif dep_time <= 56:
                probs = group_probs['group11']
            else:
                probs = group_probs['group12']
    else: # employment != 1 and X_student != 1
        if purpose == 1:
            if dep_time <= 32:
                probs = group_probs['group13']
            elif dep_time <= 36:
                probs = group_probs['group14']
            elif dep_time <= 60:
                probs = group_probs['group15']
            else:
                probs = group_probs['group16']
        else: # purpose != 1
            if dep_time <= 32:
                probs = group_probs['group17']
            elif dep_time <= 36:
                probs = group_probs['group18']
            elif dep_time <= 60:
                probs = group_probs['group19']
            else:
                probs = group_probs['group20']
            
    # Randomly choose a 'next_purpose' value from the probability distribution
    return np.random.choice(probs.index, p=probs.values)

In [31]:
# Sort the data
df_test = df_test.sort_values(['id', 'dep_time'])

# Shift the purpose column to get the next activity type
df_test['next_purpose'] = df_test.groupby('id')['purpose'].shift(-1)

# Fill NaN values with a special category indicating no further activity
df_test['next_purpose'].fillna('None', inplace=True)
df_test['next_purpose'] = df_test['next_purpose'].apply(lambda x: int(x) if x != 'None' else x)

In [32]:
df_next['predicted_next_purpose'] = df_next.apply(lambda row: predict_next_purpose(row['dep_time'], row['purpose'], row['employment'], row['X_student']), axis=1)
df_next.to_csv('df_train_ATM.csv',index = False)

In [33]:
df_test['predicted_next_purpose'] = df_test.apply(lambda row: predict_next_purpose(row['dep_time'], row['purpose'], row['employment'], row['X_student']), axis=1)
df_test.to_csv('df_test_ATM.csv',index = False)

In [34]:
import dill

# 保存你的函数和group_probs到一个字典
model_data = {'model': predict_next_purpose, 'group_probs': group_probs}

# 把字典保存到一个dill文件
with open('ATM.pkl', 'wb') as f:
    dill.dump(model_data, f)

# Test

In [35]:
acc = len(df_next[df_next['predicted_next_purpose'] == df_next['next_purpose']])/len(df_next)
print("overall_accuracy:",acc)
df1 = df_next[(df_next['next_purpose'] == 1)|(df_next['next_purpose'] == 2)|(df_next['next_purpose'] == 3)|(df_next['next_purpose'] == 'None')]
acc_man = len(df1[df1['predicted_next_purpose'] == df1['next_purpose']])/len(df1)
print("\nman_accuracy(including non):",acc_man)
df2 = df_next[(df_next['next_purpose'] != 1)&(df_next['next_purpose'] != 2)&(df_next['next_purpose'] != 3)&(df_next['next_purpose'] != 'None')]
acc_non_man = len(df2[df2['predicted_next_purpose'] == df2['next_purpose']])/len(df2)
print("\nnon_man_accuracy:",acc_non_man)

overall_accuracy: 0.4185663422190368

man_accuracy(including non): 0.577396421924515

non_man_accuracy: 0.06595942676344686


In [36]:
acc = len(df_test[df_test['predicted_next_purpose'] == df_test['next_purpose']])/len(df_test)
print("overall_accuracy:",acc)
df1 = df_test[(df_test['next_purpose'] == 1)|(df_test['next_purpose'] == 2)|(df_test['next_purpose'] == 3)|(df_test['next_purpose'] == 'None')]
acc_man = len(df1[df1['predicted_next_purpose'] == df1['next_purpose']])/len(df1)
print("\nman_accuracy(including non):",acc_man)
df2 = df_test[(df_test['next_purpose'] != 1)&(df_test['next_purpose'] != 2)&(df_test['next_purpose'] != 3)&(df_test['next_purpose'] != 'None')]
acc_non_man = len(df2[df2['predicted_next_purpose'] == df2['next_purpose']])/len(df2)
print("\nnon_man_accuracy:",acc_non_man)

overall_accuracy: 0.4152867715078631

man_accuracy(including non): 0.5719874281128795

non_man_accuracy: 0.06375637563756376
