Visualization of the train/test split

### Importing libraries

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.ticker import MultipleLocator

from aux_functions import split_data

### Processing the file

In [None]:
# Load the preprocessed BPI Challenge 2012 event log; the cells below read its
# 'time:timestamp' and 'case:concept:name' columns.
df = pd.read_csv('data/preprocessed/BPI_Challenge_2012.csv')

In [None]:
# Parse 'time:timestamp' into datetime values. Two layouts are tried, one with
# and one without a fractional-seconds part, and every row keeps whichever of
# the two parsed. errors='coerce' maps a non-matching value to NaT, so a row
# that matches neither layout ends up as NaT.
raw_timestamps = df['time:timestamp']
with_nanosec = pd.to_datetime(
    raw_timestamps,
    format='%Y-%m-%d %H:%M:%S.%f%z',
    errors='coerce',
)
without_nanosec = pd.to_datetime(
    raw_timestamps,
    format='%Y-%m-%d %H:%M:%S%z',
    errors='coerce',
)
df['time:timestamp'] = with_nanosec.fillna(without_nanosec)

In [None]:
# Format the timestamps as 'YYYY-MM-DD HH:MM:SS' strings (fractional seconds and
# UTC offset are dropped) so the labels used for plotting stay short.
# From here on the column holds strings, not datetime objects.
df['time:timestamp'] = df['time:timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')

In [None]:
# Split the event log into a train and a test subset with the project helper.
# NOTE(review): 0.80 is presumably the train fraction, and the titles of the
# plots below suggest split_data also drops the cases that overlap the split
# boundary -- confirm against aux_functions.split_data.
train_subset, test_subset = split_data(df, 0.80)

### Scatter Plot

In [None]:
# The first row of the test set defines the split boundary that the plots
# below draw as red dashed lines.

# Timestamp of the first test row (x position of the vertical line)
x_seperation = test_subset['time:timestamp'].iat[0]

# Case name of the first test row (y position of the horizontal line)
y_seperation = test_subset['case:concept:name'].iat[0]

# Echo both boundary values as the cell output
(x_seperation, y_seperation)

In [None]:
# Plotting the Dataset Before Removing the Overlapping Cases
#
# Scatter plot of every event in df: timestamp on the x axis, case name on the
# y axis. The red dashed lines mark the first timestamp and the first case of
# the test set (x_seperation / y_seperation).

plt.figure(figsize=(24, 16))
ax = plt.gca()
# Major tick spacing, expressed in axis units. 'time:timestamp' was formatted
# as strings in an earlier cell, so the x axis is presumably categorical and
# 20000 means "every 20000th distinct timestamp" -- TODO confirm.
ax.yaxis.set_major_locator(MultipleLocator(3000))
ax.xaxis.set_major_locator(MultipleLocator(20000))

plt.xlabel('Time', fontsize=20)
plt.ylabel('Case Concept Name', fontsize=20)
plt.title('The Dataset Before Removing the Overlapping Cases', fontsize=24)
plt.xticks(rotation=45)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.tick_params(axis='both', which='minor', labelsize=15)

# Each point is coloured by the percentile rank of its timestamp
plt.scatter(
    df['time:timestamp'],
    df['case:concept:name'],
    c=df['time:timestamp'].rank(pct=True),
    cmap='winter',
    label='All',
    alpha=0.1,
)
plt.axvline(x=x_seperation, color='r', linestyle='--', linewidth=3)
plt.axhline(y=y_seperation, color='r', linestyle='--', linewidth=3)
ax.invert_yaxis()
plt.tight_layout()

# Saving the plot. The output folder is created first so that a checkout
# without 'figs/' does not make savefig raise FileNotFoundError.
# bbox_inches='tight' matches how the other split figure of this notebook
# ('figs/removed_overlapping_split.png') is saved.
os.makedirs('figs', exist_ok=True)
plt.savefig('figs/overlapping_split.png', bbox_inches='tight')

plt.show()

In [None]:
# Concatenate the train and test sets row-wise (train rows first) so that both
# can be drawn in a single scatter plot below
train_test_concat = pd.concat([train_subset, test_subset])

In [None]:
# Plotting the Train and Test Sets After Removing the Overlapping Cases
#
# Scatter plot of every event in train_test_concat (the train and test subsets
# stacked together): timestamp on the x axis, case name on the y axis. The red
# dashed lines mark the first timestamp and the first case of the test set
# (x_seperation / y_seperation).

plt.figure(figsize=(24, 16))
ax = plt.gca()
# Major tick spacing, expressed in axis units. 'time:timestamp' was formatted
# as strings in an earlier cell, so the x axis is presumably categorical and
# 20000 means "every 20000th distinct timestamp" -- TODO confirm.
ax.yaxis.set_major_locator(MultipleLocator(3000))
ax.xaxis.set_major_locator(MultipleLocator(20000))

# Each point is coloured by the percentile rank of its timestamp
plt.scatter(
    train_test_concat['time:timestamp'],
    train_test_concat['case:concept:name'],
    c=train_test_concat['time:timestamp'].rank(pct=True),
    cmap='winter',
    label='All',
    alpha=0.1,
)

plt.axvline(x=x_seperation, color='r', linestyle='--', linewidth=3)
plt.axhline(y=y_seperation, color='r', linestyle='--', linewidth=3)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.tick_params(axis='both', which='minor', labelsize=15)
plt.xlabel('Time', fontsize=20)
plt.ylabel('Case Concept Name', fontsize=20)
plt.title('The Train and Test Sets After Removing the Overlapping Cases', fontsize=24)
plt.xticks(rotation=45)
ax.invert_yaxis()
plt.tight_layout()

# Saving the plot. The output folder is created first so that a checkout
# without 'figs/' does not make savefig raise FileNotFoundError.
os.makedirs('figs', exist_ok=True)
plt.savefig('figs/removed_overlapping_split.png', bbox_inches='tight')

plt.show()