# Data Exploration

In [None]:
import logging

# Start from a clean root logger: detach any handlers that were installed
# earlier so basicConfig() below can apply its own configuration.
root_logger = logging.getLogger()
for stale_handler in list(root_logger.handlers):
    root_logger.removeHandler(stale_handler)
logging.basicConfig(level=logging.INFO)

# Notebook-wide logger.
logger = logging.getLogger('baseline')

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from scrabu.commons import json_to_df
from scrabu.commons import files_to_df
from scrabu.commons import count_missing_values
from scrabu.commons import fill_missing_values
from scrabu.commons import create_target_columns
from scrabu.commons import remove_unused_columns

# Disable warnings
pd.set_option('mode.chained_assignment', None)

### Load the data into a dataframe

In [None]:
# Directory handed to files_to_df, relative to this notebook.
DATA_DIR = '../../data'
df = files_to_df(path=DATA_DIR)

### Are there any missing values?

In [None]:
# Summarise missing values in the freshly loaded frame
# (helper from scrabu.commons; its exact output format is not visible here).
count_missing_values(df)

Column "ort" has many missing values. We fill them with "Unbekannt"

In [None]:
# Replace missing values via the scrabu.commons helper.
# NOTE(review): per the notebook text this fills "ort" with "Unbekannt" — confirm in the helper.
df = fill_missing_values(df)

Check again if there are any missing values

In [None]:
# Re-run the missing-value summary after filling to verify the result.
count_missing_values(df)

### Data Stats

In [None]:
# (number of rows, number of columns)
df.shape

Number of unique shipment numbers

In [None]:
# Distinct shipment numbers; dropna=False keeps the count identical to len(unique()).
df["shipment_number"].nunique(dropna=False)

### Investigate Data Types

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

Everything regarding data types seems to be fine! No need for conversion

### Prepare data for exploration and machine learning

- Given the features "datum", "status" and "ort" we want to predict the next "datum" and hopefully also the next "status" and the next "ort"
- We create 3 new columns. "datum_next", "ort_next" and "status_next".

In [None]:
# Add the prediction targets via the scrabu.commons helper.
# NOTE(review): per the notebook text these are "datum_next", "ort_next" and "status_next" — confirm in the helper.
df = create_target_columns(df)

In [None]:
# Preview the first rows after the target columns were added.
df.head()

In [None]:
# Re-check dtypes and non-null counts now that create_target_columns has run.
df.info()

### Data Stats

In [None]:
# Missing-value summary after create_target_columns has run.
count_missing_values(df)

In [None]:
# (number of rows, number of columns) after create_target_columns has run.
df.shape

Number of unique shipment numbers

In [None]:
# Distinct shipment numbers; dropna=False keeps the count identical to len(unique()).
df["shipment_number"].nunique(dropna=False)

In [None]:
# Distinct status values (NaN, if present, counts as one value).
df["status"].nunique(dropna=False)

In [None]:
# Distinct locations (NaN, if present, counts as one value).
df["ort"].nunique(dropna=False)

### How many different final delivery states exist?

In [None]:
# Last row of every shipment, in original row order.
last_event_per_shipment = df.groupby('shipment_number').tail(1)
# Distinct statuses seen on those last rows.
final_delivery_states = last_event_per_shipment["status"].unique()
print(final_delivery_states)

### How many different initial delivery states exist?

In [None]:
# First row of every shipment, in original row order.
first_event_per_shipment = df.groupby('shipment_number').head(1)
# Distinct statuses seen on those first rows.
initial_delivery_states = first_event_per_shipment["status"].unique()
print(initial_delivery_states)

### Distribution of final delivery states

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.xticks(rotation='vertical')

df[df["status"].isin(final_delivery_states)]["status"].hist(figsize=(15,5))

### Distribution of initial states

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.xticks(rotation='vertical')

df[df["status"].isin(initial_delivery_states)]["status"].hist(figsize=(10,5))

### Distribution of all states

These are the most common states

In [None]:
# Ten most frequent statuses (value_counts sorts by descending frequency).
df["status"].value_counts().to_frame().head(10)

These are the least common states. Several of them are essentially the same state and follow this pattern:

"Die Sendung liegt in der ........ zur Abholung bereit"

Only the address differs each time. This produces too many distinct states, which also makes machine learning harder. We therefore ignore the address part and convert all of these states into a single one.

In [None]:
# Ten least frequent statuses (tail of the descending value_counts table).
df["status"].value_counts().to_frame().tail(10)

In [None]:
# Ten most frequent statuses (value_counts sorts by descending frequency).
df["status"].value_counts().to_frame().head(10)

In [None]:
# Ten least frequent statuses (tail of the descending value_counts table).
df["status"].value_counts().to_frame().tail(10)

In [None]:
# Distinct values in "status" (NaN, if present, counts as one value).
df["status"].nunique(dropna=False)

In [None]:
# Distinct values in "status_next" (NaN, if present, counts as one value).
df["status_next"].nunique(dropna=False)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.xticks(rotation='vertical')
df["status"].hist(figsize=(15,5))

We introduce two new states in `status_next`: `SuccessDelivery` and `FailedDelivery`.

### Distribution of the locations

In [None]:
# Distinct locations (NaN, if present, counts as one value).
df["ort"].nunique(dropna=False)

In [None]:
# Five most frequent locations.
df["ort"].value_counts().to_frame().head(5)

In [None]:
# Five least frequent locations.
df["ort"].value_counts().to_frame().tail(5)

Note: one idea here would be to merge some locations.

### Remove unused columns

In [None]:
# Drop columns that are not needed further on
# (which columns is decided inside the scrabu.commons helper — not visible here).
df = remove_unused_columns(df)

In [None]:
# Preview the remaining columns.
df.head()

### Distribution of event times

In [None]:
# Number of events per weekday.
# pandas encodes dayofweek as Monday=0 ... Sunday=6; label the bars accordingly
# so the chart can be read without knowing that encoding.
WEEKDAY_LABELS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

counts = (
    df['datum'].dt.dayofweek
      .value_counts(sort=False)
      # value_counts omits weekdays with no events; show them as zero-height bars.
      .reindex(range(7), fill_value=0)
)

fig, ax = plt.subplots()
ax.bar(counts.index, counts)
ax.set_xticks(range(7))
ax.set_xticklabels(WEEKDAY_LABELS)
ax.set_xlabel("day of week")
ax.set_ylabel("number of events")
plt.show()

The plot above shows how the events are distributed across the days of the week. As expected, Sunday, for example, has the fewest events.

In [None]:
# Number of events per hour of day (0-23).
counts = (
    df['datum'].dt.hour
      .value_counts(sort=False)
      # value_counts omits hours with no events; show them as zero-height bars.
      .reindex(range(24), fill_value=0)
)

fig, ax = plt.subplots()
ax.bar(counts.index, counts)
ax.set_xticks(range(0, 24, 2))
ax.set_xlabel("hour of day")
ax.set_ylabel("number of events")
plt.show()

In the above plot we see the distribution of the events over 24 hours of a day

### Time diff between events

In [None]:
from scrabu.commons import add_time_diff_column
# Presumably adds the "datum_diff" column that the following cells read —
# confirm the exact definition in scrabu.commons.
df = add_time_diff_column(df)

Note that `datum_diff` is not meaningful for rows with `is_final_event = 1`. Do not use it for those rows in the analysis.

In [None]:
# Histogram of datum_diff in hours, non-final events only.
# astype('timedelta64[h]') is rejected by pandas >= 2.0 (only s/ms/us/ns are
# supported resolutions), so convert through total_seconds() instead. This
# yields fractional hours rather than whole hours.
non_final_diff_hours = (
    df.loc[df["is_final_event"] == 0, "datum_diff"].dt.total_seconds() / 3600
)
ax = non_final_diff_hours.plot.hist()
ax.set_xlabel("datum_diff [h]");

We observe two interesting things here. There are events with zero time diff and there are events with very big time diffs.

### Events with very small time diffs

In [None]:
# Non-final events whose datum_diff is zero at one-second resolution.
# The old astype('timedelta64[s]') == 0 relied on pandas < 2.0 returning float
# seconds; newer pandas keeps a timedelta dtype, so the integer comparison no
# longer does what was intended. Compare against a Timedelta instead;
# |diff| < 1 s matches the old truncation to whole seconds.
is_non_final = df["is_final_event"] == 0.0
is_zero_diff = df["datum_diff"].abs() < pd.Timedelta(seconds=1)
df[is_non_final & is_zero_diff]

### Events with huge time diffs

In [None]:
# Non-final events with a datum_diff of more than 150 hours.
# astype('timedelta64[h]') is rejected by pandas >= 2.0, so compare the
# timedelta column directly against a Timedelta threshold. This is an exact
# comparison, not one on hours truncated to whole numbers.
HUGE_DIFF_THRESHOLD = pd.Timedelta(hours=150)
is_non_final = df["is_final_event"] == 0.0
is_huge_diff = df["datum_diff"] > HUGE_DIFF_THRESHOLD
df[is_non_final & is_huge_diff]

### Distribution of time diff for various states

In [None]:
from scrabu.commons import add_time_diff_in_hours
# Presumably adds the "datum_diff_h" column that the following cells read —
# confirm the exact definition in scrabu.commons.
df = add_time_diff_in_hours(df)

In [None]:
# Non-final events only, reduced to the columns needed for the per-status plots.
# reset_index() returns a new frame; the old code discarded that result and then
# rebuilt the index by hand. reset_index(drop=True) does both in one step and
# leaves the same 0..n-1 index.
df_temp = (
    df.loc[df["is_final_event"] == 0, ["datum_diff_h", "status"]]
      .reset_index(drop=True)
)
df_temp.head()

In [None]:
%matplotlib inline
df_temp["datum_diff_h"].hist(by=df_temp['status'], layout=(len(df_temp["status"].unique()), 1), figsize=(10,80))
plt.tight_layout()
df.drop(columns=["datum_diff_h"], inplace=True)

In [None]:
# Preview df after the temporary "datum_diff_h" column was dropped again.
df.head()

### Distribution of time diff for various locations

In [None]:
# Time difference in hours as a float column.
# astype('timedelta64[h]') is rejected by pandas >= 2.0 (only s/ms/us/ns are
# supported resolutions), so convert through total_seconds(). This yields
# fractional hours rather than whole hours.
df["datum_diff_h"] = df["datum_diff"].dt.total_seconds() / 3600

In [None]:
# Non-final events only, reduced to the columns needed for the per-location plots.
# reset_index() returns a new frame; the old code discarded that result and then
# rebuilt the index by hand. reset_index(drop=True) does both in one step and
# leaves the same 0..n-1 index.
df_temp = (
    df.loc[df["is_final_event"] == 0, ["datum_diff_h", "status", "ort"]]
      .reset_index(drop=True)
)
df_temp.head()

In [None]:
%matplotlib inline
df_temp["datum_diff_h"].hist(by=df_temp['ort'], layout=(len(df_temp["ort"].unique()), 1), figsize=(10,140))
plt.tight_layout()
df.drop(columns=["datum_diff_h"], inplace=True)

### Distribution of time diff for various locations and states

In [None]:
# Time difference in hours as a float column.
# astype('timedelta64[h]') is rejected by pandas >= 2.0 (only s/ms/us/ns are
# supported resolutions), so convert through total_seconds(). This yields
# fractional hours rather than whole hours.
df["datum_diff_h"] = df["datum_diff"].dt.total_seconds() / 3600

In [None]:
# Non-final events only, reduced to the columns needed for the
# per-(status, location) plots.
# reset_index() returns a new frame; the old code discarded that result and then
# rebuilt the index by hand. reset_index(drop=True) does both in one step and
# leaves the same 0..n-1 index.
df_temp = (
    df.loc[df["is_final_event"] == 0, ["datum_diff_h", "status", "ort"]]
      .reset_index(drop=True)
)
df_temp.head()

In [None]:
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline


grouped = df_temp.groupby(["status", "ort"])

for name, group in grouped:
    fig = plt.figure()
    fig.suptitle(str(name) + ": #{}".format(len(group)))
    matplotlib.pyplot.hist(group.datum_diff_h)
    plt.show()
    
df.drop(columns=["datum_diff_h"], inplace=True)

In [None]:
# Preview df after the temporary "datum_diff_h" column was dropped again.
df.head()