# Initial data exploration
Data is available in a `CSV` file. In this module we:

* Show how to load the `src` module, which contains additional functions for data manipulation
* Load the data from the file, add header and parse date-time column
* Show timestamp inconsistencies of the data
* Analyse every time series in the data and classify whether it follows a Gaussian distribution (relevant for some anomaly detection methods)

In [None]:
# imports
# main imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# skewness and kurtosis
from scipy.stats import skew, skewtest, kurtosis

In [None]:
# install python package from the src/ folder (then you need to restart the kernel!)
# not currently used in this file, but some functions should be copied to src/
!cd ../.. & python setup.py build & python setup.py install

In [None]:
# import the current data module (from src/)
import src

In [None]:
# define data filename
# (a relative path, so it is resolved against the notebook's working directory)
FILENAME = "../../../data/raw/continental/continental_preliminary.csv"

In [None]:
# import data
# The column names are assigned explicitly and the Timestamp column is parsed
# into datetimes while loading.
# NOTE(review): header=0 together with names=[...] tells pandas that the first
# line of the file is a header row and replaces it with the names below. If the
# CSV actually has no header row, its first data record is silently dropped;
# confirm against the file (header=None would keep that record).
df = pd.read_csv(FILENAME, header=0,
    names=[
        "Id",
        "Timestamp",
        "SerialNumber",
        "Station",
        "StationType",
        "StationNumber",
        "Material",
        "TextDescription",
        "TestValue",
        "TestResult",
        "USL",
        "LSL",
        "Format"
    ],
    parse_dates=["Timestamp"]
)

In [None]:
# check some basic data on the dataframe
# (summary statistics as a first sanity check of the loaded values)
df.describe()

In [None]:
# there seems to be something wrong with the times
# there should only be data for a single day; the timestamps tell another story
# (list the distinct calendar dates that occur in the Timestamp column)
df["Timestamp"].dt.date.unique()

In [None]:
# we have 2 station types in the data
# we have 19 stations
# in all these stations we keep 9932 sensors
# some stations have a couple of tests, final ST station has 5000+ checks
# A notebook cell only echoes its last expression, so the number of distinct
# checks is printed explicitly; as a bare expression it was computed and then
# silently discarded.
print("Distinct checks:", df["TextDescription"].unique().shape[0])
df["Station"].unique()

In [None]:
# quick look at the timestamps as unix seconds (datetime64[ns] -> int64
# nanoseconds -> whole seconds); something is wrong here
screwing_df = df.loc[df["Station"] == "SCREWING"]
torque1 = screwing_df.loc[screwing_df["TextDescription"] == "Torque 1 value"]
unix_seconds = torque1["Timestamp"].values.astype(np.int64) // 10 ** 9
plt.plot(unix_seconds)

In [None]:
# plot a nice figure of data:
# the "Torque 1 value" measurements together with their USL and LSL columns
# (presumably the upper/lower specification limits; confirm with the data owner)
plt.figure()
plt.plot(torque1["TestValue"].values)
plt.plot(torque1["USL"].values)
plt.plot(torque1["LSL"].values)

In [None]:
def draw_histogram(values, title):
    """
    Draw a 12-bin histogram of ``values`` on a new figure and add a title.

    Args:
        values: 1-D numeric array-like (NumPy array, pandas Series, list).
        title: Text used as the title of the plot.

    Returns:
        None. The histogram is drawn on a freshly created figure.
    """
    values = np.asarray(values)
    lowest = values.min()
    highest = values.max()

    if highest > lowest:
        # 13 evenly spaced edges give 12 bins whose last edge is the maximum.
        # np.arange(lowest, highest, step) works on a half-open interval, so
        # the top edge was left out and the largest values were not counted.
        bins = np.linspace(lowest, highest, 13)
    else:
        # Constant data would give np.arange a zero step; let matplotlib
        # choose the edges for 12 bins instead.
        bins = 12

    _, ax = plt.subplots(1, 1)
    ax.set_title(title)
    ax.hist(values, bins=bins)

In [None]:
def is_normal_distribution(values):
    """
    Estimate whether an array of values looks normally distributed.

    The checks are applied in this order:
    - fewer than 8 samples: classified as non-normal (scipy's skewness test
      needs at least 8 observations)
    - all samples identical: classified as normal without running any test
      (the statistics are undefined for zero variance)
    - p-value of the skewness test below 0.05: non-normal
    - absolute excess kurtosis above 0.5: non-normal
    - p-value or kurtosis is NaN (for example NaN in the data): non-normal,
      because normality cannot be established

    Args:
        values: 1-D numeric array-like.

    Returns:
        bool: True if the values pass all checks, False otherwise.
    """
    values = np.asarray(values)

    # do we have enough data?
    if values.shape[0] < 8:
        return False

    # Constant data is handled before the statistics are computed, which
    # avoids evaluating skewness/kurtosis on zero-variance input.
    if values.min() == values.max():
        return True

    pvalue = skewtest(values).pvalue
    kurt = kurtosis(values)

    # Written as positive conditions on purpose: a comparison with NaN is
    # always False, so NaN statistics yield "not normal" instead of slipping
    # through a chain of `<` / `>` rejections.
    return bool(pvalue >= 0.05 and np.abs(kurt) <= 0.5)

In [None]:
def show_histogram_if_not_normal(values, title="Not normal distribution"):
    """
    Show a histogram of the data only if the distribution is not normal (not used).

    Args:
        values: 1-D numeric array-like to classify and, if needed, plot.
        title: Title passed on to draw_histogram. It has a default so that
            existing one-argument calls keep working.
    """
    if not is_normal_distribution(values):
        # draw_histogram requires a title; calling it with values alone
        # raised a TypeError.
        draw_histogram(values, title)

In [None]:
def transverse_timeseries(df, max_checks=10000):
    """
    Traverse all the sensors in the data frame and classify the distribution of
    each one (either normal or non-normal).

    Every (Station, TextDescription) pair is treated as one time series, and its
    TestValue column is passed to is_normal_distribution. The result is printed
    and appended to one of 2 log files in the current working directory:
    normal.log or not_normal.log. Stations and checks are visited in order of
    first appearance. Rows whose Station or TextDescription is missing are
    skipped.

    Histograms can optionally be drawn for the non-normal series (see the
    commented-out call below); restrict the traversal first in that case, as
    there can be thousands of different sensors.

    Args:
        df: DataFrame with "Station", "TextDescription" and "TestValue" columns.
        max_checks: Stations with more distinct checks than this are skipped.

    Returns:
        None.
    """
    # Both logs are opened once for the whole run instead of once per check.
    with open("not_normal.log", "a") as not_normal_log, \
            open("normal.log", "a") as normal_log:
        # groupby splits the frame in a single pass; sort=False keeps the
        # order of first appearance, like unique() did.
        for station, station_df in df.groupby("Station", sort=False):
            print("Checking:", station)
            n_checks = station_df["TextDescription"].unique().shape[0]
            if n_checks > max_checks:
                print("Skipping; too many tests: ", n_checks)
                continue
            for check, check_df in station_df.groupby("TextDescription", sort=False):
                # extract values
                values = check_df["TestValue"].values
                if is_normal_distribution(values):
                    label, log_file = "Normal distribution", normal_log
                else:
                    label, log_file = "Not normal distribution", not_normal_log
                    # draw_histogram(values, "{} - {}".format(station, check))
                # format() also copes with non-string station/check labels,
                # where string concatenation would raise a TypeError.
                log_file.write("{}: {} - {}\n".format(label, station, check))
                print(label + ":", check)

In [None]:
# classify every time series in the data; the results are printed and appended
# to normal.log and not_normal.log in the current working directory
transverse_timeseries(df)