# Data Science

- Student: Dmytro Geleshko
- Group: IP-91
- Var: 6

## Imports

In [129]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## Config

In [119]:
# Variant 6: the variant number is also used as the seed for NumPy's global RNG.
np.random.seed(6)

# pandas display: never truncate columns; floats get thousands separators
# and three decimals.
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", "{:,.3f}".format)

# Default size (inches) for every matplotlib figure in the notebook.
plt.rcParams["figure.figsize"] = (20, 8)

## Functions

In [120]:
def lsm_create_f(size, pol_pow):
    """
    Creating F matrix (design matrix of the polynomial model).

    Row k holds the powers of the sample index k:
    1, k, k^2, ..., k^pol_pow    for k = 0 .. size - 1

    size - number of samples (rows of F),
    pol_pow - polynomial power of model (F has pol_pow + 1 columns)

    Returns a float64 array of shape (size, pol_pow + 1).
    """

    # Float base on purpose: raising an integer arange to a power wraps around
    # silently once the result leaves the integer range (e.g. 100**10 in int64).
    x = np.arange(size, dtype=np.float64)
    F = np.ones((size, pol_pow + 1))
    for i in range(1, pol_pow + 1):
        # column i = x^i
        F[:, i] = np.power(x, i)
    return F


def lsm_coef(y, F):
    """
    Returns matrix of coef. C: the column vector of shape (F.shape[1], 1)
    that minimises ||F.C - Y||^2.

    y - 1d array-like of observed values, one per row of F,
    F - design matrix of the model (see lsm_create_f)
    """

    # 1-d values to column vector; asarray also accepts lists and pandas Series
    Y = np.asarray(y, dtype=np.float64).reshape(-1, 1)
    # Solve the least-squares problem directly instead of evaluating
    # inv(F^T F) F^T Y: forming F^T F squares the condition number, which
    # costs accuracy quickly as the polynomial power grows.
    C, _residuals, _rank, _singular_values = np.linalg.lstsq(F, Y, rcond=None)
    # return coef
    return C


def lsm(y, pol_pow):
    """
    Least-squares polynomial smoothing.

    y - 1d array of values,
    pol_pow - polynomial power of model

    Returns the fitted model values as a 1d array, one per element of y.
    """

    design = lsm_create_f(len(y), pol_pow)
    # (n, 1) column of fitted values: design matrix times the coefficients
    fitted_column = design @ lsm_coef(y, design)
    # flatten the column vector back to 1d
    return fitted_column.ravel()

In [121]:
def find_anomalies(data, sliding_window_size, lsm_pow, std3):
    """
    Returns indexes of anomalies.

    A polynomial of power lsm_pow is fitted to every window of
    sliding_window_size consecutive values and extrapolated one step ahead.
    The value right after the window is reported as an anomaly when it
    differs from that prediction by more than std3.

    data - 1d array-like of values,
    sliding_window_size - number of values in each fitting window,
    lsm_pow - polynomial power of the window model,
    std3 - largest accepted |real - predicted| (intended to be 3*std)

    Returns a list of positions in data, in increasing order.
    """
    values = np.asarray(data)

    # The design matrix depends only on the window length and the power, so it
    # is built once. Its first sliding_window_size rows fit the window and the
    # extra last row evaluates the model at the next position.
    F_extended = lsm_create_f(sliding_window_size + 1, lsm_pow)
    F_window = F_extended[:sliding_window_size]
    next_row = F_extended[sliding_window_size]

    anomaly_index = []
    for start in range(len(values) - sliding_window_size):
        target = start + sliding_window_size
        # finding C matrix for current window
        coef = lsm_coef(values[start:target], F_window)
        # prediction of next value
        y_hat = np.dot(next_row, coef).reshape(-1)[-1]
        # if difference between predicted and real value is more than std3 => anomaly
        if abs(values[target] - y_hat) > std3:
            anomaly_index.append(target)
    return anomaly_index

In [122]:
def mse(data1, data2):
    """
    Mean squared error: mean of the squared element-wise difference
    between data1 and data2.
    """
    residual = data1 - data2
    return np.square(residual).mean()

# Data

## Load And Format Data

In [123]:
# The first two sheet columns form the row MultiIndex; "," is the thousands separator.
df = pd.read_excel("Data_Set_6.xlsx", index_col=[0, 1], thousands=",")

# Replace the missing-value markers with NaN. The marker strings are matched
# verbatim (including the "avilable" spelling). np.nan is used because the
# np.NaN alias was removed in NumPy 2.0.
df = (
    df
    .replace("not avilable", np.nan)
    .replace("n.a.", np.nan)
    .replace(-1, np.nan)
)

# lower the names of columns and indexes
df.columns = [name.lower() for name in df.columns]
df.index = df.index.set_names([name.lower() for name in df.index.names])

# sort rows by the index (sort_index uses the whole MultiIndex)
df = df.sort_index()
df

Unnamed: 0_level_0,Unnamed: 1_level_0,january,february,march,april,may,june,july,august,september,october,november,december
sales_id,sales_by_region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,AUH,3469.000,,,3642.000,5803,5662.000,1896.000,2293.000,2583,5233.000,4421,4071.000
1,SHJ,5840.000,5270.000,4114.000,5605.000,4387,5026.000,4055.000,2782.000,4578,4993.000,2859,4853.000
1,UAQ,2967.000,2425.000,5353.000,3547.000,5027,4078.000,3858.000,1927.000,3527,4179.000,1571,5551.000
2,AUH,1328.000,4264.000,1574.000,2343.000,3826,4932.000,1710.000,3221.000,3381,1313.000,1765,1214.000
3,AUH,1722.000,956.000,1297.000,1984.000,2744,5793.000,2261.000,5607.000,2437,4328.000,3317,5390.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29,UAQ,5325.000,1905.000,5049.000,1311.000,4146,1706.000,1689.000,3190.000,2915,2183.000,3301,4365.000
30,FUJ,3402.000,5283.000,2229.000,3758.000,1427,1057.000,5277.000,5231.000,3909,4345.000,5287,2638.000
30,FUJ,5549.000,1302.000,1929.000,2822.000,5379,1243.000,3075.000,4358.000,5106,2322.000,2409,1069.000
30,AJM,2832.000,5978.000,1684.000,1550.000,1194,3737.000,5779.000,4441.000,1213,3711.000,5384,1293.000


In [124]:
# Column dtypes and non-null counts: shows which month columns contain gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 98 entries, (1, 'AUH') to (30, 'AJM')
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   january    92 non-null     float64
 1   february   96 non-null     float64
 2   march      95 non-null     float64
 3   april      97 non-null     float64
 4   may        98 non-null     int64  
 5   june       95 non-null     float64
 6   july       97 non-null     float64
 7   august     96 non-null     float64
 8   september  98 non-null     int64  
 9   october    97 non-null     float64
 10  november   98 non-null     int64  
 11  december   96 non-null     float64
dtypes: float64(9), int64(3)
memory usage: 11.1+ KB


## Fill NaN

In [125]:
# Fill gaps along each row, i.e. from month to month: a missing month takes the
# previous month's value, and gaps at the start of a row fall back to the next
# known month.
df = (
    df
    .ffill(axis="columns")
    .bfill(axis="columns")
)
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 98 entries, (1, 'AUH') to (30, 'AJM')
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   january    98 non-null     float64
 1   february   98 non-null     float64
 2   march      98 non-null     float64
 3   april      98 non-null     float64
 4   may        98 non-null     float64
 5   june       98 non-null     float64
 6   july       98 non-null     float64
 7   august     98 non-null     float64
 8   september  98 non-null     float64
 9   october    98 non-null     float64
 10  november   98 non-null     float64
 11  december   98 non-null     float64
dtypes: float64(12)
memory usage: 11.1+ KB


## Cast to np.float32

In [127]:
# Convert every column to float32 in one step with DataFrame.astype, instead of
# round-tripping through .values and a multi-column assignment.
df = df.astype(np.float32)
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 98 entries, (1, 'AUH') to (30, 'AJM')
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   january    98 non-null     float32
 1   february   98 non-null     float32
 2   march      98 non-null     float32
 3   april      98 non-null     float32
 4   may        98 non-null     float32
 5   june       98 non-null     float32
 6   july       98 non-null     float32
 7   august     98 non-null     float32
 8   september  98 non-null     float32
 9   october    98 non-null     float32
 10  november   98 non-null     float32
 11  december   98 non-null     float32
dtypes: float32(12)
memory usage: 6.5+ KB


In [128]:
# Display the frame after filling the gaps and casting to float32.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,january,february,march,april,may,june,july,august,september,october,november,december
sales_id,sales_by_region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,AUH,3469.000,3469.000,3469.000,3642.000,5803.000,5662.000,1896.000,2293.000,2583.000,5233.000,4421.000,4071.000
1,SHJ,5840.000,5270.000,4114.000,5605.000,4387.000,5026.000,4055.000,2782.000,4578.000,4993.000,2859.000,4853.000
1,UAQ,2967.000,2425.000,5353.000,3547.000,5027.000,4078.000,3858.000,1927.000,3527.000,4179.000,1571.000,5551.000
2,AUH,1328.000,4264.000,1574.000,2343.000,3826.000,4932.000,1710.000,3221.000,3381.000,1313.000,1765.000,1214.000
3,AUH,1722.000,956.000,1297.000,1984.000,2744.000,5793.000,2261.000,5607.000,2437.000,4328.000,3317.000,5390.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29,UAQ,5325.000,1905.000,5049.000,1311.000,4146.000,1706.000,1689.000,3190.000,2915.000,2183.000,3301.000,4365.000
30,FUJ,3402.000,5283.000,2229.000,3758.000,1427.000,1057.000,5277.000,5231.000,3909.000,4345.000,5287.000,2638.000
30,FUJ,5549.000,1302.000,1929.000,2822.000,5379.000,1243.000,3075.000,4358.000,5106.000,2322.000,2409.000,1069.000
30,AJM,2832.000,5978.000,1684.000,1550.000,1194.000,3737.000,5779.000,4441.000,1213.000,3711.000,5384.000,1293.000


## Finding optimal polynomial power for smoothing with lsm

# MSE between the series and its LSM fit for every tested polynomial power.
# NOTE(review): `data` and `PARAM_max_test_pow` are not defined in the visible
# cells; they are assumed to come from another part of the notebook.
signal = data["quadratic_normal_anomalies_fix"]
# range() excludes its stop value, so the powers tested are 1 .. PARAM_max_test_pow - 1
pow_range = range(1, PARAM_max_test_pow)
# use the notebook's mse() helper rather than repeating its formula inline
errs = [mse(signal, lsm(signal, power)) for power in pow_range]

plt.title("MSE")
plt.xlabel("Polynomial power")
plt.plot(pow_range, errs)
plt.show()