In [132]:
from decimal import Decimal
import pandas as pd
import numpy as np

# Workbook to analyse. Keep exactly one `filename` assignment active and
# leave the alternatives commented out.
# filename = 'data/qualcomm.xlsx'
# filename = 'data/Huawei.xlsx'
# filename = 'data/IBM.xlsx'
# filename = 'data/Microsoft.xlsx'
# filename = 'data/Intel.xlsx'
filename = 'data/Apple.xlsx'

class Model:
    """Score one company workbook and assign it to a centroid cluster.

    The workbook is read with ``pandas.read_excel``. Its first column is
    treated as the year column and every following column is addressed
    positionally as ``x1``, ``x2``, ... in the dictionaries this class returns.
    """

    # Coefficients of the linear formula evaluated by ``calculate_y``.
    constant = Decimal('0.446021')
    c1 = Decimal('0.051499')
    x1_c1 = Decimal('0.399772')
    x4_c1 = Decimal('0.420848')
    x11_c1 = Decimal('0.346955')
    x15_c1 = Decimal('0.40765')
    x16_c1 = Decimal('0.35787')

    c4 = Decimal('0.116825')
    x5_c4 = Decimal('-0.493432')
    x14_c4 = Decimal('0.667676')
    x16_c4 = Decimal('-0.516003')

    # Header of the year column used to filter rows in ``get_row``.
    year_column = 'Год'

    # Features that take part in cluster assignment.
    cluster_features = ('x1', 'x4', 'x5', 'x7', 'x11', 'x14', 'x15', 'x16')

    # Cluster centroids, keyed by cluster name and then by feature name.
    clusters = {
        '1': {
            'x1': Decimal('605.563704'),
            'x4': Decimal('74.1007977'),
            'x5': Decimal('10.9912602'),
            'x7': Decimal('12.0261736'),
            'x11': Decimal('3.88481466'),
            'x14': Decimal('2.34036013'),
            'x15': Decimal('2.73474355'),
            'x16': Decimal('21.0625'),
        },
        '2': {
            'x1': Decimal('2036.70322'),
            'x4': Decimal('128.053402'),
            'x5': Decimal('12.6208238'),
            'x7': Decimal('22.9470241'),
            'x11': Decimal('4.65826331'),
            'x14': Decimal('11.9321632'),
            'x15': Decimal('7.83429687'),
            'x16': Decimal('28.5'),
        },
        '3': {
            'x1': Decimal('5350.08'),
            'x4': Decimal('228.594302'),
            'x5': Decimal('29.814385'),
            'x7': Decimal('34.5612084'),
            'x11': Decimal('11.0855555'),
            'x14': Decimal('8.79057304'),
            'x15': Decimal('11.6689716'),
            'x16': Decimal('94.5'),
        },
    }

    def __init__(self, filename):
        """Remember the path of the workbook to analyse.

        Args:
            filename: Path handed to ``pandas.read_excel`` on every load.
        """
        self.data_filename = filename

    def get_dataframe(self):
        """Read the workbook from ``self.data_filename`` and return it as a DataFrame."""
        return pd.read_excel(self.data_filename)

    def get_convolution(self, df=None):
        """Return the mean of every data column as a ``Decimal``.

        Args:
            df: Optional DataFrame to average. When omitted, the whole
                workbook is loaded through ``get_dataframe``.

        Returns:
            dict mapping ``'x1'``, ``'x2'``, ... (position after the first
            column) to the column mean. ``Series.mean`` skips missing cells;
            a column with no values at all yields ``Decimal('NaN')``.
        """
        if df is None:
            df = self.get_dataframe()

        convolution = {}
        # The first column is skipped: it holds the year, not a feature.
        for i, col in enumerate(list(df.columns)[1:], start=1):
            # float() keeps the conversion valid for any numpy scalar type;
            # Decimal() does not accept numpy integers directly.
            convolution[f'x{i}'] = Decimal(float(df[col].mean()))

        return convolution

    def get_row(self, start=None, stop=None):
        """Return the clustering features for the whole file, a year or an interval.

        Args:
            start: Year to select, or first year of the interval. When falsy,
                the means over the whole workbook are used.
            stop: Last year of the interval (inclusive). When falsy, only the
                rows of year ``start`` are used.

        Returns:
            dict mapping each name in ``cluster_features`` that exists in the
            data to its ``Decimal`` mean over the selected rows.

        Raises:
            ValueError: If no row falls into the requested year or interval.
        """
        if not start:
            averages = self.get_convolution()
        else:
            df = self.get_dataframe()
            # Compare years numerically so the filter works whether the
            # workbook stores them as numbers or as text.
            # NOTE(review): assumes the year cells are plain year numbers — confirm.
            years = pd.to_numeric(df[self.year_column], errors='coerce')
            first = int(start)
            last = int(stop) if stop else first
            selected = df.loc[(years >= first) & (years <= last)]
            if selected.empty:
                raise ValueError(
                    f'no rows found for years {first}..{last}'
                )
            # A single year is the interval [start, start]; the mean of one
            # row is the row itself.
            averages = self.get_convolution(selected)

        return {
            col: averages[col]
            for col in self.cluster_features
            if col in averages
        }

    def calculate_y(self):
        """Evaluate the linear formula on the whole-file column means.

        Returns:
            Decimal: ``constant + c1 * (...) + c4 * (...)`` as written below.
        """
        convolution = self.get_convolution()
        # NOTE(review): ``x11_c1`` is declared on the class but no x11 term
        # appears in the c1 group below — confirm whether that is intended.
        y = self.constant + self.c1 * (
            self.x1_c1 * convolution['x1'] + self.x4_c1 * convolution['x4'] +
            self.x15_c1 * convolution['x15'] + self.x16_c1 * convolution['x16']
        ) + self.c4 * (
            self.x5_c4 * convolution['x5'] + self.x14_c4 * convolution['x14'] +
            self.x16_c4 * convolution['x16']
        )
        return y

    def get_cluster(self, method=None, **kwargs):
        """Pick the cluster that is nearest for the largest number of features.

        Every feature votes for the cluster whose centroid value is closest
        to the feature value; the cluster with the most votes wins. On a tie
        the cluster listed first in ``clusters`` is returned.

        Args:
            method: ``'year'`` (uses ``year``), ``'interval'`` (uses
                ``year_start`` and ``year_stop``) or anything else for the
                whole-file means.
            **kwargs: ``year``, ``year_start``, ``year_stop`` as above, plus
                ``verbose`` (default True) to print the per-feature distances
                and the vote table.

        Returns:
            tuple: ``(cluster_name, features_that_voted_for_it)``.
        """
        if method == 'year':
            row = self.get_row(kwargs.get('year'))
        elif method == 'interval':
            row = self.get_row(
                kwargs.get('year_start'), stop=kwargs.get('year_stop')
            )
        else:
            row = self.get_row()

        verbose = kwargs.get('verbose', True)
        names = list(self.clusters)
        counters = {name: [] for name in names}
        for x, x_val in row.items():
            # A feature with no data cannot vote; comparing Decimal NaN
            # values would raise InvalidOperation.
            if x_val.is_nan():
                continue
            # For a single coordinate the Euclidean norm is the absolute
            # difference.
            dists = [abs(x_val - self.clusters[name][x]) for name in names]
            nearest = min(dists)
            if verbose:
                print(dists, nearest)
            counters[names[dists.index(nearest)]].append(x)

        if verbose:
            print(counters)
        # max() returns the first maximal key, i.e. the earliest cluster on a tie.
        winner = max(counters, key=lambda name: len(counters[name]))
        return winner, counters[winner]

In [133]:
# Build the model for the workbook selected in `filename`.
model = Model(filename)

In [134]:
# Load the workbook and display its raw contents.
model.get_dataframe()

Unnamed: 0,Год,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16
0,1993,58,,,,,,,,,,,,,,,17
1,1994,123,,,,,,,,,,,,,,,18
2,1995,279,,,,,,,,,,,,,,,19
3,1996,279,2.30112,27.72906,15.12648,,,,,,,,,,,,20
4,1997,280,2.58115,17.49007,10.45551,0.12844,,,,,1.05469,3.0,,,,,21
5,1998,340,0.49749,9.56501,6.90529,,0.48783,,1.46188,,0.0161,1.0,6658.0,0.250075,1.5939,0.17871,22
6,1999,302,1.61068,16.43912,13.83148,,0.84152,0.85224,2.66928,,0.0,0.0,6960.0,0.258621,2.2512,0.2278,23
7,2000,222,2.66454,27.06237,23.06217,,1.2882,1.42041,3.95274,,0.0,0.0,8568.0,0.256769,0.59664,0.28476,24
8,2001,310,0.03875,8.31265,9.33255,0.1023,0.6665,0.8742,1.7639,0.1178,0.12555,2.0,9603.0,0.260335,1.364,0.1581,25
9,2002,381,0.1547,13.66596,14.98924,0.2023,1.06148,1.47798,2.63942,0.28322,0.12376,5.0,10211.0,0.263148,0.22134,0.27132,26


In [135]:
# Column means over the whole workbook, keyed x1..xN by column position.
conv = model.get_convolution()
conv

{'x1': Decimal('1343.96153846153856648015789687633514404296875'),
 'x2': Decimal('24.979001818181814087438397109508514404296875'),
 'x3': Decimal('121.001121363636372052496881224215030670166015625'),
 'x4': Decimal('139.9903295454545286702341400086879730224609375'),
 'x5': Decimal('2.2572527777777775526146797346882522106170654296875'),
 'x6': Decimal('4.78296299999999963148411552538163959980010986328125'),
 'x7': Decimal('14.2909615789473658509223241708241403102874755859375'),
 'x8': Decimal('10.07207700000000016871126717887818813323974609375'),
 'x9': Decimal('2.613375294117647573699514396139420568943023681640625'),
 'x10': Decimal('0.404936000000000018150814184991759248077869415283203125'),
 'x11': Decimal('3.350000000000000088817841970012523233890533447265625'),
 'x12': Decimal('44339.699999999997089616954326629638671875'),
 'x13': Decimal('0.32285639312361313901789117153384722769260406494140625'),
 'x14': Decimal('1.259847000000000161179514179821126163005828857421875'),
 'x15': Dec

In [136]:
# Evaluate the model's linear formula on the whole-file column means.
y = model.calculate_y()
y

Decimal('29.97379473928498665430518264')

In [137]:
# Keep only the column means that take part in cluster assignment.
conv_for_clust = {
    feature: mean_value
    for feature, mean_value in conv.items()
    if feature in {'x1', 'x4', 'x5', 'x7', 'x11', 'x14', 'x15', 'x16'}
}
conv_for_clust

{'x1': Decimal('1343.96153846153856648015789687633514404296875'),
 'x4': Decimal('139.9903295454545286702341400086879730224609375'),
 'x5': Decimal('2.2572527777777775526146797346882522106170654296875'),
 'x7': Decimal('14.2909615789473658509223241708241403102874755859375'),
 'x11': Decimal('3.350000000000000088817841970012523233890533447265625'),
 'x14': Decimal('1.259847000000000161179514179821126163005828857421875'),
 'x15': Decimal('4.33112349999999945993067740346305072307586669921875'),
 'x16': Decimal('29.5')}

In [138]:
# Vote-based cluster assignment on the whole-file column means (no method given).
cluster, params = model.get_cluster()
cluster, params
# NOTE(review): the list below looks like distances pasted from an earlier run — confirm or remove.
#[1807.0510371704534, 381.35235567209253, 2943.249775313835]

[Decimal('738.3978344615385664801578969'), Decimal('692.7416815384614335198421031'), Decimal('4006.118461538461433519842103')] 692.7416815384614335198421031
[Decimal('65.88953184545452867023414001'), Decimal('11.93692754545452867023414001'), Decimal('88.60397245454547132976585999')] 11.93692754545452867023414001
[Decimal('8.734007422222222447385320265'), Decimal('10.36357102222222244738532027'), Decimal('27.55713222222222244738532027')] 8.734007422222222447385320265
[Decimal('2.264787978947365850922324171'), Decimal('8.656062521052634149077675829'), Decimal('20.27024682105263414907767583')] 2.264787978947365850922324171
[Decimal('0.5348146599999999111821580300'), Decimal('1.308263309999999911182158030'), Decimal('7.735555499999999911182158030')] 0.5348146599999999111821580300
[Decimal('1.080513129999999838820485820'), Decimal('10.67231619999999983882048582'), Decimal('7.530726039999999838820485820')] 1.080513129999999838820485820
[Decimal('1.596379949999999459930677403'), Decimal('3.50

('1', ['x5', 'x7', 'x11', 'x14', 'x15'])

In [131]:
# Euclidean vectors distance
# Distance from one company's feature means (`conv`) to every cluster
# centroid, taken over all features at once. Results are collected in `dists`
# in the iteration order of `clusters`.
dists = []
# Huawei
# conv = {
#     'x1': Decimal('2412.52631578947375601273961365222930908203125'),
#     'x4': Decimal('67.4692589607142991781074670143425464630126953125'),
#     'x5': Decimal('0.82760124999999995498711768959765322506427764892578125'),
#     'x7': Decimal('6.83693876857142868175287730991840362548828125'),
#     'x11': Decimal('1.357142857142857206298458550008945167064666748046875'),
#     'x14': Decimal('13.4471415384615387011990605969913303852081298828125'),
#     'x15': Decimal('5.74289615384615270698986932984553277492523193359375'),
#     'x16': Decimal('18.5')
# }
# Qualcomm
# conv = {
#     'x1': Decimal('939.0869565217391254918766207993030548095703125'),
#     'x4': Decimal('42.44763382739130719301101635210216045379638671875'),
#     'x5': Decimal('4.17075655052631599772894333000294864177703857421875'),
#     'x7': Decimal('3.315030469500000354088342646718956530094146728515625'),
#     'x11': Decimal('2.54999999999999982236431605997495353221893310546875'),
#     'x14': Decimal('0.07687523809523810758292938771774061024188995361328125'),
#     'x15': Decimal('1.122730986190476176744823533226735889911651611328125'),
#     'x16': Decimal('20.5')
# }
# IBM
# conv = {
#     'x1': Decimal('5350.079999999999927240423858165740966796875'),
#     'x4': Decimal('228.5943015999999943232978694140911102294921875'),
#     'x5': Decimal('29.81438499999999436340658576227724552154541015625'),
#     'x7': Decimal('34.56120840000000526970325154252350330352783203125'),
#     'x11': Decimal('11.0833333333333339254522798000834882259368896484375'),
#     'x14': Decimal('8.7905730434782611837363219819962978363037109375'),
#     'x15': Decimal('11.66897159999999900037437328137457370758056640625'),
#     'x16': Decimal('94.5')
# }
# Intel
# NOTE(review): these values are identical to the means shown earlier for the
# run with filename = 'data/Apple.xlsx' — confirm which company they belong to.
conv = {
    'x1': Decimal('1343.96153846153856648015789687633514404296875'),
    'x4': Decimal('139.9903295454545286702341400086879730224609375'),
    'x5': Decimal('2.2572527777777775526146797346882522106170654296875'),
    'x7': Decimal('14.2909615789473658509223241708241403102874755859375'),
    'x11': Decimal('3.350000000000000088817841970012523233890533447265625'),
    'x14': Decimal('1.259847000000000161179514179821126163005828857421875'),
    'x15': Decimal('4.33112349999999945993067740346305072307586669921875'),
    'x16': Decimal('29.5')
}
clusters = {
    '1': {
        'x1': Decimal('605.563704'),
        'x4': Decimal('74.1007977'),
        'x5': Decimal('10.9912602'),
        'x7': Decimal('12.0261736'),
        'x11': Decimal('3.88481466'),
        'x14': Decimal('2.34036013'),
        'x15': Decimal('2.73474355'),
        'x16': Decimal('21.0625'),
    },
    '2': {
        'x1': Decimal('2036.70322'),
        'x4': Decimal('128.053402'),
        'x5': Decimal('12.6208238'),
        'x7': Decimal('22.9470241'),
        'x11': Decimal('4.65826331'),
        'x14': Decimal('11.9321632'),
        'x15': Decimal('7.83429687'),
        'x16': Decimal('28.5'),
    },
    'IBM': {
        'x1': Decimal('5350.08'),
        'x4': Decimal('228.594302'),
        'x5': Decimal('29.814385'),
        'x7': Decimal('34.5612084'),
        'x11': Decimal('11.0855555'),
        'x14': Decimal('8.79057304'),
        'x15': Decimal('11.6689716'),
        'x16': Decimal('94.5'),
    }
}

for cl_name, cl_centroid in clusters.items():
    # Pair each centroid coordinate with the company's value by feature name,
    # so the result does not depend on the two dicts listing their keys in
    # the same order.
    squared = sum(
        ((conv[x] - x_centroid) ** 2 for x, x_centroid in cl_centroid.items()),
        Decimal(0),
    )
    # Decimal.sqrt keeps the whole computation in Decimal arithmetic.
    dists.append(squared.sqrt())

print(dists)

[Decimal('741.4373863416136847149388559'), Decimal('693.0690735505970602471721000'), Decimal('4007.792592553266867881645109')]
