In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
from sklearn import linear_model

# Load the full dataset; the path is resolved relative to the current working directory.
df = pd.read_excel(
    "Complete-dataset.xlsx"
)

### Note
Dit model is een alternatief voor het lineaire-regressiemodel. De business & data understanding zijn te vinden in het Jupyter-notebook van de lineaire regressie.

# Data preparation

In [2]:
# Group the rows per manufacturer and list the 20 manufacturers with the most rows.
manufacturers = df.groupby(by="Manufacturer")
manufacturers.size().nlargest(n=20)

Manufacturer
Siemens              982
Bio-Rad              788
Roche                618
Beckman Coulter      384
Tosoh                349
Abbott               310
Sebia                203
Trinity Biotech      188
Vitros               162
Alere                121
Arkray                95
Axis-Shield           57
Roche Diagnostics     56
Menarini              52
Primus                42
Roche/Hitachi         42
Bayer                 36
Metrika               30
Olympus               30
Dade Behring          24
dtype: int64

In [None]:
# Shorten the text 'Total Error' to 'Total' in every column label; later cells
# refer to the column as 'Total'. The match is a plain substring (no regex).
df.columns = df.columns.str.replace(pat='Total Error', repl='Total', regex=False)
df.columns

In [None]:
# Names of the 10 manufacturers with the most rows in the full dataset.
list_top10 = df['Manufacturer'].value_counts().head(10).index.tolist()

# Keep only the rows of those manufacturers and drop every row that has a
# missing value in any column.
top10_manufacturers = df.loc[df['Manufacturer'].isin(list_top10)].dropna()

# Last expression of the cell, so the notebook displays it: rows that remain
# per manufacturer after cleaning.
top10_manufacturers['Manufacturer'].value_counts()

In [None]:
# Total error per year, one line per top-10 manufacturer.
sns.lineplot(
    data=top10_manufacturers,
    x="Year",
    y="Total",
    hue="Manufacturer",
)

# Modeling

# Train and evaluate an OLS regression for a single manufacturer (Siemens).
# The R-squared in the summary indicates how useful the model is.

# Select the Siemens rows explicitly so this cell also runs on a fresh kernel.
Siemens = top10_manufacturers.loc[top10_manufacturers['Manufacturer'] == 'Siemens']
traindata, testdata = train_test_split(Siemens, train_size=0.65, test_size=0.35, random_state=42)

# In a formula the dependent variable comes first: 'Total' is explained by 'Year'.
mod1 = ols(formula='Total ~ Year', data=traindata).fit()
print(mod1.summary())

In [None]:
# Earlier attempt, not executed:
# traindata, testdata = train_test_split(df, train_size=0.65, test_size=0.35, random_state=42)

# Drop every row with a missing value. `df` is rebound, so all later cells
# work with the cleaned frame.
df = df.dropna()

# 80/20 split of the year values against the manufacturer labels.
# NOTE(review): the regression cell further down assigns X_train/X_test/
# Y_train/Y_test again, and nothing in between reads them.
years = df["Year"].values
manufacturer_labels = df["Manufacturer"].values
X_train, X_test, Y_train, Y_test = train_test_split(
    years,
    manufacturer_labels,
    test_size=0.20,
    random_state=40,
)

# Abandoned per-manufacturer experiment, kept for reference only (not executed):
# for name, data in manufacturers:
#     model=ols("Manufacturer ~ Total", data=traindata)
#     # results = model.fit(traindata, testdata)
#     # predicted=results.predict(testdata)
#     results =model.fit(X_train, Y_train)
#     predicted = results.predict(X_test)
#     residuals = testdata['Manufacturer'] - predicted
#     residual_data = pd.DataFrame({'Manufacturer': testdata['Manufacturer'], 'Total': testdata['Total']})
#     print(residual_data)
#     sns.lineplot(residual_data,x='Manufacturer', y='Total')

# Background reading: https://machinelearningmastery.com/make-predictions-scikit-learn/

In [None]:
# Check each manufacturer's 'Total' series for stationarity with the augmented
# Dickey-Fuller test. A p-value below 0.05 rejects the unit-root hypothesis,
# i.e. the series can be treated as stationary. The partial autocorrelation
# plot helps to choose the number of lags for an autoregressive model.
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_pacf

manufacturers = top10_manufacturers.groupby("Manufacturer")

for name, data in manufacturers:
    # Time-series tests assume observations in time order, so sort by year.
    # The stable sort keeps the existing order of rows within the same year.
    # NOTE(review): assumes rows within one year have no meaningful order.
    total_by_time = data.sort_values("Year", kind="stable")["Total"]

    df_stationarityTest = adfuller(total_by_time, autolag='AIC')
    # Element 1 of the result tuple is the p-value; label it with the
    # manufacturer so the printed values can be told apart.
    print(f"{name} - P-value: {df_stationarityTest[1]}")
    pacf = plot_pacf(total_by_time, title=name)

In [None]:
from statsmodels.tsa.ar_model import AutoReg

# Number of most recent observations held out per manufacturer for testing.
HOLDOUT_SIZE = 100
# Below this many training observations an AR(1) fit is not attempted.
MIN_TRAIN_SIZE = 10

# Fit an AR(1) model on the 'Total' series of every manufacturer.
for name, data in manufacturers:
    # Order the observations in time and use a plain 0..n-1 index, so that
    # positional slicing is unambiguous and statsmodels gets a supported index.
    series = data.sort_values('Year', kind='stable')['Total'].reset_index(drop=True)

    # Split relative to this manufacturer's own length (not the length of the
    # whole dataset) and never hold out more than half of the observations.
    n_test = min(HOLDOUT_SIZE, len(series) // 2)
    split_at = len(series) - n_test
    train_data = series[:split_at]
    test_data = series[split_at:]

    if len(train_data) < MIN_TRAIN_SIZE:
        print(f"{name}: skipped, only {len(train_data)} training observations")
        continue

    # Instantiate and fit the AR model with the training data.
    ar_model = AutoReg(train_data, lags=1).fit()

    # Print the summary, labelled with the manufacturer it belongs to.
    print(name)
    print(ar_model.summary())
    

In [None]:
from matplotlib import pyplot
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Fit and plot one linear regression of 'Total' on 'Year' per manufacturer.
for name, data in manufacturers:
    # Use this manufacturer's rows only, so every iteration fits its own model.
    tempdf = data[['Year', 'Total']].dropna()
    if len(tempdf) < 2:
        print(f"{name}: skipped, not enough observations")
        continue

    X = tempdf[['Year']].values   # 2-D feature matrix with the single column 'Year'
    Y = tempdf['Total'].values    # 1-D target vector

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

    regressor = LinearRegression()
    regressor.fit(X_train, Y_train)

    print(f"{name}: intercept={regressor.intercept_}, slope={regressor.coef_[0]}, "
          f"R2 (test)={regressor.score(X_test, Y_test)}")

    # Observed values in red; the fitted line is evaluated on the sorted
    # distinct years so it is drawn once from left to right.
    years = np.sort(tempdf['Year'].unique()).reshape(-1, 1)
    fig, ax = pyplot.subplots()
    ax.scatter(tempdf['Year'], tempdf['Total'], color='red', s=10, label='Observed')
    ax.plot(years.ravel(), regressor.predict(years), label='Linear fit')
    ax.set(title=name, xlabel='Year', ylabel='Total Error')
    ax.legend()
    pyplot.show()