In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, roc_auc_score, auc, accuracy_score, confusion_matrix, classification_report
from sklearn.utils import resample
from sklearn.feature_selection import RFECV
from sklearn.pipeline import make_pipeline, Pipeline



####Function and modules for data analysis and model evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression


#####Function and modules for deep learning models
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from keras.layers import LSTM
from keras.wrappers.scikit_learn import KerasRegressor

###Function and modules for time series models
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf



#####Function and modules for data preparation and visualization
# pandas, pandas_datareader, numpy and matplotlib
import numpy as np
import pandas as pd

#Seaborn for easier visualization
import seaborn as sns
from matplotlib import pyplot
from plotly.subplots import make_subplots
from pandas.plotting import scatter_matrix

from plotly import graph_objs as go
import matplotlib.pyplot as plt
import plotly.express as px


#Display plots in the notebook
%matplotlib inline

In [None]:
    
    ### EDA 

# Traces of the combined production/return figure. Each trace is bound to an
# axis pair by name ('x'/'y', 'x2', 'y2', 'x3', 'y3'); the axes themselves are
# configured in the layout.
data_df = [
    # Heatmap on the primary axes.
    go.Heatmap(
        x=x,
        y=y,
        z=z_heatmap,
        name='produziert <-> retourniert',
        xaxis='x',
        yaxis='y',
        colorscale='Portland',
        hoverlabel={'namelength': 50},
    ),
    # Horizontal bars drawn against the secondary x axis.
    go.Bar(
        x=x_returned_Produk,
        y=y_returned_Produk,
        name='retourniert',
        orientation='h',
        xaxis='x2',
        marker={'color': 'rgba(34,34,34,0.2)'},
    ),
    # Vertical bars drawn against the secondary y axis.
    go.Bar(
        x=x_produced_Produkt,
        y=y_produced_Produkt,
        name='produziert',
        yaxis='y2',
        marker={'color': 'rgba(34,34,34,0.2)'},
    ),
    # Line trace on the third y axis.
    go.Scatter(
        x=df1_total_produkt.df_year_produced,
        y=df1_total_produkt.df_propofproduction,
        yaxis='y3',
        name='Anteil an Monatsproduktion',
        hoverlabel={'namelength': 40},
    ),
    # Line trace on the third x axis.
    # NOTE(review): x and y come from two different frames
    # (df_rate_returned vs. df_rate_returned_neu_totalPro) - confirm intended.
    go.Scatter(
        x=df_rate_returned.prop_neue_prod,
        y=df_rate_returned_neu_totalPro.year,
        xaxis='x3',
        orientation='h',
        name='Retourenanteil zu aktiven Produkten',
        hoverlabel={'namelength': 50},
    ),
]
# Layout of the combined figure.
#
# A truncated, unterminated copy of this definition used to precede it and made
# the whole cell a SyntaxError; only the complete definition is kept.
#
# Axis arrangement:
#   x / y   : main panel (heatmap), occupying the lower-left part of the canvas
#   x2      : narrow strip to the right of the main panel
#   y2      : narrow strip above the main panel
#   x3 / y3 : percentage-formatted axes overlaid on x2 / y2 (top / right side)
layout = go.Layout(
    autosize=False,
    title='Beziehung zwischen Produktionsdatum und Retouren ',
    xaxis=dict(
        title='Produktionsdatum',
        zeroline=False,
        domain=[0, 0.82],
        showgrid=False,
    ),
    yaxis=dict(
        title='Retourendatum',
        zeroline=False,
        domain=[0, 0.75],
        showgrid=False,
    ),
    xaxis2=dict(
        zeroline=False,
        domain=[0.85, 1],
        showgrid=False,
    ),
    xaxis3=dict(
        zeroline=False,
        domain=[0.85, 1],
        showgrid=False,
        overlaying='x2',
        side='top',
        tickformat=',.2%',
    ),
    # NOTE(review): y2 starts at 0.74 while y ends at 0.75 and y3 starts at
    # 0.75 - confirm the slight overlap is intended.
    yaxis2=dict(
        zeroline=False,
        domain=[0.74, 1],
        showgrid=False,
    ),
    yaxis3=dict(
        zeroline=False,
        showgrid=False,
        domain=[0.75, 1],
        overlaying='y2',
        side='right',
        tickformat=',.1%',
    ),
    height=800,
    width=1280,
    bargap=0,
    hovermode='closest',
    showlegend=False,
)

# The figure is only constructed here; nothing in this cell renders it.
fig = go.Figure(data=data_df, layout=layout)
        
######################################################################################################################

#### Anomalien in Bestellungen basierend auf "Payment_Method" erkennen
        
def AnomalienPlots(df, features, tb_label=None):
    """Print an HTML fragment plotting one feature over time with a band around it.

    The centre line is ``df['avg_<features>']``; the band is bounded by
    ``avg + se`` and ``avg - se`` using ``df['se_<features>']``.

    Parameters
    ----------
    df : DataFrame-like
        Must provide the columns ``'Date'``, ``'avg_' + features`` and
        ``'se_' + features``.
    features : str
        Name of the single feature to plot. It is concatenated with the
        ``'avg_'`` / ``'se_'`` prefixes, so it has to be a string.
    tb_label : str, optional
        Legend label of the centre line. Defaults to ``features``.
        (The original body read an undefined name ``tb_label``.)

    Returns
    -------
    None
        The figure is serialised with ``fig.to_html`` and written to stdout,
        prefixed with ``'%html '``.
    """
    # The original body mixed the parameter name `features` with an undefined
    # `feature`; bind one local name and use it consistently.
    feature = features
    if tb_label is None:
        tb_label = feature

    # Two stacked rows are created, but only the first one receives traces.
    fig = make_subplots(
        rows=2,
        cols=1,
        row_heights=[0.7, 0.3],
        print_grid=False,
        vertical_spacing=0.02,
    )

    df_line = df['avg_' + feature]
    df_UpBound = df_line + df['se_' + feature]
    df_LowBound = df_line - df['se_' + feature]

    # Centre line.
    fig.add_trace(
        go.Scatter(
            x=df['Date'], y=df_line, fill=None, opacity=0.3,
            mode='lines+markers', name=tb_label,
            line=dict(color='rgb(178,236,93)', width=4, dash=None),
            line_shape='spline',
        ),
        row=1, col=1,
    )
    # Upper bound; 'tonexty' fills towards the previously added trace.
    fig.add_trace(
        go.Scatter(
            x=df['Date'], y=df_UpBound, fill='tonexty', showlegend=False,
            mode='lines', name=tb_label,
            line=dict(color='rgb(179,109,237)', width=1, dash=None),
        ),
        row=1, col=1,
    )
    # Lower bound; fills towards the upper bound.
    fig.add_trace(
        go.Scatter(
            x=df['Date'], y=df_LowBound, fill='tonexty', showlegend=False,
            mode='lines',
            line=dict(color='rgb(178,236,93)', width=1, dash=None),
        ),
        row=1, col=1,
    )

    fig.update_layout(
        title={'text': '<b>Zeitverlauf von ' + feature + '</b>', 'x': 0.40, 'y': 0.95, 'font': {'size': 27}},
        xaxis_title='Date',
        yaxis_title=feature,
        height=710,
        legend={'orientation': 'h', 'y': 1.071, 'x': 0, 'font': {'size': 15}},
    )

    # Keep the figure and its HTML serialisation under separate names.
    html = fig.to_html(
        include_plotlyjs=False,
        full_html=False,
        config={'displayModeBar': False, 'editable': False},
        default_width='99%',
    )

    print('%html {0}'.format(html))
        
        

    


# Fitting the Predictive Models

In [None]:
#### Classification

# "Payment_Method" is the label; every other column of df is a predictor.
y = df["Payment_Method"]
X = df.drop(columns=['Payment_Method'])

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions equal in both partitions; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Random Forest and Gradient Boosting, each tuned inside a pipeline

# Each pipeline standardises the features and then fits the classifier.
# make_pipeline names the steps after the lower-cased class names, which is
# where the '<step>__<param>' prefixes in the grids below come from.
pipelines = {
    'rf': make_pipeline(StandardScaler(),
                        RandomForestClassifier(random_state=42, class_weight='balanced')),
    'gb': make_pipeline(StandardScaler(),
                        GradientBoostingClassifier(random_state=42)),
}

# Hyperparameter grid for the random forest.
# 'auto' was dropped from max_features: for classifiers it was an alias of
# 'sqrt' (so it only duplicated work) and current scikit-learn rejects it.
rf_hyperparameters = {
    'randomforestclassifier__n_estimators': [100, 200],
    'randomforestclassifier__max_features': ['sqrt', 0.30],
}

# Hyperparameter grid for gradient boosting.
gb_hyperparameters = {
    'gradientboostingclassifier__n_estimators': [100, 200],
    'gradientboostingclassifier__learning_rate': [0.05, 0.1, 0.2],
    'gradientboostingclassifier__max_depth': [1, 3, 5],
}

# Grids keyed by the same names as `pipelines`.
hyperparameters = {
    'rf': rf_hyperparameters,
    'gb': gb_hyperparameters,
}

# Fitted GridSearchCV objects, keyed by pipeline name.
fitted_alternative_models = {}

# Tune every pipeline with a 10-fold grid search on the training split and
# store the fitted search object in fitted_alternative_models.
for name, pipeline in pipelines.items():
    # n_jobs=-1 uses all available cores.
    alt_model = GridSearchCV(pipeline, hyperparameters[name], cv=10, n_jobs=-1)

    alt_model.fit(X_train, y_train)

    fitted_alternative_models[name] = alt_model

    # Progress message.
    print(name, 'has been fitted.')

In [None]:
# Test-set accuracy of every tuned model, keyed by pipeline name.
predicted_alternative_models = {
    name: accuracy_score(y_test, model.predict(X_test))
    for name, model in fitted_alternative_models.items()
}

# Display the accuracies as the cell output.
predicted_alternative_models

In [None]:
# Confusion matrix of the tuned random forest on the test split:
# rows are the true labels, columns the predicted labels, and margins=True
# adds the row/column totals. (The closing parenthesis was missing.)
pd.crosstab(
    y_test,
    fitted_alternative_models['rf'].predict(X_test),
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

In [None]:
# Build and print the classification report of the tuned random forest
# on the test split.
rf_test_predictions = fitted_alternative_models['rf'].predict(X_test)
rf_report = classification_report(y_test, rf_test_predictions)
print(rf_report)

In [None]:
## Forecasting the number of orders

# "numberOfOrders" is the target; every other column of df is a predictor.
# The target is bound to `Y` because the split below and the later cells use
# Y / Y_train / Y_test (the original assigned it to `y` and then sliced an
# undefined `Y`).
X = df.drop(['numberOfOrders'], axis=1)
Y = df["numberOfOrders"]

# Positional split: the first 80% of the rows are used for modelling, the
# remaining 20% for testing (no shuffling).
# NOTE: this rebinds X, X_train and X_test from the classification cells above.
validation_size = 0.2
train_size = int(len(X) * (1 - validation_size))
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
Y_train, Y_test = Y.iloc[:train_size], Y.iloc[train_size:]

# Seed for every estimator that has a random component, so a re-run
# reproduces the same numbers.
seed = 42

models = [
    # Regression and tree regression algorithms
    ('LR', LinearRegression()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor(random_state=seed)),
    ('SVR', SVR()),
    # Neural network
    ('MLP', MLPRegressor(random_state=seed)),
    # Ensembles: boosting
    ('ABR', AdaBoostRegressor(random_state=seed)),
    ('GBR', GradientBoostingRegressor(random_state=seed)),
    # Ensembles: bagging
    ('RFR', RandomForestRegressor(random_state=seed)),
    ('ETR', ExtraTreesRegressor(random_state=seed)),
]

num_folds = 10
# scikit-learn scorers follow "greater is better", so MSE is only available
# negated; the sign is flipped back after cross-validation.
scoring = 'neg_mean_squared_error'

names = []
kfold_results = []
test_results = []
train_results = []
for name, model in models:
    names.append(name)

    # k-fold analysis on the training rows. The folds are contiguous
    # (shuffle is off), so no random_state is passed: KFold rejects a
    # random_state when shuffle=False.
    kfold = KFold(n_splits=num_folds)
    cv_results = -1 * cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    kfold_results.append(cv_results)

    # Fit on the full training period and record the in-sample error.
    res = model.fit(X_train, Y_train)
    train_result = mean_squared_error(Y_train, res.predict(X_train))
    train_results.append(train_result)

    # Out-of-sample error on the held-out rows.
    test_result = mean_squared_error(Y_test, res.predict(X_test))
    test_results.append(test_result)


In [None]:
## Compare the algorithms by their cross-validation results

# One box per model, in the order the models were evaluated.
fig, ax = pyplot.subplots(figsize=(15, 8))
fig.suptitle('Algorithm Comparison: Kfold results')
ax.boxplot(kfold_results)
ax.set_xticklabels(names)
pyplot.show()

In [None]:
## Training and test error
# Compare the algorithms side by side

fig, ax = pyplot.subplots(figsize=(15, 8))
fig.suptitle('Algorithm Comparison')

width = 0.35                     # width of a single bar
ind = np.arange(len(names))      # x position of each model's bar pair

# Train bar sits left of the tick, test bar right of it.
ax.bar(ind - width/2, train_results, width=width, label='Train Error')
ax.bar(ind + width/2, test_results, width=width, label='Test Error')

ax.set_xticks(ind)
ax.set_xticklabels(names)
ax.legend()
pyplot.show()


In [None]:
## Zeitreihenanalyse basierend auf Modellen: ARIMA und LSTM



def evaluate_Arima_model(arima_order):
    """Fit an ARIMA model of the given order and return its in-sample MSE.

    Reads the notebook-level names ``Y_train`` (passed as ``endog``) and
    ``X_train_ARIMA`` (passed as ``exog``).

    Parameters
    ----------
    arima_order
        Value forwarded as ARIMA's ``order`` argument.

    Returns
    -------
    The mean squared error between ``Y_train`` and the fitted values of the
    model.
    """
    # NOTE(review): assumes fittedvalues has the same length as Y_train;
    # confirm this holds for differenced orders (d > 0) with the ARIMA class
    # imported in this notebook.
    fitted = ARIMA(endog=Y_train, exog=X_train_ARIMA, order=arima_order).fit()
    return mean_squared_error(Y_train, fitted.fittedvalues)


# Evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(p_values, d_values, q_values, evaluate=None):
    """Grid-search ARIMA orders and print the score of each and of the best one.

    Parameters
    ----------
    p_values, d_values, q_values : iterable
        Candidate values; every combination ``(p, d, q)`` is evaluated.
    evaluate : callable, optional
        Function mapping an order tuple to an error value (lower is better).
        Defaults to the notebook's ``evaluate_Arima_model``.

    Returns
    -------
    None
        One line is printed per successfully evaluated order, one line per
        skipped order, and a final line with the best order and its score.
    """
    if evaluate is None:
        # The original called `evaluate_arima_model`, which does not exist;
        # the function defined in this notebook is `evaluate_Arima_model`.
        # The resulting NameError was hidden by a bare `except`, so no order
        # was ever scored.
        evaluate = evaluate_Arima_model

    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    mse = evaluate(order)
                except Exception as exc:
                    # Best effort: an order that cannot be fitted is skipped,
                    # but reported instead of being silently swallowed.
                    print('ARIMA%s skipped: %s' % (order, exc))
                    continue
                if mse < best_score:
                    best_score, best_cfg = mse, order
                print('ARIMA%s MSE=%.6f' % (order, mse))
    print('Best ARIMA%s MSE=%.7f' % (best_cfg, best_score))


# Candidate ARIMA orders: p in {0, 1, 2}, d and q in {0, 1}
p_values = [0, 1, 2]
d_values = range(2)
q_values = range(2)

# Score every (p, d, q) combination and print the best one.
evaluate_models(p_values=p_values, d_values=d_values, q_values=q_values)
