In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sys import platform
import numpy as np

In [None]:
# First rating sheet; its real column names sit in the first data row (see split_input_target).
df1 = pd.read_excel("Data/Web_Teaser_Bewertung_01.xlsx")

In [None]:
# Second rating sheet; its real column names sit in the first data row (see split_input_target).
df2 = pd.read_excel("Data/Web_Teaser_Bewertung_02.xlsx")

In [None]:
# Third rating sheet; its real column names sit in the first data row (see split_input_target).
df3 = pd.read_excel("Data/Web_Teaser_Bewertung_03.xlsx")

In [None]:
# All loaded rating sheets, in file order.
dfs = [df1, df2, df3]

In [None]:
# Columns that hold the labels to predict; every other column is treated as an input.
target_names = [
    "accent colour",
    "background colour",
    "font colour",
    "font contrast",
    "space",
    "roundness",
]

In [None]:
def split_input_target(df):
    """Split one raw rating sheet into integer inputs and target labels.

    The first row of ``df`` holds the real column names. It is promoted to the
    header, and rows 1..200 (at most 200 data rows) are kept.

    The optional ``teaser_name`` column is dropped whenever it is present,
    instead of only when running on Linux, so the result no longer depends on
    the operating system and a sheet without that column does not raise.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet whose first row contains the column names.

    Returns
    -------
    inputs : pandas.DataFrame
        All non-target columns, cast to ``int``.
    targets : pandas.DataFrame
        The six target columns, left as read.

    Raises
    ------
    KeyError
        If any of the target columns is missing from the sheet.
    ValueError
        If an input column cannot be converted to ``int``.
    """
    target_names = ["accent colour", "background colour", "font colour", "font contrast", "space", "roundness"]

    # Map the placeholder column labels to the names stored in the first row.
    header = df.iloc[0]
    data = df.rename(columns=header).iloc[1:201]

    # errors="ignore" makes the drop a no-op when the column is absent.
    data = data.drop(columns="teaser_name", errors="ignore")

    targets = data[target_names]
    inputs = data.drop(columns=target_names).astype(int)
    return inputs, targets

In [None]:
# Split every sheet once, then separate the (inputs, targets) pairs into two parallel lists.
split_frames = [split_input_target(frame) for frame in dfs]
inputs = [pair[0] for pair in split_frames]
targets = [pair[1] for pair in split_frames]

In [None]:
# Fresh splits of the first two sheets, used below to compare their target columns.
(inputs1, targets1), (inputs2, targets2) = (
    split_input_target(frame) for frame in (df1, df2)
)

## Check data integrity of the targets

In [None]:
# Count, per target column, the rows where the first two sheets disagree.
# Note: NaN != NaN evaluates to True, so cells missing in both sheets are counted as well.
(targets1 != targets2).sum()

In [None]:
# Boolean row mask: True where the "accent colour" label differs between the first two sheets.
mask = (targets1["accent colour"] != targets2["accent colour"])

In [None]:
#targets1[mask]

In [None]:
#targets2[mask]

## Show deviation of responses:

In [None]:
# Inspect data:
inputs[2]

In [None]:
# Calculate mean
# Element-wise sum of all response frames, divided by the number of frames.
mean_input = inputs[0].copy()  # copy so the accumulation never touches inputs[0]
for rater_inputs in inputs[1:]:
    mean_input += rater_inputs
mean_input = mean_input / len(inputs)

In [None]:
# Per-column average of the element-wise mean response.
mean_input.mean()

In [None]:
# Calculate std:
# Population standard deviation across the response frames, computed cell by cell:
# accumulate the squared deviations from the mean, average them, take the square root.
squared_deviation = (inputs[0] - mean_input) ** 2
for rater_inputs in inputs[1:]:
    squared_deviation += (rater_inputs - mean_input) ** 2
std_input = (squared_deviation / len(inputs)) ** 0.5

In [None]:
# Per-column average of the cell-wise standard deviation across the response frames.
std_input.mean()

In [None]:
# Per-column summaries: average response and average spread between the response frames.
column_means = mean_input.mean()
column_stds = std_input.mean()

# The plain line plot puts the column names on the x axis;
# the error bars are then drawn over it at the matching integer positions.
plt.plot(column_means)
plt.errorbar(range(len(column_means)), column_means, yerr=column_stds, fmt='-o')
plt.ylim(-10, 10)
plt.xticks(rotation=60)
plt.title("Mean deviation of responses with standard deviation")

## Correlation between input columns

In [None]:
# Pairwise correlation between the input columns, separately for each response frame.
corr1 = inputs[0].corr()
corr2 = inputs[1].corr()
corr3 = inputs[2].corr()

# Correlation over the rows of all three frames stacked together.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
corr_total = pd.concat(inputs[:3]).corr()

In [None]:
fig = plt.figure(figsize=(20, 20))  # width x height
# Three panels side by side in the top row of a 3x3 grid (rows, columns, position).
ax1, ax2, ax3 = (fig.add_subplot(3, 3, position) for position in (1, 2, 3))

# Options shared by all three correlation heatmaps.
heatmap_options = dict(vmin=-1, vmax=1, annot=True, annot_kws={'fontsize': 12})

# Only the left-most panel keeps its y tick labels.
sns.heatmap(corr1, ax=ax1, **heatmap_options).set_title('Werkstudent 1', fontsize=20)
sns.heatmap(corr2, ax=ax2, yticklabels=False, **heatmap_options).set_title('Werkstudent 2', fontsize=20)
sns.heatmap(corr3, ax=ax3, yticklabels=False, **heatmap_options).set_title('Matt', fontsize=20)

In [None]:
# One square panel showing the correlation over all response frames combined.
fig, ax1 = plt.subplots(figsize=(8, 8))  # width x height

combined_heatmap = sns.heatmap(
    corr_total,
    ax=ax1,
    vmin=-1,
    vmax=1,
    square=True,
    annot=True,
    annot_kws={'fontsize': 12},
)
combined_heatmap.set_title('Combined correlation', fontsize=20)

# Sklearn tests:

In [None]:
# Inspect the target columns extracted from the first sheet.
targets[0]

In [None]:
# Check NaNs: fraction of missing values per target column, over all sheets stacked together.
pd.concat(targets).isnull().mean()

In [None]:
# Stack the sheets row-wise, then put inputs and targets side by side in one frame.
stacked_inputs = pd.concat(inputs)
stacked_targets = pd.concat(targets)
dataset = pd.concat((stacked_inputs, stacked_targets), axis=1)

In [None]:
# Keep only the rows that have a "roundness" label.
dataset = dataset.dropna(subset=["roundness"])

In [None]:
# Feature matrix: every non-target column as a NumPy array.
inputs_np = dataset.drop(columns=target_names).to_numpy()
# Display its shape as the cell output.
inputs_np.shape

In [None]:
# preprocess:
# Standardize the features with StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset, before apply_classifier
# splits off its test set, so test rows influence the scaling. Consider fitting
# on the training split only.
from sklearn.preprocessing import StandardScaler
inputs_np = StandardScaler().fit_transform(inputs_np)

In [None]:
# Globally silence pandas' chained-assignment check (SettingWithCopyWarning); the default is 'warn'.
pd.options.mode.chained_assignment = None

In [None]:
# Extract target values that we want to predict:
used_targets = ["roundness"]
#used_targets = ["space", "font contrast", "roundness"]

# Work on an explicit copy so the column assignments below write to an independent
# frame instead of a slice of `dataset` (which is what triggers SettingWithCopyWarning).
targets = dataset[target_names].copy()

# Replace each label by its integer category code.
# Note: pandas encodes missing values as -1.
for column in ["space", "font contrast", "roundness"]:
    targets[column] = targets[column].astype("category").cat.codes

# Selecting with a list keeps the result 2-D, one column per used target.
targets_np = targets[used_targets].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
import sklearn
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

In [None]:
# Sanity check: features and targets should have the same number of rows.
print(inputs_np.shape, targets_np.shape)

In [None]:
def apply_classifier(inputs, targets, classifier, random_state=None):
    """Fit ``classifier`` on a random training split and print its held-out score.

    The original body ignored ``inputs`` and ``targets`` and always read the
    module-level ``inputs_np`` / ``targets_np``. The data passed by the caller
    is now the data that is used.

    Parameters
    ----------
    inputs : array-like
        Feature matrix, one row per sample.
    targets : array-like
        Target values, aligned row by row with ``inputs``.
    classifier : estimator
        Object providing ``fit(X, y)`` and ``score(X, y)``.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the split
        random on every call, as before; pass an integer for a reproducible split.

    Returns
    -------
    estimator
        The same ``classifier`` object, fitted on the training split.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        inputs, targets, random_state=random_state
    )
    classifier.fit(X_train, Y_train)
    # score() predicts on the test split itself, so no separate predict() call is needed.
    accuracy = classifier.score(X_test, Y_test)
    print("Accuracy: ", accuracy)
    return classifier

In [None]:
# Support-vector classifier; MultiOutputClassifier fits one estimator per target column,
# which lets it accept the 2-D targets_np.
#classifier = SVC()
classifier = MultiOutputClassifier(SVC())
classifier = apply_classifier(inputs_np, targets_np, classifier)

In [None]:
# Multi-layer perceptron (Adam solver, up to 1000 iterations, mini-batches of 32), one per target column.
classifier = MultiOutputClassifier(MLPClassifier(solver='adam', max_iter=1000, batch_size=32))
classifier = apply_classifier(inputs_np, targets_np, classifier)

In [None]:
# Random forest with default hyper-parameters, one per target column.
classifier = MultiOutputClassifier(RandomForestClassifier())
classifier = apply_classifier(inputs_np, targets_np, classifier)