In [10]:
# Imports
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    matthews_corrcoef,
    precision_recall_fscore_support,
)
from sklearn.model_selection import KFold

## Data processing

In [11]:
def getDataAndLabel(df, dataColumnName, labelColumnName):
  """Extract one text column and one label column as lists of strings.

  Args:
    df: DataFrame holding the samples.
    dataColumnName: name of the column containing the texts.
    labelColumnName: name of the column containing the labels.

  Returns:
    A tuple ``(texts, labels)`` of two equally long lists; every entry is
    converted with ``str()``, so non-string cells become their string form.
  """
  texts = df.loc[:, dataColumnName].to_numpy()
  labels = df.loc[:, labelColumnName].to_numpy()

  # Cast every cell to str so downstream code always receives plain strings.
  X_train = [str(text) for text in texts]
  y_train = [str(label) for label in labels]

  return X_train, y_train

In [12]:
def getDataClass(df, dataColumnName, labelColumnName, labels=('B', 'D', 'I', 'P')):
  """Concatenate all texts of each class into one document per class.

  Args:
    df: DataFrame holding the samples.
    dataColumnName: name of the column containing the texts.
    labelColumnName: name of the column containing the labels.
    labels: class labels to collect, in output order. Defaults to the four
      labels 'B', 'D', 'I', 'P'.

  Returns:
    A list with one string per entry of ``labels``. Each string is the
    concatenation of ``str(text) + '. '`` for every row whose label equals
    that entry; rows with any other label are ignored. A class without rows
    yields an empty string.
  """
  X = df.loc[:, dataColumnName].to_numpy()
  y = df.loc[:, labelColumnName].to_numpy()

  # Collect the pieces per class and join once at the end; repeatedly
  # extending a string stored in a list slot copies it on every append.
  parts = [[] for _ in labels]

  for text, label in zip(X, y):
    for position, wanted in enumerate(labels):
      if label == wanted:
        parts[position].append(str(text) + '. ')

  return [''.join(chunks) for chunks in parts]

## Import data

In [13]:
# Import of the datasets
# Mount Google Drive inside the Colab runtime.
# NOTE(review): the Excel files below are read from /content/, not from a path
# under the /content/drive mount point - confirm whether the mount is actually
# required for these reads.
drive.mount('/content/drive')

# df_train is the corpus the classifier is fitted on in the later cells;
# df_test is the corpus the fitted classifier is evaluated on.
df_train = pd.read_excel('/content/DATASET_downsampled.xlsx')
df_test = pd.read_excel('/content/Trainingskorpus_Final.xlsx')

# Imports without google drive
# df_train = pd.read_excel('DATASET_downsampled.xlsx')
# df_test = pd.read_excel('Trainingskorpus_Final.xlsx')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Label functions

In [14]:
def predict(X_train, y_train, X_test, y_test, tfidf):
  """Fit TF-IDF + logistic regression on the training data and label X_test.

  Args:
    X_train: training texts.
    y_train: training labels.
    X_test: texts to predict labels for.
    y_test: accepted for call-site symmetry; not used by this function.
    tfidf: vectorizer object; it is (re)fitted on ``X_train`` by every call.

  Returns:
    The labels predicted by the logistic regression model for ``X_test``.
  """
  # The vocabulary and IDF weights come from the training texts only.
  train_features = tfidf.fit_transform(X_train)

  classifier = LogisticRegression(max_iter = 1000)
  classifier.fit(train_features, y_train)

  test_features = tfidf.transform(X_test)
  return classifier.predict(test_features)

## Evaluation on the test set

In [15]:
# Train on the downsampled corpus and evaluate on the complete test corpus.
X_train, y_train = getDataAndLabel(df_train, 'review', 'kano_labels')
X_test, y_test = getDataAndLabel(df_test, 'review', 'kano_labels')

# `input` only accepts 'filename', 'file' or 'content'. The per-class documents
# that used to be passed here never took part in fitting (predict() fits the
# vectorizer on X_train), and scikit-learn versions that validate parameters
# reject such a value, so the default input='content' is used instead.
# NOTE(review): if a class-level TF-IDF was intended, the vectorizer would have
# to be fitted on getDataClass(...) explicitly - confirm the intent.
tfidf = TfidfVectorizer(stop_words = "english")

y_pred = predict(X_train, y_train, X_test, y_test, tfidf)

acc = accuracy_score(y_test, y_pred)
# Macro average: unweighted mean of the per-class scores.
prec, rec, f1, supp = precision_recall_fscore_support(y_test, y_pred, average = 'macro')

print("accuracy: ", acc)
print("precision: ", prec)
print("recall: ", rec)
print("f1 score: ", f1)

accuracy:  0.6029593094944513
precision:  0.38779628836244895
recall:  0.47060025117354715
f1 score:  0.38829647694470637


## Evaluation on the test set divided by labels

In [16]:
# Evaluate the classifier separately on the test reviews of each true label.
X_train, y_train = getDataAndLabel(df_train, 'review', 'kano_labels')
X_test_all, y_test_all = getDataAndLabel(df_test, 'review', 'kano_labels')

# `input` only accepts 'filename', 'file' or 'content'; the per-class documents
# formerly passed here never influenced fitting, so the default is used.
tfidf = TfidfVectorizer(stop_words = "english")

# The training data is the same for every label, so the model is fitted once
# and its predictions are sliced per label instead of refitting four times.
y_test_all = np.asarray(y_test_all, dtype = str)
y_pred_all = predict(X_train, y_train, X_test_all, y_test_all, tfidf)

# One entry per label, in the order B, P, D, I.
accuracy = []
precision = []
recall = []
f1_score = []

for label in ["B", "P", "D", "I"]:
  has_label = y_test_all == label
  y_test = y_test_all[has_label]
  y_pred = y_pred_all[has_label]

  print("------------------------------")
  print("evaluating label", label)
  print("------------------------------")

  acc = accuracy_score(y_test, y_pred)
  # NOTE(review): y_test holds a single class here. The macro average runs over
  # every label occurring in y_test or y_pred, and zero_division = 1 scores the
  # recall of classes absent from y_test as 1, so these precision/recall/f1
  # values are hard to interpret; the accuracy above equals the recall of
  # `label`. Consider per-class metrics (average=None) on the full test set.
  prec, rec, f1, sup = precision_recall_fscore_support(y_test, y_pred, average='macro', zero_division = 1)

  print("accuracy: ", acc)
  print("precision: ", prec)
  print("recall: ", rec)
  print("f1 score: ", f1)

  accuracy.append(acc)
  precision.append(prec)
  recall.append(rec)
  f1_score.append(f1)


------------------------------
evaluating label B
------------------------------
accuracy:  0.6742286751361162
precision:  0.25
recall:  0.918557168784029
f1 score:  0.2013550135501355
------------------------------
evaluating label P
------------------------------
accuracy:  0.4151898734177215
precision:  0.25
recall:  0.8537974683544304
f1 score:  0.14669051878354203
------------------------------
evaluating label D
------------------------------
accuracy:  0.7263157894736842
precision:  0.25
recall:  0.9315789473684211
f1 score:  0.21036585365853658
------------------------------
evaluating label I
------------------------------
accuracy:  0.06666666666666667
precision:  0.25
recall:  0.7666666666666667
f1 score:  0.03125


## Consistent vs. inconsistent labels

In [17]:
# A single mount call is enough for this cell; `drive` comes from the import
# cell at the top of the notebook.
drive.mount('/content/drive')

# Import of 'Trainingskorpus_Final', which gives the final labeling
testset1 = pd.read_excel('/content/Trainingskorpus_Final.xlsx')

# Import without google drive
# testset1 = pd.read_excel('Trainingskorpus_Final.xlsx')

# Import of 'Trainingskorpus_InitialLabels', which gives the initial labeling:
# two labels per review that sometimes differ
testset2 = pd.read_excel('/content/Trainingskorpus_InitialLabels.xlsx')

# Import without google drive
# testset2 = pd.read_excel('Trainingskorpus_InitialLabels.xlsx')

# Drop the 'Unnamed: 0' column before merging.
testset2 = testset2.drop(columns = ['Unnamed: 0'])

# Merge the two sets so we have the final labels as well as the initial labels in the testset
# NOTE(review): this is an inner join on the review text; reviews occurring
# more than once in either file would be multiplied - confirm uniqueness.
testset = testset1.merge(testset2, on='review')

# Add a column 'equal' which is True if the two initial labels are equal and False otherwise
equal = testset['label 1'] == testset['label 2']
testset['equal'] = equal

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
# Relate "prediction is correct" to "both initial annotators agreed".
# confusion_matrix comes from the public sklearn.metrics import in the import
# cell instead of the private sklearn.metrics._plot module.
X_train, y_train = getDataAndLabel(df_train, 'review', 'kano_labels')

# Same string conversion as in the other evaluation cells.
X_test, y_test = getDataAndLabel(testset, 'review', 'kano_labels')

# Own vectorizer so the cell does not depend on one left over from an earlier cell.
tfidf = TfidfVectorizer(stop_words = "english")

y_pred = predict(X_train, y_train, X_test, y_test, tfidf)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision, Recall, f1 Score, Support: ", precision_recall_fscore_support(y_test, y_pred, average='macro', zero_division=1))

# True where the predicted label matches the final label.
pred_corr = [pred == label for pred, label in zip(y_pred, y_test)]
print("Phi Coefficient: ", matthews_corrcoef(pred_corr, testset.equal))
# Rows: prediction correct (False, True); columns: initial labels equal (False, True).
print(confusion_matrix(pred_corr, testset.equal))

Accuracy:  0.6029593094944513
Precision, Recall, f1 Score, Support:  (0.38779628836244895, 0.47060025117354715, 0.38829647694470637, None)
Phi Coefficient:  0.192346421175206
[[149 495]
 [ 90 888]]


## 10-fold cross-validation

In [19]:
# Import of the dataset
train_data = pd.read_excel('/content/DATASET_downsampled.xlsx')

# Import without google drive
# train_data = pd.read_excel('DATASET_downsampled.xlsx')

# 10 fold cross validation
n = 10
kf = KFold(n_splits=n, random_state = 42, shuffle = True)

# Lists to store the values for accuracy, precision, recall and f1-score for each label
resultsAcc = [[],[],[],[],[]]
resultsPrecision = [[],[],[],[],[]]
resultsRecall = [[],[],[],[],[]]
resultsf1 = [[],[],[],[],[]]

for train_index, val_index in kf.split(train_data):
  train_df = train_data.iloc[train_index]
  val_df = train_data.iloc[val_index]

  X_train, y_train = getDataAndLabel(train_df, 'review', 'kano_labels')

  # Evaluating on the val_df divided by labels
  for i, label in zip(range(4), ["B", "P", "D", "I"]):
    X_test, y_test = getDataAndLabel(val_df[val_df.kano_labels == label], 'review', 'kano_labels')

    y_pred = predict(X_train, y_train, X_test, y_test, tfidf)

    acc = accuracy_score(y_test, y_pred)
    prec, rec, f1, sup = precision_recall_fscore_support(y_test, y_pred, average='macro', zero_division = 1)

    resultsAcc[i].append(acc)
    resultsPrecision[i].append(prec)
    resultsRecall[i].append(rec)
    resultsf1[i].append(f1)

  # Evaluating on the whole val_df
  X_test, y_test = getDataAndLabel(val_df, 'review', 'kano_labels')

  y_pred = predict(X_train, y_train, X_test, y_test, tfidf)

  acc = accuracy_score(y_test, y_pred)
  prec, rec, f1, sup = precision_recall_fscore_support(y_test, y_pred, average='macro', zero_division = 1)

  resultsAcc[4].append(acc)
  resultsPrecision[4].append(prec)
  resultsRecall[4].append(rec)
  resultsf1[4].append(f1)

In [20]:
# Report the mean of every metric over the folds, rounded to three decimals,
# first per label and finally for the whole validation folds.
for i, label in enumerate(["basic", "performance", "delighter", "irrelevant", "overall"]):
  print("------------\n", label, "\n------------")
  for caption, per_group_scores in (
      ("Average accuracy: ", resultsAcc),
      ("Average precision: ", resultsPrecision),
      ("Average recall: ", resultsRecall),
      ("Average f1 score: ", resultsf1),
  ):
    print(caption, np.round(np.average(per_group_scores[i]), 3))

------------
 basic 
------------
Average accuracy:  0.809
Average precision:  0.267
Average recall:  0.949
Average f1 score:  0.238
------------
 performance 
------------
Average accuracy:  0.502
Average precision:  0.25
Average recall:  0.876
Average f1 score:  0.166
------------
 delighter 
------------
Average accuracy:  0.549
Average precision:  0.25
Average recall:  0.887
Average f1 score:  0.177
------------
 irrelevant 
------------
Average accuracy:  0.702
Average precision:  0.258
Average recall:  0.924
Average f1 score:  0.213
------------
 overall 
------------
Average accuracy:  0.64
Average precision:  0.637
Average recall:  0.641
Average f1 score:  0.638
