In [86]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Integrate and view the datset supply chain and remove the columns that are not required

data = pd.read_csv('C:/Users/doehr/Documents/GitHub/Project_supply_chain/supply_chain_project_trustpilot_advanced_merge.csv')
data.head()


Unnamed: 0.1,Unnamed: 0,Company,Name,Rating_number_customer,Heading,Comment,Stars,Invitation,Dates
0,0,skatedeluxe,Sandra,2,Jederzeit wieder,"<p class=""typography_body-l__v5JLj typography_...",5,Auf Einladung,5. März 2025
1,1,skatedeluxe,customer,2,Schnelle Lieferung,No comment,5,Auf Einladung,5. März 2025
2,2,skatedeluxe,Dexter,1,Bester Service und top Qualität,"<p class=""typography_body-l__v5JLj typography_...",5,Auf Einladung,4. März 2025
3,3,skatedeluxe,Stephan Lameck,1,Schnelligkeit,"<p class=""typography_body-l__v5JLj typography_...",5,Auf Einladung,4. März 2025
4,4,skatedeluxe,Fritz Brack,2,Super Service,"<p class=""typography_body-l__v5JLj typography_...",5,Auf Einladung,3. März 2025


In [88]:
# Define a function to clean each comment
def extract_comment(text):
    if isinstance(text, str):
        # Find text between the first ">" and the next "<"
        matches = re.findall(r'>([^<]+)<', text)
        if matches:
            return matches[0]
    return text  # If not a string or no match, return as is

# Apply the cleaning function to the Comment column
data['Comment'] = data['Comment'].apply(extract_comment)


In [89]:
data['Text'] = data['Heading'] + ' ' + data['Comment']
data = data.drop(['Name','Heading','Comment'], axis =1)

In [90]:
data['Text'] = data['Text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
data["Text"] = data["Text"].str.lower()

In [91]:
data.head()

Unnamed: 0.1,Unnamed: 0,Company,Rating_number_customer,Stars,Invitation,Dates,Text
0,0,skatedeluxe,2,5,Auf Einladung,5. März 2025,jederzeit wieder sehr schnelle lieferung gutes...
1,1,skatedeluxe,2,5,Auf Einladung,5. März 2025,schnelle lieferung no comment
2,2,skatedeluxe,1,5,Auf Einladung,4. März 2025,bester service und top qualität der bestellvor...
3,3,skatedeluxe,1,5,Auf Einladung,4. März 2025,schnelligkeit ausgefallene produkte
4,4,skatedeluxe,2,5,Auf Einladung,3. März 2025,super service super service extrem schnelle li...


In [None]:
#stopword filtering; for the moment there are no other stopword added; perhaps we should change and add some stopwords


stop_words = set(stopwords.words('german'))
data['Text'] = data['Text'].apply(lambda x: ' '.join([word for word in word_tokenize(x.lower()) if word not in stop_words]))
data.head(5)

Unnamed: 0.1,Unnamed: 0,Company,Rating_number_customer,Stars,Invitation,Dates,Text
0,0,skatedeluxe,2,5,Auf Einladung,5. März 2025,jederzeit schnelle lieferung gutes preisleistu...
1,1,skatedeluxe,2,5,Auf Einladung,5. März 2025,schnelle lieferung no comment
2,2,skatedeluxe,1,5,Auf Einladung,4. März 2025,bester service top qualität bestellvorgang unk...
3,3,skatedeluxe,1,5,Auf Einladung,4. März 2025,schnelligkeit ausgefallene produkte
4,4,skatedeluxe,2,5,Auf Einladung,3. März 2025,super service super service extrem schnelle li...


In [None]:
#lemmatization and tokenization with Tfidf
# I am not really sure if the tokenization is correct, because Paul had other variables.



wordnet_lemmatizer = WordNetLemmatizer()

def lemmatization(words) :
    output = []
    for string in words :
        lemma = wordnet_lemmatizer.lemmatize(string)
        if (lemma not in output) : output.append(lemma)
    return output


# Separate the explanatory variable from the variable to be predicted
X_vf, y_vf = data.Text, data.Stars

# To perform the lemmatization, we separate the sentences and put them back together.
X_vf = X_vf.str.split()

for i in range (0, len(X_vf)):
    X_vf[i] = lemmatization(X_vf[i])
    X_vf[i] = ' '.join(X_vf[i])


# Separate the dataset into training and test data 
X_train_vf, X_test_vf, y_train_vf, y_test_vf = train_test_split(X_vf, y_vf, test_size=0.2, random_state = 42)

vec_vf = TfidfVectorizer()

# Update X_train_vf and X_test_vf values
X_train_vf = vec_vf.fit_transform(X_train_vf)
X_test_vf = vec_vf.transform(X_test_vf)

In [105]:
print(X_vf.head())

0    jederzeit schnelle lieferung gutes preisleistu...
1                        schnelle lieferung no comment
2    bester service top qualität bestellvorgang unk...
3                  schnelligkeit ausgefallene produkte
4    super service extrem schnelle lieferung top qu...
Name: Text, dtype: object


In [None]:
# Modeltesting: Which model has the best accuracy?

models = {
    'Gradient Boosting': GradientBoostingClassifier(),
    'Random Forest': RandomForestClassifier(),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier()
}

for name, model in models.items():
    model.fit(X_train_vf, y_train_vf)
    y_pred = model.predict(X_test_vf)
    print(f'{name} Accuracy: {accuracy_score(y_test_vf, y_pred)}')
    print(classification_report(y_test_vf, y_pred))

Gradient Boosting Accuracy: 0.780195865070729
              precision    recall  f1-score   support

           1       0.75      0.82      0.78       966
           2       0.38      0.04      0.07       159
           3       0.80      0.10      0.18       192
           4       0.33      0.03      0.05       237
           5       0.80      0.96      0.87      2122

    accuracy                           0.78      3676
   macro avg       0.61      0.39      0.39      3676
weighted avg       0.74      0.78      0.73      3676

Random Forest Accuracy: 0.8686071817192601
              precision    recall  f1-score   support

           1       0.82      0.94      0.87       966
           2       0.97      0.47      0.64       159
           3       0.97      0.47      0.64       192
           4       0.92      0.25      0.40       237
           5       0.88      0.97      0.93      2122

    accuracy                           0.87      3676
   macro avg       0.91      0.62      0.6

In [None]:
# RandomForest deliver nearly the same accuracy of 0.86!

In [None]:
# instead of over- or undersampling I choosed the BalanceRandomForestClassifier. Perhaps we also could do a test with oversampling or SMOTE

from imblearn.ensemble import BalancedRandomForestClassifier

bclf = BalancedRandomForestClassifier()
bclf.fit(X_train_vf, y_train_vf) 
y_pred = bclf.predict(X_test_vf)
print(" Accuracy:", accuracy_score(y_test_vf, y_pred))
print(classification_report(y_test_vf, y_pred))



 Accuracy: 0.809031556039173
              precision    recall  f1-score   support

           1       0.78      0.91      0.84       966
           2       0.73      0.50      0.59       159
           3       0.65      0.58      0.62       192
           4       0.33      0.47      0.39       237
           5       0.92      0.84      0.88      2122

    accuracy                           0.81      3676
   macro avg       0.68      0.66      0.67      3676
weighted avg       0.83      0.81      0.81      3676



In [None]:
# The accuracy of these resampling-method is not so good as without resampling!
# accuracy - Randomforest : 0.868
# accuracy - BalancedRandomForestClassifier: 0.809

In [None]:
#Check if there is a influence of comments with invitation of the companies
# filter the dataframe for comments with value "No Invitation" in column Invitation

value = "No Invitation"
data_no_inv = data[data['Invitation']==value]
data_no_inv.reset_index(drop=True, inplace=True)

data_no_inv.head()

Unnamed: 0.1,Unnamed: 0,Company,Rating_number_customer,Stars,Invitation,Dates,Text
0,20,KICKZ,1,1,No Invitation,Aktualisiert vor 6 Tagen,anfragen per mail telefonisch paypal brachten ...
1,21,KICKZ,1,1,No Invitation,4. März 2025,schlechter kundenservice ende november widerru...
2,23,KICKZ,1,1,No Invitation,4. März 2025,möchte geld zurück möchte geld zurück warte sc...
3,24,KICKZ,1,1,No Invitation,3. März 2025,stornierungen einbehalten geld seit jahren tre...
4,25,KICKZ,1,1,No Invitation,3. März 2025,verstehe kickz passiert verstehe kickz passier...


In [None]:
# do Train_test_split and lemmatization with the same function like above



# Separate the explanatory variable from the variable to be predicted
X1, y1 = data_no_inv.Text, data_no_inv.Stars

# To perform the lemmatization, we separate the sentences and put them back together.
X1 = X1.str.split()

for i in range (0, len(X1)):
    X1[i] = lemmatization(X1[i])
    X1[i] = ' '.join(X1[i])


# Separate the dataset into training and test data 
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X1, y1, test_size=0.2, random_state = 42)

vec_vf = TfidfVectorizer()

# Update X_train_vf and X_test_vf values
X_train_1 = vec_vf.fit_transform(X_train_1)
X_test_1 = vec_vf.transform(X_test_1)



#BalancedRandomForest

bclf = BalancedRandomForestClassifier()
bclf.fit(X_train_1, y_train_1) 
y_pred = bclf.predict(X_test_1)
print(" Accuracy:", accuracy_score(y_test_1, y_pred))
print(classification_report(y_test_1, y_pred))

 Accuracy: 0.8260179799048123
              precision    recall  f1-score   support

           1       0.84      0.93      0.88       833
           2       0.74      0.54      0.62       110
           3       0.66      0.55      0.60       118
           4       0.55      0.43      0.48       119
           5       0.87      0.86      0.87       711

    accuracy                           0.83      1891
   macro avg       0.73      0.66      0.69      1891
weighted avg       0.82      0.83      0.82      1891



In [None]:
models = {'Random Forest': RandomForestClassifier()}

for name, model in models.items():
    model.fit(X_train_1, y_train_1)
    y_pred = model.predict(X_test_1)
    print(f'{name} Accuracy: {accuracy_score(y_test_1, y_pred)}')
    print(classification_report(y_test_1, y_pred))

Random Forest Accuracy: 0.8635642517186674
              precision    recall  f1-score   support

           1       0.85      0.97      0.91       833
           2       0.98      0.54      0.69       110
           3       0.98      0.47      0.64       118
           4       0.93      0.32      0.47       119
           5       0.86      0.94      0.90       711

    accuracy                           0.86      1891
   macro avg       0.92      0.65      0.72      1891
weighted avg       0.87      0.86      0.85      1891



In [None]:
#Accuracy of prediction for all comments in dataframe:
# accuracy - Randomforest : 0.868
# accuracy - BalancedRandomForestClassifier: 0.809


#Accuracy of prediction for comments without invitation:
# accuracy - Randomforest : 0.863
# accuracy - BalancedRandomForestClassifier: 0.826

#The result of the BalancedRandomForest-Model is a litte bit better for comments without invitation.