In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from IPython.display import display

In [48]:
# Load the data

df = pd.read_csv("CustomerFeedback_reviews.csv", delimiter=";", encoding="latin1", on_bad_lines='skip')
print(df.columns)
df.head()


Index(['Review Number', 'App Name', 'App Store', 'App', 'Store', 'App ID',
       'Country', 'Version', 'Rating', 'Date', 'Author', 'Subject',
       'Review ID', 'Body', 'Translated Subject', 'Translated Body',
       'Sentiment', 'Device', 'Language', 'OS Version', 'Reply URL', 'Topics',
       'Custom Topics', 'Tags', 'Label1', 'Label2', 'Label3', 'Label4',
       'Product Category 1', 'Product Category 2', 'Product Category 3',
       'Product', 'Product Code', 'Error Code', 'Notes', 'HC function',
       'Unnamed: 36', 'Dish Care', 'Laundry Care', 'Cooking', 'PCP', 'Cooling',
       'Review-ID from AIM', 'Unnamed: 43', 'Unnamed: 44'],
      dtype='object')


  df = pd.read_csv("CustomerFeedback_reviews.csv", delimiter=";", encoding="latin1", on_bad_lines='skip')


Unnamed: 0,Review Number,App Name,App Store,App,Store,App ID,Country,Version,Rating,Date,...,HC function,Unnamed: 36,Dish Care,Laundry Care,Cooking,PCP,Cooling,Review-ID from AIM,Unnamed: 43,Unnamed: 44
0,1,Home Connect,Google Play,Home Connect,Google Play,com.bshg.homeconnect.android.release,English,7.1.0,1,01.01.2021,...,,,,,,,,,,
1,2,Home Connect,Google Play,Home Connect,Google Play,com.bshg.homeconnect.android.release,Polish,7.1.0,2,01.01.2021,...,,,,,,,,,,
2,3,Home Connect,Google Play,Home Connect,Google Play,com.bshg.homeconnect.android.release,English,7.1.0,1,01.01.2021,...,,,,,,,,,,
3,4,Home Connect,Google Play,Home Connect,Google Play,com.bshg.homeconnect.android.release,German,,1,01.01.2021,...,,,,,,,,,,
4,5,Home Connect App,iOS,Home Connect App,iOS,901397789,United Arab Emirates,7.1.0,5,01.01.2021,...,,,,,,,,,,


In [49]:
df.columns

Index(['Review Number', 'App Name', 'App Store', 'App', 'Store', 'App ID',
       'Country', 'Version', 'Rating', 'Date', 'Author', 'Subject',
       'Review ID', 'Body', 'Translated Subject', 'Translated Body',
       'Sentiment', 'Device', 'Language', 'OS Version', 'Reply URL', 'Topics',
       'Custom Topics', 'Tags', 'Label1', 'Label2', 'Label3', 'Label4',
       'Product Category 1', 'Product Category 2', 'Product Category 3',
       'Product', 'Product Code', 'Error Code', 'Notes', 'HC function',
       'Unnamed: 36', 'Dish Care', 'Laundry Care', 'Cooking', 'PCP', 'Cooling',
       'Review-ID from AIM', 'Unnamed: 43', 'Unnamed: 44'],
      dtype='object')

In [50]:

# Select relevant columns (this will drop any missing ones silently if needed)
relevant_columns = [
    'Rating', 'Date', 'Body', 'Translated Subject', 'Subject', 'Translated Body',
    'Sentiment', 'Custom Topics', 'Label1', 'Label2', 'Label3', 'Label4',
    'Product Category 1', 'Product Category 2', 'Product Category 3',
    'Product', 'Product Code', 'Error Code', 'HC function'
]

# Keep only columns that exist in the data (in case of unexpected column names)
df = df[[col for col in relevant_columns if col in df.columns]]

# Optional: show a quick peek
df.head()


Unnamed: 0,Rating,Date,Body,Translated Subject,Subject,Translated Body,Sentiment,Custom Topics,Label1,Label2,Label3,Label4,Product Category 1,Product Category 2,Product Category 3,Product,Product Code,Error Code,HC function
0,1,01.01.2021,AVOID LIKE THE PLAGUE. Should have been zero s...,,,,negative,"[""Connection & errors""]",Sporadic connection losses,,,,Laundry Care,,,,,,
1,2,01.01.2021,Bardzo s?abo si? ??czy z telefonem. U mnie WFI...,,,It connects very poorly with the phone. For me...,negative,[],Connection speed,,,,,,,,,,
2,1,01.01.2021,Difficult to add appliance. Restarting app aft...,,,,negative,[],App reliability,Pairing,,,Laundry Care,,,Washer,,,
3,1,01.01.2021,Lsst sich nicht mit home connect verbinden (F...,,,Cannot be connected to home connect (Fold 2 / ...,negative,"[""Connection & errors""]",Pairing,,,,,,,,,,
4,5,01.01.2021,Please contact me at Skype/WhatsApp: +86189035...,,Do you wanna raise downloads and revenue over ...,,mixed,[],Spam,,,,,,,,,,


In [51]:
df.columns


Index(['Rating', 'Date', 'Body', 'Translated Subject', 'Subject',
       'Translated Body', 'Sentiment', 'Custom Topics', 'Label1', 'Label2',
       'Label3', 'Label4', 'Product Category 1', 'Product Category 2',
       'Product Category 3', 'Product', 'Product Code', 'Error Code',
       'HC function'],
      dtype='object')

In [52]:
# Standardize text columns
for col in ['Label1', 'Label2', 'Label3', 'Label4', 'Product', 'HC function', 'Product Category 1']:
    df[col] = df[col].astype(str).str.lower().str.strip()


In [53]:
# Handle missing values 
df = df.dropna(subset=['Translated Body', 'Product Category 1'])  # Drop rows with essential missing data

In [54]:
# One-hot encode categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform="pandas")


In [55]:
def one_hot_merge(df, col_name):
    encoded = encoder.fit_transform(df[[col_name]])
    df = pd.concat([df, encoded], axis=1)
    return df.drop(columns=[col_name])

In [56]:
df = one_hot_merge(df, 'Product Category 1')
df = one_hot_merge(df, 'Label1')
df = one_hot_merge(df, 'Product')
df = one_hot_merge(df, 'HC function')


In [60]:
# Define features (X) and targets (y) 
# Use all columns except the target one-hot columns for X
target_cols = [col for col in df.columns if col.startswith("product category 1_")]
label_cols = [col for col in df.columns if col.startswith("label1_")]
product_cols = [col for col in df.columns if col.startswith("product_")]
hc_cols = [col for col in df.columns if col.startswith("hc function_")]

y = df[target_cols + label_cols + product_cols + hc_cols]
X = df.drop(columns=target_cols + label_cols + product_cols + hc_cols)

In [61]:
# Optional: check remaining columns in X 
display(X.head(3))
print("Predictor columns:", X.columns.tolist())
print("Target columns:", y.columns.tolist())

Unnamed: 0,Rating,Date,Body,Translated Subject,Subject,Translated Body,Sentiment,Custom Topics,Label2,Label3,...,HC function_notification center,HC function_program assistance,HC function_program assistant,HC function_program favorites,HC function_singlekeyid,HC function_smart start,HC function_smart watch,HC function_tab counter,HC function_user manuals,HC function_widget
1,2,01.01.2021,Bardzo s?abo si? ??czy z telefonem. U mnie WFI...,,,It connects very poorly with the phone. For me...,negative,[],,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,01.01.2021,Lsst sich nicht mit home connect verbinden (F...,,,Cannot be connected to home connect (Fold 2 / ...,negative,"[""Connection & errors""]",,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,3,01.01.2021,Verbindung zum Gert funktioniert nicht immer....,,,Connection to the device does not always work....,negative,"[""Connection & errors""]",,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Predictor columns: ['Rating', 'Date', 'Body', 'Translated Subject', 'Subject', 'Translated Body', 'Sentiment', 'Custom Topics', 'Label2', 'Label3', 'Label4', 'Product Category 2', 'Product Category 3', 'Product Code', 'Error Code', 'Product Category 1_consumer products', 'Product Category 1_cooking', 'Product Category 1_cooling', 'Product Category 1_dish care', 'Product Category 1_laundry care', 'Product Category 1_nan', 'Product Category 1_pcp', 'Label1_3rd party', 'Label1_app availability', 'Label1_app compatibility', 'Label1_app crash', 'Label1_app functions', 'Label1_app installation', 'Label1_app performance', 'Label1_app reliability', 'Label1_app start', 'Label1_barrier free use', 'Label1_connection losses after update', 'Label1_connection other', 'Label1_connection speed', 'Label1_customer service', 'Label1_device functions', 'Label1_feature shutdown', 'Label1_firmware update', 'Label1_improvement ideas', 'Label1_language', 'Label1_login', 'Label1_nan', 'Label1_pairing', 'Label1

In [62]:
# Train/Valid/Test Split 
X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=0.25, random_state=42)