# Common Preprocessing (Shared by Both Approaches)

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Read the raw ticket export, then discard the bookkeeping columns listed below.
cols_to_drop = [
    "Ticket id",
    "Interaction id",
    "Unnamed: 11",
    "Interaction date",
]
# errors="ignore" skips any listed column that is absent instead of raising.
df = pd.read_csv("AppGallery.csv").drop(columns=cols_to_drop, errors="ignore")

In [4]:
# Count the missing values in each column.
df.isna().sum()

Mailbox                    0
Ticket Summary             1
Interaction content        2
Innso TYPOLOGY_TICKET      0
Type 1                     0
Type 2                     0
Type 3                    33
Type 4                    35
dtype: int64

In [5]:
# Keep only the rows in which every column has a value.
df = df.dropna(axis="index", how="any")

In [6]:
# Re-count the missing values per column.
df.isna().sum()

Mailbox                   0
Ticket Summary            0
Interaction content       0
Innso TYPOLOGY_TICKET     0
Type 1                    0
Type 2                    0
Type 3                    0
Type 4                    0
dtype: int64

In [7]:
# Fail fast when the frame lacks one of the columns used below; the first
# missing column (in the listed order) is the one reported.
expected_columns = ["Ticket Summary", "Interaction content", "Type 2", "Type 3", "Type 4"]
col = next((name for name in expected_columns if name not in df.columns), None)
if col is not None:
    raise ValueError(f"Column '{col}' not found. Please verify your CSV structure.")

# Build the single text feature by joining the two free-text fields with one space.
summary_text = df["Ticket Summary"].astype(str)
content_text = df["Interaction content"].astype(str)
df["text"] = summary_text + " " + content_text

In [8]:
# Integer-encode each target column with its own encoder, so that predictions
# can be mapped back to the original labels with the matching encoder.
le_type2, le_type3, le_type4 = LabelEncoder(), LabelEncoder(), LabelEncoder()
for encoder, raw_col, encoded_col in (
    (le_type2, "Type 2", "Type2_enc"),
    (le_type3, "Type 3", "Type3_enc"),
    (le_type4, "Type 4", "Type4_enc"),
):
    df[encoded_col] = encoder.fit_transform(df[raw_col])

In [9]:
# Model input is the combined text column; targets are the three encoded label columns.
target_columns = ["Type2_enc", "Type3_enc", "Type4_enc"]
X = df.loc[:, "text"]
y = df.loc[:, target_columns]

In [10]:
# Hold out TEST_SIZE of the rows for evaluation; RANDOM_STATE seeds the shuffle
# and is reused by the estimators defined in later cells.
RANDOM_STATE = 42
TEST_SIZE = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

In [11]:
# Fit the TF-IDF vectorizer on the training text only, then apply the same
# fitted vectorizer to the test text.
tfidf = TfidfVectorizer()
X_train_vect = tfidf.fit_transform(raw_documents=X_train)
X_test_vect = tfidf.transform(raw_documents=X_test)

# 1. Multi-Output Classification Using ClassifierChain

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.metrics import accuracy_score, classification_report

In [14]:
# Every link of the chain is a random forest; order=[0, 1, 2] follows the
# column order of the targets (Type2, then Type3, then Type4).
base_estimator = RandomForestClassifier(
    n_estimators=100,
    random_state=RANDOM_STATE,
)
chain = ClassifierChain(
    base_estimator,
    order=[0, 1, 2],
    random_state=RANDOM_STATE,
)

In [15]:
# Fit the chain on the vectorised training text and the three encoded targets.
chain.fit(X=X_train_vect, Y=y_train)

In [16]:
# Predict all three targets for the held-out text, then cast the result to integer codes.
chain_raw_pred = chain.predict(X_test_vect)
y_pred_chain = chain_raw_pred.astype(int)

In [17]:
# Score each target column separately against the held-out labels.
acc_type2, acc_type3, acc_type4 = (
    accuracy_score(y_test.iloc[:, col_idx], y_pred_chain[:, col_idx])
    for col_idx in range(3)
)

print("ClassifierChain Evaluation:")
for target_name, acc in (("Type2", acc_type2), ("Type3", acc_type3), ("Type4", acc_type4)):
    print(f"Accuracy for {target_name}: {acc*100:.2f}%")

ClassifierChain Evaluation:
Accuracy for Type2: 100.00%
Accuracy for Type3: 88.89%
Accuracy for Type4: 77.78%


In [18]:
# Print a per-class precision/recall/F1 report for each target of the chain.
# zero_division=0 reports 0.0 for metrics that are undefined (e.g. a class that
# is never predicted) — the same value the default setting shows — but without
# emitting one UndefinedMetricWarning per affected metric.
chain_report_headers = (
    "\nClassification Report for Type2:",
    "Classification Report for Type3:",
    "Classification Report for Type4:",
)
for col_idx, header in enumerate(chain_report_headers):
    print(header)
    print(
        classification_report(
            y_test.iloc[:, col_idx],
            y_pred_chain[:, col_idx],
            zero_division=0,
        )
    )


Classification Report for Type2:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00         3

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18

Classification Report for Type3:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       0.75      1.00      0.86         3
           2       0.50      1.00      0.67         1
           3       0.00      0.00      0.00         2
           4       1.00      1.00      1.00         2
           5       1.00      1.00      1.00         2

    accuracy                           0.89        18
   macro avg       0.71      0.83      0.75        18
weighted avg       0.82      0.89      0.85        18

Classification Report for Type4:
              precision    rec

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
# Map the integer predictions back to the original labels, one column per target,
# using the encoder that produced each target's codes.
chain_decoders = {"Type2": le_type2, "Type3": le_type3, "Type4": le_type4}
y_pred_chain_labels = pd.DataFrame({
    target_name: encoder.inverse_transform(y_pred_chain[:, col_idx])
    for col_idx, (target_name, encoder) in enumerate(chain_decoders.items())
})
print("\nPredicted Labels (ClassifierChain):")
y_pred_chain_labels


Predicted Labels (ClassifierChain):


Unnamed: 0,Type2,Type3,Type4
0,Problem/Fault,Coupon/Gifts/Points Issues,Can't use or acquire
1,Problem/Fault,Gallery-Install/Upgrade,Other download/install/update issue
2,Problem/Fault,Third Party APPs,Refund
3,Problem/Fault,Coupon/Gifts/Points Issues,Cooperated campaign issue
4,Problem/Fault,Gallery-Install/Upgrade,Other download/install/update issue
5,Suggestion,VIP / Offers / Promotions,Offers / Vouchers / Promotions
6,Problem/Fault,Gallery-Install/Upgrade,Other download/install/update issue
7,Problem/Fault,Gallery-Install/Upgrade,Other download/install/update issue
8,Suggestion,VIP / Offers / Promotions,Offers / Vouchers / Promotions
9,Problem/Fault,Third Party APPs,Refund


# 2. Hierarchical Modelling Approach

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [22]:
# ----- Hierarchical Model for Type2 -----
# Top level of the hierarchy: a single random forest trained on all training rows.
clf_type2 = RandomForestClassifier(
    n_estimators=100,
    random_state=RANDOM_STATE,
)
clf_type2.fit(X=X_train_vect, y=y_train["Type2_enc"])

In [23]:
# ----- Hierarchical Model for Type3 -----
# One Type3 classifier per Type2 class, keyed by the encoded Type2 value and
# trained only on the training rows that carry that Type2 value.
models_type3 = {}
type2_train = y_train["Type2_enc"]
for cls in np.unique(type2_train):
    idx = type2_train == cls
    if not idx.any():
        continue
    models_type3[cls] = RandomForestClassifier(
        n_estimators=100, random_state=RANDOM_STATE
    ).fit(X_train_vect[idx], y_train.loc[idx, "Type3_enc"])

In [24]:
# ----- Hierarchical Model for Type4 -----
# Train a separate Type4 classifier on the training rows of every Type3 class;
# the resulting dictionary is keyed by the encoded Type3 value.
type3_train = y_train["Type3_enc"]
models_type4 = {}
for cls in np.unique(type3_train):
    idx = type3_train == cls
    if idx.sum() == 0:
        continue
    model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
    model.fit(X_train_vect[idx], y_train.loc[idx, "Type4_enc"])
    models_type4[cls] = model

In [25]:
# ----- Predictions with Hierarchical Models -----
# Step 1: the top-level classifier assigns an encoded Type2 value to every test sample.
y_pred_type2 = clf_type2.predict(X=X_test_vect)

In [26]:
# Step 2: Predict Type3 based on predicted Type2.
# Every sample starts at the fallback code -1, which is kept whenever no Type3
# model exists for the sample's predicted Type2 class.
y_pred_type3 = np.full(len(y_pred_type2), -1, dtype=int)
for pred_type2 in np.unique(y_pred_type2):
    # Compare against None explicitly instead of relying on the estimator's truthiness.
    model = models_type3.get(pred_type2)
    if model is None:
        continue
    # Predict all samples routed to this model in a single call rather than row by row.
    rows = np.flatnonzero(y_pred_type2 == pred_type2)
    y_pred_type3[rows] = model.predict(X_test_vect[rows])

In [27]:
# Step 3: Predict Type4 based on predicted Type3.
# Every sample starts at the fallback code -1, which is kept whenever no Type4
# model exists for the sample's predicted Type3 value (including a Type3 of -1).
y_pred_type4 = np.full(len(y_pred_type3), -1, dtype=int)
for pred_type3 in np.unique(y_pred_type3):
    # Compare against None explicitly instead of relying on the estimator's truthiness.
    model = models_type4.get(pred_type3)
    if model is None:
        continue
    # Predict all samples routed to this model in a single call rather than row by row.
    rows = np.flatnonzero(y_pred_type3 == pred_type3)
    y_pred_type4[rows] = model.predict(X_test_vect[rows])

In [28]:
# Place the three per-level predictions side by side: one row per sample,
# columns in the order Type2, Type3, Type4.
y_pred_hier = np.stack([y_pred_type2, y_pred_type3, y_pred_type4], axis=1).astype(int)

In [29]:
# ----- Evaluation for Hierarchical Modelling -----
# Accuracy is computed per target column against the held-out labels.
hier_accuracies = [
    accuracy_score(y_test.iloc[:, col_idx], y_pred_hier[:, col_idx])
    for col_idx in range(3)
]
acc_type2_hier, acc_type3_hier, acc_type4_hier = hier_accuracies

print(
    "\nHierarchical Modelling Evaluation:",
    f"Accuracy for Type2: {acc_type2_hier*100:.2f}%",
    f"Accuracy for Type3: {acc_type3_hier*100:.2f}%",
    f"Accuracy for Type4: {acc_type4_hier*100:.2f}%",
    sep="\n",
)


Hierarchical Modelling Evaluation:
Accuracy for Type2: 100.00%
Accuracy for Type3: 88.89%
Accuracy for Type4: 83.33%


In [30]:
# Print a per-class precision/recall/F1 report for each hierarchical target.
# zero_division=0 reports 0.0 for metrics that are undefined (e.g. a class that
# is never predicted) — the same value the default setting shows — but without
# emitting one UndefinedMetricWarning per affected metric.
hier_report_headers = (
    "\nClassification Report for Hierarchical Type2:",
    "Classification Report for Hierarchical Type3:",
    "Classification Report for Hierarchical Type4:",
)
for level_idx, level_header in enumerate(hier_report_headers):
    print(level_header)
    print(
        classification_report(
            y_test.iloc[:, level_idx],
            y_pred_hier[:, level_idx],
            zero_division=0,
        )
    )


Classification Report for Hierarchical Type2:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00         3

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18

Classification Report for Hierarchical Type3:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       0.75      1.00      0.86         3
           2       0.50      1.00      0.67         1
           3       0.00      0.00      0.00         2
           4       1.00      1.00      1.00         2
           5       1.00      1.00      1.00         2

    accuracy                           0.89        18
   macro avg       0.71      0.83      0.75        18
weighted avg       0.82      0.89      0.85        18

Classification Report for Hierarchica

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
# Optionally convert hierarchical predictions back to original labels.
def _decode_hier_labels(encoder, codes, fallback="Unknown"):
    """Decode integer codes with ``encoder``, tolerating the -1 fallback code.

    The hierarchical prediction steps store -1 for a sample when no sub-model
    exists for its predicted parent class. ``LabelEncoder.inverse_transform``
    raises ``ValueError`` for codes it was not fitted on, so negative codes are
    mapped to ``fallback`` and only the remaining codes are passed to the encoder.

    Args:
        encoder: Fitted label encoder exposing ``inverse_transform``.
        codes: 1-D array-like of integer class codes (may contain -1).
        fallback: Label written for every negative code.

    Returns:
        Object array with one decoded label per input code.
    """
    codes = np.asarray(codes)
    decoded = np.full(codes.shape, fallback, dtype=object)
    known = codes >= 0
    if known.any():
        decoded[known] = encoder.inverse_transform(codes[known])
    return decoded


y_pred_hier_labels = pd.DataFrame({
    "Type2": _decode_hier_labels(le_type2, y_pred_hier[:, 0]),
    "Type3": _decode_hier_labels(le_type3, y_pred_hier[:, 1]),
    "Type4": _decode_hier_labels(le_type4, y_pred_hier[:, 2])
})
print("\nPredicted Labels (Hierarchical Modelling):")
y_pred_hier_labels


Predicted Labels (Hierarchical Modelling):


Unnamed: 0,Type2,Type3,Type4
0,Problem/Fault,Coupon/Gifts/Points Issues,Can't use or acquire
1,Problem/Fault,Gallery-Install/Upgrade,Can't install Apps
2,Problem/Fault,Third Party APPs,Refund
3,Problem/Fault,Coupon/Gifts/Points Issues,Cooperated campaign issue
4,Problem/Fault,Gallery-Install/Upgrade,Other download/install/update issue
5,Suggestion,VIP / Offers / Promotions,Offers / Vouchers / Promotions
6,Problem/Fault,Gallery-Install/Upgrade,Can't install Apps
7,Problem/Fault,Gallery-Install/Upgrade,Can't install Apps
8,Suggestion,VIP / Offers / Promotions,Offers / Vouchers / Promotions
9,Problem/Fault,Third Party APPs,Refund
