In [None]:
pip install -U scikit-learn
pip install stanza
pip install transformers
pip install sentencepiece

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import re
from sklearn.model_selection import train_test_split
import stanza
from stanza.pipeline.core import DownloadMethod
from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer


class Configurations:
    """Column names shared by the loading, cleaning and modelling steps."""

    # Free-text columns that are cleaned and vectorised.
    INTERACTION_CONTENT_COL = 'Interaction content'
    TICKET_SUMMARY_COL = 'Ticket Summary'

    # Label columns: GROUPED_COL partitions the data, CLASS_COLUMN is the
    # column copied into "y", TYPE_COLUMNS lists the label levels in order.
    GROUPED_COL = 'y1'
    CLASS_COLUMN = 'y2'
    TYPE_COLUMNS = [CLASS_COLUMN, 'y3', 'y4']


# Default input files; pass other paths to load_dataset() to run elsewhere.
_DEFAULT_CSV_PATHS = (
    "C:/Users/user/Desktop/Engineering and Evaluating Artificial Intelligence/AppGallery.csv",
    "C:/Users/user/Desktop/Engineering and Evaluating Artificial Intelligence/Purchasing.csv",
)

# The raw files name the label columns "Type N"; the rest of the code uses "yN".
_TYPE_COLUMN_RENAMES = {'Type 1': 'y1', 'Type 2': 'y2', 'Type 3': 'y3', 'Type 4': 'y4'}


def _read_ticket_csv(path):
    """Read one ticket CSV and rename its "Type N" columns to "yN"."""
    frame = pd.read_csv(path, skipinitialspace=True)
    return frame.rename(columns=_TYPE_COLUMN_RENAMES)


def load_dataset(csv_paths=_DEFAULT_CSV_PATHS):
    """Load and merge the ticket CSV files into one labelled frame.

    Args:
        csv_paths: Paths of the CSV files to merge. Defaults to the two
            files the script was written for.

    Returns:
        A new DataFrame with the text columns cast to ``str``, a ``"y"``
        column copied from ``Configurations.CLASS_COLUMN``, and only the rows
        whose ``"y"`` is neither missing nor empty.

    Raises:
        ValueError: If ``csv_paths`` is empty (raised by ``pd.concat``).
    """
    frames = [_read_ticket_csv(path) for path in csv_paths]
    # ignore_index avoids duplicate row labels across the source files, which
    # would make any later label-based alignment ambiguous.
    merged = pd.concat(frames, ignore_index=True)

    # Cast to unicode so the regex-based cleaning never sees a float NaN;
    # missing cells become the literal text "nan".
    for column in (Configurations.INTERACTION_CONTENT_COL, Configurations.TICKET_SUMMARY_COL):
        merged[column] = merged[column].values.astype('U')

    merged["y"] = merged[Configurations.CLASS_COLUMN]
    has_label = merged["y"].notna() & (merged["y"] != '')
    # Return an independent copy so later column assignments do not act on a
    # slice of the merged frame.
    return merged.loc[has_label].copy()


# Signature / company boilerplate appended to support replies, per language.
# NOTE(review): the second french/spanish/italian/portuguese entries start with
# the German article "Die " — this looks like a copy-paste slip, but the
# patterns are kept as-is because the real data is not visible here; confirm.
_CUSTOMER_SUPPORT_TEMPLATES = {
    "english": [
        r"(?:Aspiegel|\*\*\*\*\*\(PERSON\)) Customer Support team\,?",
        r"(?:Aspiegel|\*\*\*\*\*\(PERSON\)) SE is a company incorporated under the laws of Ireland with its headquarters in Dublin, Ireland\.?",
        r"(?:Aspiegel|\*\*\*\*\*\(PERSON\)) SE is the provider of Huawei Mobile Services to Huawei and Honor device owners in (?:Europe|\*\*\*\*\*\(LOC\)), Canada, Australia, New Zealand and other countries\.?",
    ],
    "german": [
        r"(?:Aspiegel|\*\*\*\*\*\(PERSON\)) Kundenservice\,?",
        r"Die (?:Aspiegel|\*\*\*\*\*\(PERSON\)) SE ist eine Gesellschaft nach irischem Recht mit Sitz in Dublin, Irland\.?",
        r"(?:Aspiegel|\*\*\*\*\*\(PERSON\)) SE ist der Anbieter von Huawei Mobile Services für Huawei- und Honor-Gerätebesitzer in Europa, Kanada, Australien, Neuseeland und anderen Ländern\.?",
    ],
    "french": [
        r"L'équipe d'assistance à la clientèle d'Aspiegel\,?",
        r"Die (?:Aspiegel|\*\*\*\*\*\(PERSON\)) SE est une société de droit irlandais dont le siège est à Dublin, en Irlande\.?",
        r"(?:Aspiegel|\*\*\*\*\*\(PERSON\)) SE est le fournisseur de services mobiles Huawei aux propriétaires d'appareils Huawei et Honor en Europe, au Canada, en Australie, en Nouvelle-Zélande et dans d'autres pays\.?",
    ],
    "spanish": [
        r"(?:Aspiegel|\*\*\*\*\*\(PERSON\)) Soporte Servicio al Cliente\,?",
        r"Die (?:Aspiegel|\*\*\*\*\*\(PERSON\)) es una sociedad constituida en virtud de la legislación de Irlanda con su sede en Dublín, Irlanda\.?",
        r"(?:Aspiegel|\*\*\*\*\*\(PERSON\)) SE es el proveedor de servicios móviles de Huawei a los propietarios de dispositivos de Huawei y Honor en Europa, Canadá, Australia, Nueva Zelanda y otros países\.?",
    ],
    "italian": [
        r"Il tuo team ad (?:Aspiegel|\*\*\*\*\*\(PERSON\)),?",
        r"Die (?:Aspiegel|\*\*\*\*\*\(PERSON\)) SE è una società costituita secondo le leggi irlandesi con sede a Dublino, Irlanda\.?",
        r"(?:Aspiegel|\*\*\*\*\*\(PERSON\)) SE è il fornitore di servizi mobili Huawei per i proprietari di dispositivi Huawei e Honor in Europa, Canada, Australia, Nuova Zelanda e altri paesi\.?",
    ],
    "portuguese": [
        r"(?:Aspiegel|\*\*\*\*\*\(PERSON\)) Customer Support team,?",
        r"Die (?:Aspiegel|\*\*\*\*\*\(PERSON\)) SE é uma empresa constituída segundo as leis da Irlanda, com sede em Dublin, Irlanda\.?",
        r"(?:Aspiegel|\*\*\*\*\*\(PERSON\)) SE é o provedor de Huawei Mobile Services para Huawei e Honor proprietários de dispositivos na Europa, Canadá, Austrália, Nova Zelândia e outros países\.?",
    ],
}

# One alternation over every boilerplate sentence, compiled once.
_CUSTOMER_SUPPORT_RE = re.compile(
    "|".join(
        pattern
        for patterns in _CUSTOMER_SUPPORT_TEMPLATES.values()
        for pattern in patterns
    )
)

# Markers that separate the individual messages inside one interaction.
_SPLITTING_RE = re.compile(
    "|".join(
        (
            r"(From\s?:\s?xxxxx@xxxx.com Sent\s?:.{30,70}Subject\s?:)",  # mail header
            r"(On.{30,60}wrote:)",                                       # quoted reply
            r"(Re\s?:|RE\s?:)",                                          # reply prefix
            r"(\*\*\*\*\*\(PERSON\) Support issue submit)",              # form submit
            r"(\s?\*\*\*\*\*\(PHONE\))*$",                               # trailing phones
        )
    )
)


def _unique_segments(content: str, seen: set) -> str:
    """Return the segments of ``content`` not yet in ``seen``, updating ``seen``.

    The text is split on the message markers. Because the markers are
    capturing groups, ``split`` also yields the marker text itself (and
    ``None`` for groups that did not take part); the second pass blanks those
    out so only message bodies remain.
    """
    fresh = []
    for segment in _SPLITTING_RE.split(content):
        if segment is None:
            continue
        segment = _SPLITTING_RE.sub("", segment.strip())
        segment = _CUSTOMER_SUPPORT_RE.sub("", segment.strip())
        if segment and segment not in seen:
            seen.add(segment)
            fresh.append(segment + "\n")
    return ' '.join(fresh)


def remove_duplicates(dataset: pd.DataFrame):
    """Drop message segments already seen earlier in the same ticket.

    Rows are visited in frame order. For every "Ticket id" a set of already
    emitted segments is kept, so quoted history repeated in later
    interactions of the ticket is removed, together with support-signature
    boilerplate.

    Rows without a ticket id are de-duplicated on their own instead of having
    their content blanked, and non-string content is treated as empty text.

    Args:
        dataset: Frame with a "Ticket id" column and the interaction-content
            column. The content column is overwritten in place.

    Returns:
        The frame with the de-duplicated interaction content.
    """
    content_col = Configurations.INTERACTION_CONTENT_COL

    seen_by_ticket = {}
    deduplicated_content = []
    for ticket_id, content in zip(dataset["Ticket id"], dataset[content_col]):
        if pd.isna(ticket_id):
            # No ticket to share history with: only de-duplicate within the row.
            seen = set()
        else:
            seen = seen_by_ticket.setdefault(ticket_id, set())
        text = content if isinstance(content, str) else ""
        deduplicated_content.append(_unique_segments(text, seen))

    dataset[content_col] = deduplicated_content
    return dataset


def remove_noise(dataframe: pd.DataFrame):
    """Lower-case and strip boilerplate from the summary and content columns.

    The ticket summary loses reply/forward prefixes and placeholder values;
    the interaction content loses mail headers, dates, greetings, courtesy
    phrases, masked placeholders, digits and punctuation. Finally only rows
    whose group label occurs more than 10 times are kept.

    Args:
        dataframe: Frame holding the summary, content and ``y1`` columns. The
            two text columns are overwritten in place.

    Returns:
        A copy of the rows that belong to sufficiently frequent groups.
    """
    summary_col = Configurations.TICKET_SUMMARY_COL
    content_col = Configurations.INTERACTION_CONTENT_COL
    group_col = Configurations.GROUPED_COL

    # Word boundaries keep the short prefixes and the "null"/"nan"
    # placeholders from eating the inside of real words ("error:", "finance").
    summary_noise = (
        r"\b(?:sv|wg|ynt|fwd?|r|re)\s*:"
        r"|[|]"
        r"|aspiegel support issue submit"
        r"|\b(?:null|nan)\b"
        r"|(?:bonus place my )?support.pt 自动回复:"
    )
    dataframe[summary_col] = (
        dataframe[summary_col]
        .str.lower()
        .replace(summary_noise, " ", regex=True)
        .replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

    # Applied one after another, in this order: longer phrases must be
    # removed before the shorter phrases they contain.
    content_noise_patterns = [
        r"(from :)|(subject :)|(sent :)|\b(?:r|re)\s*:",
        r"\b(january|february|march|april|may|june|july|august|september|october|november|december)\b",
        # Bounded so that e.g. "market" or "decide" are not cut apart.
        r"\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b",
        r"\b(monday|tuesday|wednesday|thursday|friday|saturday|sunday)\b",
        # Clock times such as 12:30 or 12.30.
        r"\d{2}[:.]\d{2}",
        # Masked e-mail addresses and masked entities such as "*****(person)".
        r"(xxxxx@xxxx\.com)|(\*{5}\([a-z]+\))",
        r"dear ((customer)|(user))",
        r"dear",
        r"\b(?:hello|hallo|hi there|hi)\b",
        r"good morning",
        r"thank you for your patience(?: (?:during (?:our )?investigation|and cooperation))?",
        r"thank you for contacting us",
        r"thank you for your availability",
        r"thank you for providing us this information",
        r"thank you for contacting",
        r"thank you for reaching us(?: back)?",
        r"thank you for patience",
        r"thank you for (?:your )?reply",
        r"thank you for (?:your )?response",
        r"thank you for (?:your )?cooperation",
        r"thank you for providing us with more information",
        r"thank you very kindly",
        r"thank you( very much)?",
        r"i would like to follow up on the case you raised on the date",
        r"i will do my very best to assist you",
        r"in order to give you the best solution",
        r"could you please clarify your request with following information:",
        r"in this matter",
        r"we hope you(( are)|('re)) doing ((fine)|(well))",
        r"i would like to follow up on the case you raised on",
        r"we apologize for the inconvenience",
        r"sent from my huawei (cell )?phone",
        r"original message",
        r"customer support team",
        r"(aspiegel )?se is a company incorporated under the laws of ireland with its headquarters in dublin, ireland.",
        r"(aspiegel )?se is the provider of huawei mobile services to huawei and honor device owners in",
        r"canada, australia, new zealand and other countries",
        r"\d+",
        # NOTE(review): this also strips every non-ASCII letter, although
        # main() translates the text only after this step — confirm intent.
        r"[^0-9a-zA-Z]+",
        # Isolated single characters left over from the steps above.
        r"(\s|^).{1}(\s|$)",
    ]

    content = dataframe[content_col].str.lower()
    for pattern in content_noise_patterns:
        content = content.replace(pattern, " ", regex=True)
    dataframe[content_col] = content.replace(r'\s+', ' ', regex=True).str.strip()

    group_counts = dataframe[group_col].value_counts()
    frequent_groups = group_counts[group_counts > 10].index
    # Copy so callers can assign columns without writing into a slice.
    return dataframe.loc[dataframe[group_col].isin(frequent_groups)].copy()


# Language codes reported by the detector that the translation model does not
# know, mapped to the closest code that the original code chose for them.
_DETECTED_LANGUAGE_FIXES = {"fro": "fr", "la": "it", "nn": "no", "kmr": "tr"}


def translate_text_to_english(text_list: list[str]):
    """Translate every non-English text in ``text_list`` to English.

    The language of each text is detected with stanza's ``langid`` processor
    and non-English texts are translated with the M2M100 (418M) model.

    Empty texts and texts detected as English are returned unchanged, and so
    are texts whose detected language the tokenizer does not support.
    Identical texts are translated only once per call.

    Args:
        text_list: Texts to translate.

    Returns:
        A list of the same length and order as ``text_list``.
    """
    translation_model = "facebook/m2m100_418M"
    # Load the model once; the previous extra text2text pipeline loaded a
    # second, unused copy of the same weights.
    model = M2M100ForConditionalGeneration.from_pretrained(translation_model)
    tokenizer = M2M100Tokenizer.from_pretrained(translation_model)
    english_token_id = tokenizer.get_lang_id("en")

    language_detector = stanza.Pipeline(lang="multilingual", processors="langid", download_method=DownloadMethod.REUSE_RESOURCES)

    translations = {}  # text -> translation, so duplicates cost nothing
    translated_text_list = []
    for text in text_list:
        if not text:
            translated_text_list.append(text)
            continue

        if text not in translations:
            detected = language_detector(text).lang
            source_lang = _DETECTED_LANGUAGE_FIXES.get(detected, detected)

            translation = text
            if source_lang != "en":
                try:
                    # The tokenizer looks the code up in its language table
                    # and raises KeyError for codes it does not know.
                    tokenizer.src_lang = source_lang
                except KeyError:
                    print(f"Unsupported source language '{source_lang}': leaving text untranslated")
                else:
                    # Truncate so over-long texts do not exceed the model's
                    # maximum input length.
                    encoded = tokenizer(text, return_tensors="pt", truncation=True)
                    generated = model.generate(**encoded, forced_bos_token_id=english_token_id)
                    translation = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
            translations[text] = translation

        translated_text_list.append(translations[text])
    return translated_text_list


def generate_tfidf_embeddings(dataframe: pd.DataFrame):
    """Return a dense TF-IDF matrix with one row per row of ``dataframe``.

    Each document is the ticket summary followed by a space and the
    interaction content. The vectorizer is fitted on ``dataframe`` itself.
    """
    summaries = dataframe[Configurations.TICKET_SUMMARY_COL]
    interactions = dataframe[Configurations.INTERACTION_CONTENT_COL]
    documents = summaries + ' ' + interactions

    vectorizer = TfidfVectorizer(max_features=2000, min_df=4, max_df=0.90)
    return vectorizer.fit_transform(documents).toarray()


class Dataset:
    """Stratified train/test split over the classes with at least 3 records.

    When no class is frequent enough a message is printed and the instance is
    left "skipped": ``train_embeddings`` and every other split attribute stay
    ``None``, which callers use to skip the group.
    """

    def __init__(self, embeddings: np.ndarray, dataframe: pd.DataFrame):
        """Build the split.

        Args:
            embeddings: Feature matrix with one row per row of ``dataframe``.
            dataframe: Frame whose ``"y"`` column holds the labels.
        """
        # Define every attribute up front so the getters also work on a
        # skipped instance instead of raising AttributeError.
        self.embeddings = embeddings
        self.labels = None
        self.unique_classes = None
        self.train_embeddings = None
        self.test_embeddings = None
        self.train_labels = None
        self.test_labels = None

        labels = dataframe["y"].to_numpy()
        label_series = pd.Series(labels)
        label_counts = label_series.value_counts()
        frequent_labels = label_counts[label_counts >= 3].index

        if len(frequent_labels) < 1:
            print("None of the classes have more than 3 records: Skipping ...")
            return

        is_frequent = label_series.isin(frequent_labels).to_numpy()
        filtered_labels = labels[is_frequent]
        filtered_embeddings = embeddings[is_frequent]

        # Size the test set as 20% of ALL rows, expressed as a fraction of
        # the rows that survived the frequency filter.
        # NOTE(review): this fraction reaches 1.0 or more when at most 20% of
        # the rows belong to frequent classes; train_test_split is expected
        # to reject that — confirm the intended behaviour.
        new_test_size = embeddings.shape[0] * 0.2 / filtered_embeddings.shape[0]
        (
            self.train_embeddings,
            self.test_embeddings,
            self.train_labels,
            self.test_labels
        ) = train_test_split(filtered_embeddings, filtered_labels, test_size=new_test_size, random_state=0, stratify=filtered_labels)
        self.labels = filtered_labels
        self.unique_classes = frequent_labels

    def get_labels(self):
        """Return the labels of the frequent classes, or None if skipped."""
        return self.labels

    def get_train_embeddings(self):
        """Return the training feature rows, or None if skipped."""
        return self.train_embeddings

    def get_test_embeddings(self):
        """Return the test feature rows, or None if skipped."""
        return self.test_embeddings

    def get_train_labels(self):
        """Return the training labels, or None if skipped."""
        return self.train_labels

    def get_test_labels(self):
        """Return the test labels, or None if skipped."""
        return self.test_labels

    def get_embeddings(self):
        """Return the full, unfiltered feature matrix passed to __init__."""
        return self.embeddings


class RandomForestModel:
    """Thin wrapper around a RandomForestClassifier with fixed settings."""

    def __init__(self, model_name, embeddings):
        """Store the name and embeddings and create the unfitted classifier."""
        self.predictions = None
        self.model = RandomForestClassifier(
            n_estimators=1000,
            random_state=0,
            class_weight='balanced_subsample',
        )
        self.embeddings = embeddings
        self.model_name = model_name

    def train(self, dataset):
        """Fit the classifier on the training split of ``dataset``."""
        self.model.fit(dataset.get_train_embeddings(), dataset.get_train_labels())

    def predict(self, test_embeddings: np.ndarray):
        """Predict labels for ``test_embeddings`` and keep them on the instance."""
        predicted = self.model.predict(test_embeddings)
        self.predictions = predicted

    def print_results(self, dataset):
        """Print a classification report of the stored predictions against
        the test labels of ``dataset``."""
        report = classification_report(dataset.get_test_labels(), self.predictions)
        print(report)

    def data_transform(self):
        """Placeholder; does nothing and returns None."""
        return None


def model_training_and_prediction(dataset, dataframe, model_name):
    """Train a random forest on ``dataset`` and print its test-set report.

    ``dataframe`` and ``model_name`` are accepted but not used by the body.
    """
    print("RandomForest")
    forest = RandomForestModel("RandomForest", dataset.get_embeddings())
    forest.train(dataset)

    held_out = dataset.get_test_embeddings()
    forest.predict(held_out)
    forest.print_results(dataset)


def _embed_or_none(frame: pd.DataFrame):
    """Return TF-IDF embeddings for ``frame``, or None if they cannot be built."""
    try:
        return generate_tfidf_embeddings(frame)
    except ValueError as error:
        # The vectorizer rejects frames too small for its min_df/max_df pruning.
        print(f"Could not build TF-IDF features ({error}): Skipping ...")
        return None


def _chained_label_arrays(frame: pd.DataFrame, columns: list[str]):
    """Join the label ``columns`` of every row into one chained label.

    Returns:
        ``(labels, complete)``: the joined labels as an object array and a
        boolean array marking the rows where none of ``columns`` is missing.
    """
    complete = frame[columns].notna().all(axis=1).to_numpy()
    rows = frame[columns].astype(str).to_numpy()
    labels = np.array([" | ".join(row) for row in rows], dtype=object)
    return labels, complete


def _fit_and_report(embeddings: np.ndarray, labels: np.ndarray) -> None:
    """Train a random forest on ``labels`` and print its test-set report.

    The split, the predictions and the report all come from the same Dataset,
    so predicted and true labels always line up.
    """
    try:
        dataset = Dataset(embeddings, pd.DataFrame({"y": labels}))
    except ValueError as error:
        # train_test_split rejects splits that are too small to stratify.
        print(f"Could not create a train/test split ({error}): Skipping ...")
        return

    if dataset.train_embeddings is None:
        return

    model = RandomForestModel("RandomForest", dataset.get_embeddings())
    model.train(dataset)
    model.predict(dataset.get_test_embeddings())
    model.print_results(dataset)


def _report_hierarchy(group_dataframe: pd.DataFrame) -> None:
    """Model y3 inside every y2 class, then y4 inside every y3 class."""
    for y2_class in group_dataframe["y2"].unique():
        print(f"y2 Class: {y2_class}")
        y2_frame = group_dataframe[group_dataframe["y2"] == y2_class]
        y2_embeddings = _embed_or_none(y2_frame)
        if y2_embeddings is None:
            continue

        # Work positionally: the embeddings are aligned with the rows of
        # y2_frame by position, not by index label.
        y3_labels = y2_frame["y3"].to_numpy()
        y4_labels = y2_frame["y4"].to_numpy()
        has_y3 = y2_frame["y3"].notna().to_numpy()
        has_y4 = y2_frame["y4"].notna().to_numpy()

        _fit_and_report(y2_embeddings[has_y3], y3_labels[has_y3])

        for y3_class in pd.unique(y3_labels[has_y3]):
            print(f"y3 Class: {y3_class}")
            in_y3_class = (y2_frame["y3"] == y3_class).to_numpy() & has_y4
            _fit_and_report(y2_embeddings[in_y3_class], y4_labels[in_y3_class])


def main():
    """Run the pipeline: load, clean, translate, then evaluate per y1 group.

    For every group two designs are evaluated with random forests:

    * chained multi-outputs: one model per label prefix (y2, y2+y3,
      y2+y3+y4) whose target is the joined labels of the prefix;
    * hierarchical modelling: a y3 model for every y2 class and a y4 model
      for every y3 class inside it.

    Targets that cannot be split or vectorised print a message and are
    skipped instead of aborting the run.
    """
    dataframe = load_dataset()
    dataframe = remove_duplicates(dataframe)
    # Copy so the translated columns are not written into a slice.
    dataframe = remove_noise(dataframe).copy()
    for column in (Configurations.TICKET_SUMMARY_COL, Configurations.INTERACTION_CONTENT_COL):
        dataframe[column] = translate_text_to_english(dataframe[column].tolist())

    type_columns = Configurations.TYPE_COLUMNS
    combinations = [type_columns[:depth] for depth in range(1, len(type_columns) + 1)]

    for group_name, group_dataframe in dataframe.groupby(Configurations.GROUPED_COL):
        print(group_name)
        embeddings = _embed_or_none(group_dataframe)
        if embeddings is None:
            continue

        # Chained Multi-outputs
        print("Chained Multi-outputs:")
        for combination in combinations:
            print(f"Combination: {combination}")
            labels, complete = _chained_label_arrays(group_dataframe, combination)
            _fit_and_report(embeddings[complete], labels[complete])

        # Hierarchical Modeling
        print("Hierarchical Modeling:")
        _report_hierarchy(group_dataframe)

# Run the whole pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

  return self.fget.__get__(instance, owner)()
2024-04-26 17:36:41 INFO: Loading these models for language: multilingual ():
| Processor | Package |
-----------------------
| langid    | ud      |

2024-04-26 17:36:41 INFO: Using device: cpu
2024-04-26 17:36:41 INFO: Loading: langid
2024-04-26 17:36:42 INFO: Done loading processors!


KeyboardInterrupt: 