<a href="https://colab.research.google.com/github/ArturBudniak/Graph_TopoFilter/blob/main/Graph_TopoFilter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
pip install networkx==2.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting networkx==2.3
  Downloading networkx-2.3.zip (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: networkx
  Building wheel for networkx (setup.py) ... [?25l[?25hdone
  Created wheel for networkx: filename=networkx-2.3-py2.py3-none-any.whl size=1556009 sha256=7b669fe4bc202f1fd8e7c68e1679748ef4c97c43f0b3c8c6c5ad2840bafbb292
  Stored in directory: /root/.cache/pip/wheels/ff/62/9e/0ed2d25fd4f5761e2d19568cda0c32716556dfa682e65ecf64
Successfully built networkx
Installing collected packages: networkx
  Attempting uninstall: networkx
    Found existing installation: networkx 3.0
    Uninstalling networkx-3.0:
      Successfully uninstalled networkx-3.0
Successfully installed networkx-2.3


In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx

In [3]:
# Code to read csv file from Drive in Colaboratory:
# PyDrive is installed without a version pin and upgraded (-U) on every run.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
# auth comes from google.colab, so this cell presumably only works inside Colab.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default credentials to PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client later cells use to download the data set, the
# knowledge graph and the manually verified validation files.
drive = GoogleDrive(gauth)

In [4]:
from sklearn.model_selection import train_test_split
import math

In [5]:
# tensorflow-hub provides hub.KerasLayer, used by define_model_NLP for the text embedding.
!pip install tensorflow-hub
!pip install tensorflow-datasets

# A dependency of the preprocessing for BERT inputs
# NOTE(review): the model defined in this notebook uses the nnlm-en-dim50
# embedding, not BERT — confirm tensorflow-text (and tensorflow-datasets) are still needed.
!pip install -q -U tensorflow-text

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text as text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
import time
from random import randint
from sklearn.utils import shuffle

In [7]:
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [8]:
from sklearn.neighbors import kneighbors_graph
from statistics import mean 

# Functions

In [9]:
# predict top n most probable components with statistics

def predict_top_n_components (text, model_name, n, failure_ratio):
  """Return the n highest-scoring components for a single comment.

  Args:
    text: comment text; it is passed to the model as a one-element list.
    model_name: trained model exposing ``predict``.
    n: number of rows to return.
    failure_ratio: when truthy, every class probability is multiplied by
      ``components_failure_rate["Rate"]`` for that class before ranking.

  Returns:
    DataFrame with a single ``probability`` column, indexed by component
    label, sorted in descending order and cut to the first n rows.

  Reads the module-level ``df_cleaned`` and ``number_of_labels``.
  NOTE(review): ``components_failure_rate`` is not defined in the visible part
  of the notebook; it must exist before calling with failure_ratio=True.
  """
  y_proba = model_name.predict([text])

  # The one-hot label columns are the last `number_of_labels` columns of
  # df_cleaned (train_model_NLP slices its targets the same way), so the same
  # names serve both for the failure-rate lookup and for the result index.
  label_names = df_cleaned.columns[-number_of_labels:]

  if failure_ratio:
    # .loc with the full label list keeps the rates in class order and raises
    # KeyError when a label has no rate.
    y_proba[0] *= components_failure_rate["Rate"].loc[label_names].to_numpy()

  df_top_n_components = pd.DataFrame(y_proba[0], index=label_names, columns=['probability'])

  return df_top_n_components.sort_values(["probability"], ascending=False)[:n]

In [10]:
# create a new neural network for text classification

def define_model_NLP(number_of_1st_hidden_layer_nodes, number_of_2nd_hidden_layer_nodes):
  """Build and compile a text classifier on top of a frozen TF-Hub embedding.

  The network is: nnlm-en-dim50 embedding (not trainable) -> two ReLU dense
  layers of the requested widths -> softmax over ``number_of_labels`` classes
  (module-level value). It is compiled with Adam and categorical cross-entropy.

  Returns:
    The compiled ``tf.keras.Sequential`` model.
  """
  text_embedding = hub.KerasLayer(
      "https://tfhub.dev/google/nnlm-en-dim50/2",
      input_shape=[],
      dtype=tf.string,
      trainable=False,
  )

  classifier = tf.keras.Sequential([
      text_embedding,
      tf.keras.layers.Dense(number_of_1st_hidden_layer_nodes, activation='relu'),
      tf.keras.layers.Dense(number_of_2nd_hidden_layer_nodes, activation='relu'),
      tf.keras.layers.Dense(number_of_labels, activation='softmax'),
  ])

  classifier.compile(
      optimizer='adam',
      loss=tf.keras.losses.CategoricalCrossentropy(),
  )
  return classifier

In [11]:
# execute one epoch training of the neural network

def train_model_NLP (model,data_set):
  """Fit ``model`` for one epoch on an 80/20 split of ``data_set``.

  The split uses a fixed random_state of 42. Features are the
  "Customer Requested Comment" column; targets are the last
  ``number_of_labels`` columns (module-level value). The 20% part is used as
  validation data only.

  Returns:
    The same model object after fitting.
  """
  fit_part, holdout_part = train_test_split(data_set, test_size=0.2, random_state=42)

  def split_features_and_targets(frame):
    # X = comment text, y = trailing one-hot label columns
    return frame["Customer Requested Comment"], frame[frame.columns[-number_of_labels:]]

  X_fit, y_fit = split_features_and_targets(fit_part)
  holdout_pair = split_features_and_targets(holdout_part)

  model.fit(
      X_fit,
      y_fit,
      epochs=1,
      batch_size=512,
      validation_data=holdout_pair,
  )

  return model

In [12]:
# get the hidden representation of points (one before last layer of the neural network)

def get_latent_features(dataset, n):
  """Run the comments of ``dataset`` through the first layers of ``model``.

  Args:
    dataset: frame with a "Customer Requested Comment" column.
    n: layer cut-off; 0 keeps all layers, -1 drops the last layer, etc.

  Returns:
    The prediction of a new Sequential model assembled from the selected
    layers of the module-level ``model`` (the layer objects are reused).
  """
  # model.layers[:0] would be empty, so "all layers" needs its own case.
  kept_layers = model.layers[:] if n == 0 else model.layers[:n]

  truncated_model = tf.keras.Sequential()
  for layer in kept_layers:
    truncated_model.add(layer)

  truncated_model.compile(optimizer='adam', loss='categorical_crossentropy')

  return truncated_model.predict(dataset["Customer Requested Comment"])

In [13]:
# construct kNN graph over hidden representation of data points (taken from TopoFilter)

def construct_G():
  """Build the k-nearest-neighbour graph over the latent features.

  Reads the module-level ``x`` (feature matrix), ``k`` (number of neighbours)
  and ``S`` (data frame whose "Global Component Code Description" column
  supplies the node labels, matched to nodes by position).

  Returns:
    networkx graph whose node ``j`` carries ``label`` = label of row ``j`` of S.
  """
  adjacency = kneighbors_graph(x, k, mode='connectivity', include_self=False)

  # turn the sparse connectivity matrix into a graph
  knn_graph = nx.from_scipy_sparse_matrix(adjacency)

  # attach the class label of each row to the node with the same position
  node_labels = S["Global Component Code Description"].to_list()
  for node_id in range(len(knn_graph)):
    knn_graph.nodes[node_id]['label'] = node_labels[node_id]

  return knn_graph

In [14]:
# for points belonging to class i create a subgraph of G with points only from that class; cut edges between class i points and other classes points (TopoFilter)

def construct_Gi():
  """Return a copy of ``G`` restricted to the nodes of class ``i``.

  Reads the module-level ``G``, ``list_of_labels`` and ``i``. Removing all
  nodes of other classes also removes every edge that touched them.
  """
  wanted_label = list_of_labels[i]

  # every node whose label differs from the wanted one has to go
  other_class_nodes = [
      node for node, attrs in G.nodes(data=True) if attrs['label'] != wanted_label
  ]

  class_graph = G.copy()
  class_graph.remove_nodes_from(other_class_nodes)

  return class_graph


In [15]:
# for points belonging to class i create a subgraph of G with points only from that class, cut edges between [class i and its neighbor points (according to knowledge graph KG)] and other classes points

def construct_Gi_k_hops(k):
  """Return a copy of ``G`` restricted to class ``i`` and its k-hop neighbour classes.

  Args:
    k: maximum distance (in edges of the knowledge graph ``appliance_G``) from
      class ``i`` for another class to be kept. Note that this parameter
      shadows the module-level ``k`` used by construct_G for the kNN graph.

  Returns:
    Copy of the module-level ``G`` containing only nodes whose ``label`` is
    class ``i`` or a class at most k hops away from it in ``appliance_G``.

  Raises:
    networkx.NodeNotFound: if the label of class ``i`` is not a node of
      ``appliance_G``.

  Reads the module-level ``G``, ``appliance_G``, ``list_of_labels`` and ``i``.
  """
  root_label = list_of_labels[i]

  # A single breadth-first search bounded at k hops yields every class within
  # distance <= k (the root itself is returned at distance 0). This avoids a
  # separate has_path + shortest_path_length query for every node of the
  # knowledge graph.
  allowed_labels = set(
      nx.single_source_shortest_path_length(appliance_G, root_label, cutoff=k)
  )
  allowed_labels.add(root_label)

  # remove nodes not belonging to the given class or its k-hop neighbours
  foreign_nodes = [
      node for node, attrs in G.nodes(data=True) if attrs['label'] not in allowed_labels
  ]

  Gi = G.copy()
  Gi.remove_nodes_from(foreign_nodes)

  return Gi

In [16]:
# select points of Gi or Gi_k_hops that belong to the largest connected component (samples expected to be clean)

def construct_largest_connected_component():
  """Return the class-``i`` nodes of the largest connected component of ``Gi``.

  Reads the module-level ``Gi``, ``list_of_labels`` and ``i``. An empty ``Gi``
  gives an empty graph. Otherwise the result is a copy of ``Gi`` reduced first
  to its largest connected component and then to the nodes labelled with
  class ``i`` (neighbour classes admitted by a k-hop ``Gi`` are dropped again).
  """
  if len(Gi) == 0:
    return nx.Graph()

  biggest_component = max(nx.connected_components(Gi), key=len)
  target_label = list_of_labels[i]

  Qi = Gi.copy()

  # keep only the largest connected component
  Qi.remove_nodes_from([node for node in Gi.nodes if node not in biggest_component])

  # drop nodes of other classes (present when Gi was built with k-hop neighbours)
  Qi.remove_nodes_from(
      [node for node in list(Qi.nodes) if Qi.nodes[node]['label'] != target_label]
  )

  return Qi

In [17]:
# add new samples (from lcc) to the clean data subset C

def update_C():
  """Return the rows of ``S`` selected by the nodes of ``Qi``.

  Reads the module-level ``Qi`` and ``S``; node ids are used as positional
  row numbers of ``S``.

  Returns:
    A DataFrame with the selected rows, or an empty list when ``Qi`` has no nodes.
  """
  if len(Qi) == 0:
    return []

  selected_rows = list(Qi.nodes)
  return pd.DataFrame(S.iloc[selected_rows])

# Data load

In [18]:
# download service calls data set from Google Drive, not revealed
# The file is fetched by its Drive ID through the authenticated `drive` client,
# written to the local working directory as 'filename.csv', then parsed.

downloaded = drive.CreateFile({'id':"1Nf4qVI0hWFP0CQSck_68GdR_QfWMYZQN"})
downloaded.GetContentFile('filename.csv')
df = pd.read_csv('filename.csv')

In [19]:
# preview five random rows of the text column (X) and the label column (Y);
# sample() is unseeded, so the rows shown differ between runs
df[["Customer Requested Comment", "Global Component Code Description"]].sample(5)

Unnamed: 0,Customer Requested Comment,Global Component Code Description
500623,GAS RANGE OVEN DOOR FELL OFF,FASTENERS/CLIPS/SCREWS/BOLTS (9030)
755512,CASH ON DELIVERY_OVEN NOT IGNITING,"IGNITER ASSEMBLY (DRYER, OVEN) (5120)"
703461,THE CONVECTION FAN NOT WORKING AND SHUTS OVEN OFF,PERFORMANCE COMPLAINT (8220)
49907,UNIT WONT IGNITE ON BAKE.,"IGNITER ASSEMBLY (DRYER, OVEN) (5120)"
828860,OVEN NOT HEATING,CONTROL BOARD / SENSOR BOARD (5210)


# Data cleaning

In [20]:
# make a copy of dataframe read from csv file
# (all cleaning steps below work on df_cleaned; the raw frame df stays as loaded)
df_cleaned = df.copy()

In [21]:
# drop every row that has a missing value in any column
df_cleaned = df_cleaned.dropna()

In [22]:
# reshuffle
# No random_state is passed, so the row order differs on every run.
df_cleaned = shuffle(df_cleaned)

In [23]:
# lowercase both free-text comment columns
df_cleaned[["Customer Requested Comment", "Service Technician Comment"]] = (
    df_cleaned[["Customer Requested Comment", "Service Technician Comment"]]
    .apply(lambda column: column.str.lower())
)

In [24]:
# replace underscores in the customer comment with spaces
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection is chained assignment and is not guaranteed to update
# df_cleaned itself.
df_cleaned["Customer Requested Comment"] = df_cleaned["Customer Requested Comment"].replace(to_replace = '_',value = ' ',regex=True)

In [25]:
# choose a model of the home appliance: keep only rows with platform code "WO"
df_cleaned = df_cleaned.loc[df_cleaned["DPL Platform Code"].eq("WO")]

In [26]:
# remove multiple repairs assigned to the same ID number
threshold = 2
my_list = df_cleaned["Service Claim Id"].value_counts()

# number of claim IDs occurring at least `threshold` times; counted in one
# vectorized step so that no loop variable named `i` is left behind (the graph
# helper functions read a module-level `i` as the class index).
# NOTE(review): this count is not used anywhere else in the visible notebook.
number_of_multiple_repairs = int((my_list >= threshold).sum())

# remove all duplicates: keep=False drops every row whose claim ID occurs more than once
df_cleaned = df_cleaned.drop_duplicates(subset=['Service Claim Id'], keep=False)

In [27]:
# delete labels that occur in at most `threshold` rows (here: 100 rows or fewer)

threshold = 100  # labels whose row count is less than or equal to this are removed
vc = df_cleaned["Global Component Code Description"].value_counts()
vals_to_remove = vc.index[(vc <= threshold).to_numpy()].values

# keep only the rows whose label is not among the rare ones
df_cleaned = df_cleaned[~df_cleaned["Global Component Code Description"].isin(vals_to_remove)]

In [28]:
#remove labels which cannot be represented as components in graph representing the whole product

list_of_labels_to_be_removed = ["INSTRUCTION FOR USE (9910)", "CUSTOMER REFUSED REPAIR (9965)","MISC. EXTERNAL PARTS/HARDWARE (8370)","GENERAL SAFETY QUESTIONS (9975)", "CONNECTIVITY CUSTOMER INSTRUCT (8350)", "PERFORMANCE COMPLAINT (8220)"]

# One isin() filter replaces the per-label in-place drop loop; it also avoids
# leaving the module-level name `i` (read by the graph helper functions as the
# class index) bound to a label string.
df_cleaned = df_cleaned[~df_cleaned["Global Component Code Description"].isin(list_of_labels_to_be_removed)]

In [29]:
# number of classes

list_of_components = df_cleaned["Global Component Code Description"].value_counts()

# value_counts() has one entry per distinct label, so its length is the class
# count; no counting loop (and no leftover `i` / `w` variables) is needed.
number_of_labels = len(list_of_components)
print("number of labels ",number_of_labels)

# distinct labels in order of first appearance; list_of_labels[i] is what the
# graph helper functions use as the label of class i
list_of_labels = df_cleaned["Global Component Code Description"].unique()

number of labels  46


In [30]:
# keep only X = Customer Requested Comment and Y = Global Component Code Description

df_cleaned = df_cleaned.loc[:, ["Customer Requested Comment", "Global Component Code Description"]]

In [31]:
# one-hot encode the categorical Y (Components); the indicator columns are
# added to the right of the two existing columns
df_cleaned = df_cleaned.join(pd.get_dummies(df_cleaned["Global Component Code Description"]))

In [32]:
# map each position in the one-hot encoding to its label
# (columns 0 and 1 are the comment and the original label column)

values = df_cleaned.columns.values[2:]
keys = list(range(len(values)))
dict_labels = {position: label for position, label in zip(keys, values)}


# Graph representation

In [33]:
# Read the knowledge graph from file; the graph shows the home appliance
# construction (relationships between classes, not revealed).
# GraphML source: https://drive.google.com/file/d/1Na4-HK78Zb5b9cJ1ME6TcgU1Jd4JcgB2/view?usp=share_link
# construct_Gi_k_hops looks class labels up as nodes of this graph, so the node
# identifiers are expected to be the component label strings — verify against the file.

downloaded = drive.CreateFile({'id':"1Na4-HK78Zb5b9cJ1ME6TcgU1Jd4JcgB2"})
downloaded.GetContentFile('appliance.graphml')
appliance_G = nx.read_graphml("appliance.graphml")

# Validate

In [None]:
# Google Drive IDs of the manually verified files and, position by position,
# the component label each file was verified for.
list_of_manual_files = ["11s1YfHiXmBGH8PQPwkznG9MPSV_4iolt","1QgsbMb_U47R5Jn3YcQaBwkxbKWRYjvJt","1yA6LPtrTIFKN9rDDug_Ur_dPRUwYcVoT","1IMvwyxBZAdk4LHDd1_dqqDFnHXJCcT3r", "1iWMdB8LIl1AkTYrioHvyCAUS5K-GyrwW", "1b6A4cZb8fFz3m61t7vgzXkswg0Qx56bU" ] #"1R33d6JencbhfsgABTrE76gDM-dnCM_he",
list_of_manual_components = ["HOOK/DOOR-LATCH (1310)", "LAMP (6520)","HINGES (1880)", "THERMOSTAT (5550)", "MOTOR-FAN, CIRCULATION (4400)", "PRODUCT FUSE (5630)"]

TEXT_COLUMN = "Customer Requested Comment"
LABEL_COLUMN = "Global Component Code Description"

#a part of the dataset for the evaluation
fraction = 0.2
print("fraction of the data set ",fraction)

m = 15                    # first round t in which the clean subset C is collected
N = 20                    # upper bound of the training loop (see note at the loop)
gamma = number_of_labels  # number of classes iterated over in every filtering round
k = 10                    # number of neighbours; read by construct_G as a global
#zeta = 0.0

#graph directed or undirected
appliance_G = appliance_G.to_undirected()


def _f1_from_counts(true_positive, false_positive, false_negative):
  """Return F1 = TP / (TP + 0.5 * (FP + FN)); 0.0 when all three counts are zero."""
  denominator = true_positive + 0.5 * (false_positive + false_negative)
  return true_positive / denominator if denominator > 0 else 0.0


def _confusion_counts(verified, selected_comments):
  """Count (TP, FP, TN, FN) for one manually verified component file.

  A row of ``verified`` counts as selected when its comment occurs in
  ``selected_comments``; it counts as truly belonging to the class when its
  "true label" equals 1.
  """
  selected = verified[TEXT_COLUMN].isin(selected_comments)
  truly_in_class = verified["true label"] == 1
  return (
      int((selected & truly_in_class).sum()),    # true positive
      int((selected & ~truly_in_class).sum()),   # false positive
      int((~selected & ~truly_in_class).sum()),  # true negative
      int((~selected & truly_in_class).sum()),   # false negative
  )


# The algorithm loop stays at module level on purpose: construct_G,
# construct_Gi_k_hops, construct_largest_connected_component, update_C and
# get_latent_features read S, x, k, G, Gi, Qi, i and model as globals.
for k_hops in range(0,3):

  ####################################################
  #perform the TopoFilter / Graph TopoFilter algorithm
  print("#########")
  print("k_hops ",k_hops)
  # a fresh, unseeded sample of the data is drawn for every k_hops setting
  S = shuffle(df_cleaned).sample(frac = fraction).reset_index(drop=True)
  model = define_model_NLP(200,200)

  #3. Initialize
  C = pd.DataFrame(columns=[TEXT_COLUMN, LABEL_COLUMN])
  S_hat = S

  #4.
  # NOTE(review): range(1, N) runs t = 1 .. N-1; confirm whether N rounds were intended.
  for t in range(1,N):

    #5. Train network on S_hat
    model = train_model_NLP(model, S_hat)

    #6.
    if t >= m:
      #7. Extract feature vector x from training data S
      x = get_latent_features(S,-1)
      y = S[LABEL_COLUMN]

      #8. Compute k-NN graph G over x
      G = construct_G()

      #9.
      newly_selected = []
      for i in range (0,gamma):
        Gi = construct_Gi_k_hops(k_hops)

        #11. Compute the largest connected component
        Qi = construct_largest_connected_component()

        #12. C ← C U Qi
        C_add = update_C()
        # update_C returns an empty list when Qi is empty; skip those
        if len(C_add) > 0:
          newly_selected.append(C_add)

      #13. end_for

      # One concat per round instead of DataFrame.append per class (append was
      # removed in pandas 2.0 and copies the whole frame on every call).
      # NOTE(review): rows selected in several rounds are added again each
      # round, so C and S_hat can hold the same sample more than once —
      # confirm this is intended for the union in step 12.
      C = pd.concat([C] + newly_selected, sort=False)

      #14. Find outliers O within C based on ζ-filtering; update C ← C\O
      #C = remove_outliers(C)

      #15. S_hat ← C
      S_hat = C

    #16. end_if

  #17. end_for

  ###################################################
  #validation for 6 files that were manually verified
  list_of_F1_scores = []
  list_of_sizes_of_clean_data_sets = []

  for file_id, component in zip(list_of_manual_files, list_of_manual_components):

    downloaded = drive.CreateFile({'id':file_id})
    downloaded.GetContentFile('clean_data_set.csv')
    clean_data_set = pd.read_csv('clean_data_set.csv')

    #remove duplicates from clean data set
    clean_data_set = clean_data_set.drop_duplicates(subset = [TEXT_COLUMN], keep = "first")

    #remove rows that are not present in S (when fraction of S is set to be <1)
    S_subset = S[S[LABEL_COLUMN] == component]
    clean_data_set = clean_data_set[clean_data_set[TEXT_COLUMN].isin(S_subset[TEXT_COLUMN])]

    #select from C only given components
    C_component = C[C[LABEL_COLUMN] == component].drop_duplicates(subset = [TEXT_COLUMN], keep = "first")
    list_of_sizes_of_clean_data_sets.append(len(clean_data_set))

    print("     ", component)

    #calculate confusion matrix
    true_positive, false_positive, true_negative, false_negative = _confusion_counts(
        clean_data_set, C_component[TEXT_COLUMN]
    )

    print("       True positive: ", true_positive)
    print("       False positive: ", false_positive)
    print("       True negative: ", true_negative)
    print("       False negative: ", false_negative)

    list_of_F1_scores.append(_f1_from_counts(true_positive, false_positive, false_negative))
    print("       F1 score: ",round(list_of_F1_scores[-1],2))

  #weighted F1 score for 6 components
  # np.average cannot normalize weights that sum to zero (no verified row found in S)
  if sum(list_of_sizes_of_clean_data_sets) > 0:
    F1_score_weighted = np.average(list_of_F1_scores, weights=list_of_sizes_of_clean_data_sets)
  else:
    F1_score_weighted = float("nan")
  print("weighted F1 score: ",round(F1_score_weighted,2))
      