In [298]:
import logging
import pandas as pd
import numpy as np
from numpy import random
import nltk
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arseniskobelev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [299]:
# Read the semicolon-delimited samples, discard rows that have no utterance,
# and prune every column that is not needed further down in one pass.
df = (
    pd.read_csv('./data.csv', sep=';')
    .dropna(subset=['InputData'])
    .drop(columns=[
        # 'RawInputData',  # dropping this column is intentionally left disabled
        'DockerImage',
        'DockerImageVersion',
        'KubernetesGracePeriodInSeconds',
        'KubernetesNamespace',
        'KubernetesObjectName',
        'Service',
        'Method',
        'KubernetesObjectType',
    ])
)

# Quick look at the last rows to confirm what is left.
print(df.tail(10))

                                InputData                            Function
320                Please create a volume  Kubernetes.Create.PersistentVolume
321                       Create a volume  Kubernetes.Create.PersistentVolume
322              Please delete deployment        Kubernetes.Delete.Deployment
323              Delete deployment please        Kubernetes.Delete.Deployment
324  Could you please delete a deployment        Kubernetes.Delete.Deployment
325     I need you to delete a deployment        Kubernetes.Delete.Deployment
326        Could you create a pod for me?               Kubernetes.Create.Pod
327               Could you create a pod?               Kubernetes.Create.Pod
328                      I need a new pod               Kubernetes.Create.Pod
329               I need a new deployment        Kubernetes.Create.Deployment


In [300]:
# Punctuation that separates words: each occurrence is replaced by a space.
# Raw string literals keep the regex escapes (\[ and \]) away from Python's
# own string-escape processing, which warns about unknown escapes like "\[".
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]|@,;]')
# Anything that is not a digit, lowercase ASCII letter, space, '#', '+' or '_'
# is deleted outright (no space is inserted in its place).
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop words from the NLTK 'stopwords' corpus, held in a set;
# clean_text drops every whitespace-separated token found in it.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        Normalise one utterance before it is vectorised.

        text: a string

        return: the lowercased string with separator punctuation turned into
                spaces, all other disallowed symbols removed, stop words
                dropped, and the remaining words joined by single spaces
    """
    lowered = text.lower()
    # Separator punctuation becomes a space so neighbouring words stay apart.
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    # Every other disallowed character is removed without a replacement.
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    # split() with no argument also collapses runs of whitespace.
    kept_words = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept_words)

# Normalise every utterance element-wise, overwriting the column in place
# (re-running this cell cleans already-cleaned text again).
df['InputData'] = df['InputData'].map(clean_text)
# Preview the first rows after cleaning.
print(df.head(10))

                          InputData               Function
0             create kubernetes pod  Kubernetes.Create.Pod
1       could create kubernetes pod  Kubernetes.Create.Pod
2      want create pod inkubernetes  Kubernetes.Create.Pod
3                   want create pod  Kubernetes.Create.Pod
4                        create pod  Kubernetes.Create.Pod
5             pod kubernetes create  Kubernetes.Create.Pod
6             pod create kubernetes  Kubernetes.Create.Pod
7                new kubernetes pod  Kubernetes.Create.Pod
8                new pod kubernetes  Kubernetes.Create.Pod
9  please create new pod kubernetes  Kubernetes.Create.Pod


In [301]:
# Features are the cleaned utterances; targets are the function labels.
X = df['InputData']
y = df['Function']

# Hold out 40% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [302]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

# Text classification pipeline: token counts -> TF-IDF weighting ->
# multinomial Naive Bayes, fitted on the training split only.
model = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', MultinomialNB()),
                  ])
model.fit(X_train, y_train)

# Distinct labels in order of first appearance in df. This is NOT the sorted
# label order scikit-learn uses in its reports, so do not pass it as
# `target_names` to classification_report.
functions = df['Function'].unique()

# %%time
from sklearn.metrics import classification_report
y_pred = model.predict(X_test)

# accuracy_score's documented signature is (y_true, y_pred). The value is the
# same either way for plain accuracy, but the documented order is used so the
# call stays correct if it is swapped for an asymmetric metric.
print('accuracy %s' % accuracy_score(y_test, y_pred))
# The labels are already readable strings, so no target_names are needed:
# print(classification_report(y_test, y_pred))

accuracy 0.8484848484848485


In [303]:
def predict_with_fallback(model_repr, input_data_param, threshold_param, fallback_value, verbose=True):
    """Return the model's label for the first sample, or a fallback if it is not confident.

    Only the first sample of ``input_data_param`` is ever returned, so only
    that sample's top class probability is compared against the threshold.
    (Previously the check was ``np.any`` over all samples, so a confident
    later sample could let a low-confidence first prediction through.)

    Parameters:
        model_repr: a fitted classifier exposing ``predict_proba`` and ``predict``.
        input_data_param: the samples to classify, in whatever form the model accepts.
        threshold_param: the top class probability must be strictly greater
            than this value for the model's prediction to be returned.
        fallback_value: returned when the first sample is not confident
            enough, or when there are no samples at all.
        verbose: when true (the default, matching the previous behaviour),
            print the per-sample maximum probabilities.

    Returns:
        The first element of ``model_repr.predict(input_data_param)``, or
        ``fallback_value``.
    """
    # Class probabilities, one row per input sample.
    proba = np.asarray(model_repr.predict_proba(input_data_param))

    # Nothing to classify: there is no first sample to inspect.
    if len(proba) == 0:
        return fallback_value

    # Highest class probability for each sample.
    max_proba = np.max(proba, axis=1)
    if verbose:
        print(max_proba)

    # Gate on the sample whose prediction is actually returned.
    if max_proba[0] > threshold_param:
        # predict() is called rather than taking argmax of proba, so the result
        # matches the model's own decision rule.
        return model_repr.predict(input_data_param)[0]

    return fallback_value


In [306]:
# A single free-text request to classify.
input_data = ["I need a new persistent volume"]

# Minimum top-class probability required to trust the classifier's answer.
threshold = 0.4

# Unconditional prediction, for comparison with the thresholded one.
normal_prediction = model.predict(input_data)[0]

# Same request, but falling back to a sentinel label when confidence is low.
prediction_with_threshold = predict_with_fallback(
    model_repr=model,
    input_data_param=input_data,
    threshold_param=threshold,
    fallback_value="General.Output.UnknownCommand",
)

print(f"Prediction with threshold: {prediction_with_threshold}")
print(f"Normal prediction: {normal_prediction}")



[0.4968405]
Prediction with threshold: Kubernetes.Create.PersistentVolume
Normal prediction: Kubernetes.Create.PersistentVolume
