In [33]:
import logging
import pandas as pd
import numpy as np
from numpy import random
import nltk
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords
import re
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arseniskobelev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
# Load the semicolon-separated dataset, keep only rows that have an InputData
# value, and discard the columns this notebook does not use.
# 'RawInputData' is not dropped here (its drop is disabled).
df = (
    pd.read_csv('./data.csv', sep=';')
    .dropna(subset=['InputData'])
    .drop(columns=[
        'DockerImage',
        'DockerImageVersion',
        'KubernetesGracePeriodInSeconds',
        'KubernetesNamespace',
        'KubernetesObjectName',
        'Service',
        'Method',
        'KubernetesObjectType',
    ])
)

print(df.tail(10))

FileNotFoundError: [Errno 2] No such file or directory: './data.csv'

In [None]:
# Characters that are replaced by a single space: / ( ) { } [ ] | @ , ;
# Raw string so that \[ \] \| reach the regex engine unchanged instead of being
# treated as (invalid) Python string escapes, which emit a warning on import.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character other than digits, lowercase a-z, space, '#', '+' and '_' is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# NLTK's English stopword list, stored as a set; clean_text tests words for membership in it.
STOPWORDS = set(stopwords.words('english'))


def clean_text(text):
    """
        Normalize a string before it is vectorized.

        Steps, in order: lowercase the text, turn every REPLACE_BY_SPACE_RE
        match into a space, delete every BAD_SYMBOLS_RE match, then drop the
        words found in STOPWORDS.

        text: a string

        return: the remaining words joined by single spaces
    """
    lowered = text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    # split() with no argument also collapses runs of whitespace
    kept_words = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept_words)


# Replace the InputData column with its clean_text-normalized version,
# then preview the first rows.
df = df.assign(InputData=df['InputData'].apply(clean_text))
print(df.head(10))

In [None]:
# Features: the InputData text column; targets: the Function label column.
X = df['InputData']
y = df['Function']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

# Text classification pipeline: token counts -> TF-IDF weighting -> multinomial Naive Bayes.
model = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)
model.fit(X_train, y_train)

# functions = df['Function'].unique()

# %%time
from sklearn.metrics import classification_report

# Score the fitted pipeline on the held-out split.
y_pred = model.predict(X_test)

# accuracy_score is declared as (y_true, y_pred): ground truth goes first.
# Accuracy happens to be symmetric, but keeping the documented order avoids
# silently wrong results if this call is ever swapped for an asymmetric metric.
print('accuracy %s' % accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred, target_names=functions))

In [None]:
def predict_with_fallback(model_repr, input_data_param, threshold_param, fallback_value):
    """
        Predict the label of the first input sample, or return a fallback when
        the model is not confident enough about that sample.

        model_repr: object exposing predict_proba(X) and predict(X)
        input_data_param: the samples passed unchanged to both model methods
        threshold_param: the first sample's highest class probability must be
            strictly greater than this value for its prediction to be returned
        fallback_value: value returned when the threshold is not exceeded or
            when the model returns no probabilities at all

        return: model_repr.predict(input_data_param)[0] or fallback_value
    """
    # Predict probabilities for the input samples
    proba = model_repr.predict_proba(input_data_param)

    # Nothing was scored, so there is no first prediction to return
    if len(proba) == 0:
        return fallback_value

    # Get the maximum predicted probability for each sample
    max_proba = np.max(proba, axis=1)
    print(max_proba)

    # Only the first sample's prediction is ever returned, so only the first
    # sample's confidence may decide. Checking "any sample" would let a
    # confident later sample push an unconfident first prediction through.
    if max_proba[0] > threshold_param:
        predictions = model_repr.predict(input_data_param)
        return predictions[0]

    return fallback_value


In [35]:
from datetime import datetime
from pathlib import Path

import joblib

# The model was fitted on text that had been passed through clean_text, so the
# sample is normalized the same way before it is handed to the model.
input_data = [clean_text(text) for text in ["you delete a pod"]]

# Minimum top-class probability required to trust the prediction
threshold = .4

normal_prediction = model.predict(input_data)[0]
prediction_with_threshold = predict_with_fallback(
    model_repr=model,
    input_data_param=input_data,
    fallback_value="General.Output.UnknownCommand",
    threshold_param=threshold
)

print(f"Prediction with threshold: {prediction_with_threshold}")
print(f"Normal prediction: {normal_prediction}")

print("saving model...")
now = datetime.now()
formatted_date = now.strftime("%d-%m-%y")

# joblib.dump does not create missing directories; make sure the target exists
# so that the save does not fail on a fresh checkout.
Path('./models').mkdir(parents=True, exist_ok=True)

joblib.dump(model, filename=f'./models/model-{formatted_date}.joblib')


[0.60602217]
Prediction with threshold: Kubernetes.Delete.Pod
Normal prediction: Kubernetes.Delete.Pod
saving model...


['./models/model-27-05-23.joblib']