In [None]:

# importing required libraries
import numpy as np
import pandas as pd

In [None]:

# The dataset file carries no header row, so the column names are supplied here,
# in file order (three names per line for easier scanning).
columns = [
    "duration", "protocol_type", "service",
    "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted",
    "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds", "is_host_login",
    "is_guest_login", "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate",
    "srv_rerror_rate", "same_srv_rate", "diff_srv_rate",
    "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label",
    "difficulty_level",
]

In [None]:

# importing dataset: header=None because the file has no header row, so the
# column names are taken from the `columns` list
data = pd.read_csv('KDDTrain+.txt',header=None, names=columns)

FileNotFoundError: [Errno 2] No such file or directory: 'KDDTrain+.txt'

In [None]:
# preview the first rows of the loaded dataset
data.head()

In [None]:
# column dtypes and non-null counts
data.info()

In [None]:
# summary statistics of the numeric columns
data.describe()

In [None]:

# remove attribute 'difficulty_level'
# drop() returns a new frame and errors='ignore' tolerates the column already being
# gone, so re-running this cell no longer raises KeyError
data = data.drop(columns=['difficulty_level'], errors='ignore')
data.shape

In [None]:

# number of records per attack label (labels are still the individual attack names here)
data['label'].value_counts()

In [None]:
# changing attack labels to their respective attack class
def change_label(df):
    """Replace the individual attack names in ``df['label']`` with their attack class.

    The 'label' column of ``df`` is reassigned on the frame that was passed in,
    so the caller's DataFrame is modified and nothing is returned. Labels that
    appear in none of the lists below are left as they are.

    Args:
        df: DataFrame with a 'label' column holding attack names.
    """
    attack_classes = {
        'Dos': ['apache2', 'back', 'land', 'neptune', 'mailbomb', 'pod',
                'processtable', 'smurf', 'teardrop', 'udpstorm', 'worm'],
        'R2L': ['ftp_write', 'guess_passwd', 'httptunnel', 'imap',
                'multihop', 'named', 'phf', 'sendmail', 'snmpgetattack',
                'snmpguess', 'spy', 'warezclient', 'warezmaster',
                'xlock', 'xsnoop'],
        'Probe': ['ipsweep', 'mscan', 'nmap', 'portsweep', 'saint', 'satan'],
        'U2R': ['buffer_overflow', 'loadmodule', 'perl', 'ps', 'rootkit',
                'sqlattack', 'xterm'],
    }
    # one replace per class, applied in the order the classes are listed above
    for attack_class, attack_names in attack_classes.items():
        df['label'] = df['label'].replace(attack_names, attack_class)

# calling change_label() function; it rewrites data['label'] on `data` itself and returns nothing
change_label(data)


In [None]:
# number of records per label after the attacks were grouped into classes
data.label.value_counts()

Data Normalization


In [None]:

# importing required libraries for normalizing data
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [None]:
# selecting numeric attributes columns from data (select_dtypes keeps only numeric dtypes)
numeric_col = data.select_dtypes(include='number').columns
# using standard scaler for normalizing; this one instance is the scaler used by normalization()
std_scaler = StandardScaler()
def normalization(df,col):
  """Standardize the given columns of ``df`` with the shared ``std_scaler``.

  All columns are fitted in a single ``fit_transform`` call. StandardScaler
  standardizes each feature independently, so the scaled values are the same
  as with a column-by-column fit, but the scaler now keeps the statistics of
  every column instead of only the last one it was fitted on.

  Args:
    df: DataFrame holding the columns to scale; it is modified in place.
    col: iterable of column names to standardize.

  Returns:
    The same DataFrame object, with the listed columns replaced by their
    scaled values.
  """
  cols = list(col)
  if not cols:
    # nothing to scale; fitting a scaler on zero features would raise
    return df
  df[cols] = std_scaler.fit_transform(df[cols].to_numpy())
  return df



# calling the normalization() function on a copy, then rebinding `data` to the scaled result
data = normalization(data.copy(),numeric_col)

In [None]:
# data after normalization (first rows)
data.head()


In [None]:
# selecting categorical data attributes
cat_col = ['protocol_type','service','flag']
# creating a dataframe with only categorical attributes
categorical = data[cat_col]
# preview of the categorical attributes before encoding
categorical.head()


In [None]:

# one-hot-encoding categorical attributes using pandas.get_dummies() function;
# each category value becomes its own '<attribute>_<value>' indicator column
categorical = pd.get_dummies(categorical,columns=cat_col)
categorical.head()

Binary Classification

In [None]:
# changing attack labels into two categories 'normal' and 'abnormal'
# (every label other than 'normal' becomes 'abnormal'); kept as a one-column DataFrame
bin_label = pd.DataFrame(data.label.map(lambda x:'normal' if x=='normal' else 'abnormal'))

In [None]:
# creating a dataframe with binary labels (normal,abnormal)
bin_data = data.copy()
bin_data['label'] = bin_label
# le1 encodes the binary labels as integers, which are stored in the 'intrusion' column
le1 = preprocessing.LabelEncoder()
enc_label = bin_label.apply(le1.fit_transform)
bin_data['intrusion'] = enc_label
# saving the encoder's classes so the integer codes can be decoded later without refitting
np.save("le1_classes.npy",le1.classes_,allow_pickle=True)

In [None]:
# dataset with binary labels and label encoded column (first rows)
bin_data.head()

In [None]:
# one-hot-encoding the binary label; the empty prefix and separator make the new
# columns take the label values themselves as names
bin_data = pd.get_dummies(bin_data,columns=['label'],prefix="",prefix_sep="")
# get_dummies replaced the 'label' column, so the original label is added back
bin_data['label'] = bin_label
bin_data

In [None]:
import matplotlib.pyplot as plt

# pie chart distribution of normal and abnormal labels
# value_counts() is ordered by frequency while unique() follows first appearance,
# so the wedge labels are taken from the counts' own index to keep each label on
# the wedge it belongs to
bin_label_counts = bin_data.label.value_counts()
plt.figure(figsize=(8,8))
plt.pie(bin_label_counts,labels=bin_label_counts.index,autopct='%0.2f%%')
plt.title("Pie chart distribution of normal and abnormal labels")
plt.legend()
# plt.savefig('plots/Pie_chart_binary.png')
plt.show()

Multi-class Classification

In [None]:
# Copying data for the multi-class dataset
multi_data = data.copy()

# Creating a one-column DataFrame from 'label' and dropping rows whose label is NaN
multi_label = pd.DataFrame(multi_data['label']).dropna()

# Dropping the same NaN-label rows from multi_data so both frames cover the same rows
multi_data = multi_data[multi_data['label'].notna()]

# preview of the multi-class labels
multi_label.head()


In [None]:
# le2 encodes the multi-class labels as integers, which are stored in the 'intrusion' column
le2 = preprocessing.LabelEncoder()
enc_label = multi_label.apply(le2.fit_transform)
multi_data['intrusion'] = enc_label
# classes_ lists the labels in encoded order (position i is the label encoded as i)
print(le2.classes_)
# saving the encoder's classes so the integer codes can be decoded later without refitting
np.save("le2_classes.npy",le2.classes_,allow_pickle=True)

In [None]:
# one-hot-encoding attack label; the empty prefix and separator make the new
# columns take the class names themselves as names
multi_data = pd.get_dummies(multi_data,columns=['label'],prefix="",prefix_sep="")
# get_dummies replaced the 'label' column, so the original label is added back
multi_data['label'] = multi_label
multi_data

In [None]:

# pie chart distribution of multi-class labels
# value_counts() is ordered by frequency while unique() follows first appearance,
# so the wedge labels are taken from the counts' own index to keep each label on
# the wedge it belongs to
multi_label_counts = multi_data.label.value_counts()
plt.figure(figsize=(8,8))
plt.pie(multi_label_counts,labels=multi_label_counts.index,autopct='%0.2f%%')
plt.title('Pie chart distribution of multi-class labels')
plt.legend()
# plt.savefig('plots/Pie_chart_multi.png')
plt.show()


In [None]:

# creating a dataframe with only numeric attributes of binary class dataset and encoded label attribute
# assign() returns a new frame with 'intrusion' appended, which avoids the
# SettingWithCopyWarning raised when a column is added to a slice of bin_data
numeric_bin = bin_data[numeric_col].assign(intrusion=bin_data['intrusion'])


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Calculate the correlation matrix
corr = numeric_bin.corr()

# Filter to get only features with correlation greater than 0.5 with 'intrusion'
# (the absolute value is used, so strong negative correlations are kept too)
high_corr_features = corr.index[corr['intrusion'].abs() > 0.5]
# restrict the matrix to those features on both axes
filtered_corr = corr.loc[high_corr_features, high_corr_features]

# Plot the heatmap (colour scale fixed to the range -1..1)
plt.figure(figsize=(10, 8))
sns.heatmap(filtered_corr, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
plt.title('Heatmap of Features Correlated with Intrusion Label')
plt.show()


In [None]:
# selecting attributes found by using pearson correlation coefficient
# (rebinds numeric_bin to this hand-picked subset of bin_data's columns)
numeric_bin = bin_data[['count','srv_serror_rate','serror_rate','dst_host_serror_rate','dst_host_srv_serror_rate',
                         'logged_in','dst_host_same_srv_rate','dst_host_srv_count','same_srv_rate']]

In [None]:
# joining the selected attribute with the one-hot-encoded categorical dataframe
# (join matches rows on the index)
numeric_bin = numeric_bin.join(categorical)
# then joining encoded, one-hot-encoded, and original attack label attribute
bin_data = numeric_bin.join(bin_data[['intrusion','abnormal','normal','label']])

In [None]:
# saving final dataset to disk (to_csv writes the index as well by default)
bin_data.to_csv("bin_data.csv")
# final dataset for binary classification
bin_data

In [None]:
# creating a dataframe with only numeric attributes of multi-class dataset and encoded label attribute
# assign() returns a new frame with 'intrusion' appended, which avoids the
# SettingWithCopyWarning raised when a column is added to a slice of multi_data
numeric_multi = multi_data[numeric_col].assign(intrusion=multi_data['intrusion'])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Calculate the correlation matrix
corr = numeric_multi.corr()

# Filter features with an absolute correlation greater than 0.5 with 'intrusion'
high_corr_features = corr.index[corr['intrusion'].abs() > 0.5]
# restrict the matrix to those features on both axes
filtered_corr = corr.loc[high_corr_features, high_corr_features]

# Plot the heatmap (colour scale fixed to the range -1..1)
plt.figure(figsize=(10, 8))
sns.heatmap(filtered_corr, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
plt.title('Heatmap of Attributes with High Correlation with Intrusion')
plt.show()


In [None]:
# selecting attributes found by using pearson correlation coefficient
# (rebinds numeric_multi to this hand-picked subset of multi_data's columns)
numeric_multi = multi_data[['count','logged_in','srv_serror_rate','serror_rate','dst_host_serror_rate',
                        'dst_host_same_srv_rate','dst_host_srv_serror_rate','dst_host_srv_count','same_srv_rate']]

In [None]:
# joining the selected attribute with the one-hot-encoded categorical dataframe
# (join matches rows on the index)
numeric_multi = numeric_multi.join(categorical)
# then joining encoded, one-hot-encoded, and original attack label attribute
multi_data = numeric_multi.join(multi_data[['intrusion','Dos','Probe','R2L','U2R','normal','label']])

In [None]:
# column dtypes and non-null counts of the final multi-class dataset
multi_data.info()

In [None]:

# saving final dataset to disk (to_csv writes the index as well by default)
multi_data.to_csv('multi_data.csv')

# final dataset for multi-class classification
multi_data

 Binary Models

In [None]:
import numpy as np
import pandas as pd

import pickle # saving and loading trained model
from os import path

# importing required libraries for normalizing data
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# importing library for plotting
import matplotlib.pyplot as plt

# importing library for support vector machine classifier
from sklearn.svm import SVC
# importing library for K-neares-neighbor classifier
from sklearn.neighbors import KNeighborsClassifier
# importing library for Linear Discriminant Analysis Model
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# importing library for Quadratic Discriminant Analysis Model
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn import metrics
from sklearn.metrics import accuracy_score # for calculating accuracy of model
from sklearn.model_selection import train_test_split # for splitting the dataset for training and testing
from sklearn.metrics import classification_report # for generating a classification report of model

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc

from keras.layers import Dense # importing dense layer
from keras.models import Sequential #importing Sequential layer
from keras.models import model_from_json # saving and loading trained model

from keras.layers import LSTM
from keras.layers import Input
from keras.models import Model

# representation of model layers
# from keras.utils.vis_utils import plot_model


SVM

In [None]:
# Show the rows of bin_data that contain NaN values, then drop those rows

# Find rows with NaN values in 'bin_data'
rows_with_nan = bin_data[bin_data.isnull().any(axis=1)]

# Print the rows with NaN values
print("Rows with NaN values:")
print(rows_with_nan)

# Drop rows with NaN values in 'bin_data' (in place, so bin_data itself is changed)
bin_data.dropna(inplace=True)

# Verify that NaN values are removed (an empty frame is expected here)
print("\nRows with NaN values after dropping:")
print(bin_data[bin_data.isnull().any(axis=1)])

In [None]:


# the feature matrix is every column except the target attributes (encoded,
# one-hot-encoded, original); selecting them by name keeps this correct even when
# the number of one-hot columns differs from the 93 features that were hard-coded
bin_target_cols = ['intrusion','abnormal','normal','label']
X = bin_data.drop(columns=bin_target_cols).to_numpy() # dataset excluding target attributes
Y = bin_data['intrusion'] # target attribute


# splitting the dataset 75% for training and 25% testing
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.25, random_state=42)

# using kernel as linear
lsvm = SVC(kernel='linear',gamma='auto')
lsvm.fit(X_train,y_train) # training model on training dataset

In [None]:
pkl_filename = "lsvm_binary.pkl"
# always persist the model that was just trained: skipping the save whenever the
# file already existed left an older model on disk for anything that loads it later
with open(pkl_filename, 'wb') as file:
  pickle.dump(lsvm, file)
print("Saved model to disk")
# loading the trained model back, so the following cells use exactly what was written
with open(pkl_filename, 'rb') as file:
  lsvm = pickle.load(file)
print("Loaded model from disk")

In [None]:
y_pred = lsvm.predict(X_test) # predicting target attribute on testing dataset
ac = accuracy_score(y_test, y_pred)*100 # calculating accuracy of predicted data
print("LSVM-Classifier Binary Set-Accuracy is ", ac)


# classification report
print(classification_report(y_test, y_pred,target_names=le1.classes_))
print("Mean Absolute Error - " , metrics.mean_absolute_error(y_test, y_pred))
print("Mean Squared Error - " , metrics.mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error - " , np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# r2_score is used so the value matches its "R2 Score" label; the previously used
# explained_variance_score is a different metric
print("R2 Score - " , metrics.r2_score(y_test, y_pred)*100)
print("Accuracy - ",accuracy_score(y_test,y_pred)*100)


In [None]:
# comparing predictions with the true values for positions 300 to 499 of the test set
plt.figure(figsize=(20,8))
plt.plot(y_pred[300:500], label="prediction", linewidth=2.0,color='blue')
plt.plot(y_test[300:500].values, label="real_values", linewidth=2.0,color='lightcoral')
plt.legend(loc="best")
plt.ylim((-1,2))
plt.title("Linear SVM Binary Classification")
# plt.savefig('plots/lsvm_real_pred_bin.png')
plt.show()

KNN

In [None]:
knn=KNeighborsClassifier(n_neighbors=5) # creating model for 5 neighbors
knn.fit(X_train,y_train) # training model on training dataset

pkl_filename = "knn_binary.pkl"
# always persist the model that was just trained: skipping the save whenever the
# file already existed left an older model on disk for anything that loads it later
with open(pkl_filename, 'wb') as file:
  pickle.dump(knn, file)
print("Saved model to disk")
# loading the trained model back, so the evaluation below uses exactly what was written
with open(pkl_filename, 'rb') as file:
  knn = pickle.load(file)
print("Loaded model from disk")
y_pred=knn.predict(X_test) # predicting target attribute on testing dataset
ac=accuracy_score(y_test, y_pred)*100 # calculating accuracy of predicted data
print("KNN-Classifier Binary Set-Accuracy is ", ac)
# classification report
print(classification_report(y_test, y_pred,target_names=le1.classes_))

print("Mean Absolute Error - " , metrics.mean_absolute_error(y_test, y_pred))
print("Mean Squared Error - " , metrics.mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error - " , np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# r2_score is used so the value matches its "R2 Score" label; the previously used
# explained_variance_score is a different metric
print("R2 Score - " , metrics.r2_score(y_test, y_pred)*100)
print("Accuracy - ",accuracy_score(y_test,y_pred)*100)

In [None]:


# rows containing NaN are captured before they are dropped from multi_data
rows_with_nan = multi_data[multi_data.isnull().any(axis=1)]
multi_data.dropna(inplace=True)
# the feature matrix is every column except the target attributes (encoded,
# one-hot-encoded, original); selecting them by name keeps this correct even when
# the number of one-hot columns differs from the 93 features that were hard-coded
multi_target_cols = ['intrusion','Dos','Probe','R2L','U2R','normal','label']
X = multi_data.drop(columns=multi_target_cols).to_numpy() # dataset excluding target attributes
Y = multi_data['intrusion'] # target attribute

# splitting the dataset 75% for training and 25% testing
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.25, random_state=42)

In [None]:
lsvm=SVC(kernel='linear',gamma='auto')
lsvm.fit(X_train,y_train) # training model on training dataset
# saving trained model to disk
pkl_filename = "./lsvm_multi.pkl"
# always persist the model that was just trained: skipping the save whenever the
# file already existed left an older model on disk for anything that loads it later
with open(pkl_filename, 'wb') as file:
  pickle.dump(lsvm, file)
print("Saved model to disk")
# loading trained model back, so the evaluation below uses exactly what was written
with open(pkl_filename, 'rb') as file:
  lsvm = pickle.load(file)
print("Loaded model from disk")

y_pred=lsvm.predict(X_test) # predicting target attribute on testing dataset
ac=accuracy_score(y_test, y_pred)*100  # calculating accuracy of predicted data
print("LSVM-Classifier Multi-class Set-Accuracy is ", ac)

# classification report
print(classification_report(y_test, y_pred,target_names=le2.classes_))

print("Mean Absolute Error - " , metrics.mean_absolute_error(y_test, y_pred))
print("Mean Squared Error - " , metrics.mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error - " , np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# r2_score is used so the value matches its "R2 Score" label; the previously used
# explained_variance_score is a different metric
print("R2 Score - " , metrics.r2_score(y_test, y_pred)*100)
print("Accuracy - ",accuracy_score(y_test,y_pred)*100)

In [None]:

knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,y_train) # training model on training dataset

pkl_filename = "./knn_multi.pkl"
# always persist the model that was just trained: skipping the save whenever the
# file already existed left an older model on disk for anything that loads it later
with open(pkl_filename, 'wb') as file:
  pickle.dump(knn, file)
print("Saved model to disk")
# loading trained model back, so the evaluation below uses exactly what was written
with open(pkl_filename, 'rb') as file:
  knn = pickle.load(file)
print("Loaded model from disk")
y_pred=knn.predict(X_test)  # predicting target attribute on testing dataset
ac=accuracy_score(y_test, y_pred)*100  # calculating accuracy of predicted data
print("KNN-Classifier Multi-class Set-Accuracy is ", ac)

# classification report
print(classification_report(y_test, y_pred,target_names=le2.classes_))

print("Mean Absolute Error - " , metrics.mean_absolute_error(y_test, y_pred))
print("Mean Squared Error - " , metrics.mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error - " , np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# r2_score is used so the value matches its "R2 Score" label; the previously used
# explained_variance_score is a different metric
print("R2 Score - " , metrics.r2_score(y_test, y_pred)*100)
print("Accuracy - ",accuracy_score(y_test,y_pred)*100)

Testing starts: loading the saved models

In [None]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler

# Load the models
# NOTE: pickle.load can execute code stored in the file, so only load pickle
# files that come from a trusted source
with open("lsvm_binary.pkl", "rb") as file:
    binary_svm_model = pickle.load(file)

with open("knn_binary.pkl", "rb") as file:
    binary_knn_model = pickle.load(file)

with open("lsvm_multi.pkl", "rb") as file:
    multi_svm_model = pickle.load(file)

with open("knn_multi.pkl", "rb") as file:
    multi_knn_model = pickle.load(file)

# Load the LabelEncoders for decoding output
# (these are the saved classes_ arrays; position i is the label encoded as i)
le1_classes = np.load("le1_classes.npy", allow_pickle=True)
le2_classes = np.load("le2_classes.npy", allow_pickle=True)



In [None]:
# prompt: can you help me build a ml pipeline that takes raw data like from KDDTrain+.txt and then processes it like we did above and then gives it to modle which we loaded earlier and then show the output , do this and only for binary classification

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import pickle

# Assuming you have your trained models (binary_svm_model, binary_knn_model, etc.) loaded as in your previous code.
# Also assuming you have le1_classes and le2_classes loaded.

# Reference data derived from the raw training file, cached per (path, features)
# so the file is parsed once instead of on every prediction.
_TRAINING_REFERENCE_CACHE = {}


def _training_reference(train_path, raw_columns, numeric_cols, cat_cols):
  """Return (mean, std, dummy_columns) computed from the raw training file."""
  cache_key = (train_path, tuple(sorted(numeric_cols)))
  if cache_key not in _TRAINING_REFERENCE_CACHE:
    train = pd.read_csv(train_path, header=None, names=raw_columns,
                        usecols=list(numeric_cols) + list(cat_cols))
    mean = train[numeric_cols].mean()
    # population standard deviation, with 0 replaced by 1 for constant columns,
    # which is how StandardScaler scales
    std = train[numeric_cols].std(ddof=0).replace(0, 1.0)
    # same call the training cells used, so names and order of the one-hot columns match
    dummy_columns = pd.get_dummies(train[cat_cols], columns=cat_cols).columns
    _TRAINING_REFERENCE_CACHE[cache_key] = (mean, std, dummy_columns)
  return _TRAINING_REFERENCE_CACHE[cache_key]


def process_raw_data(raw_data_string, model_type, train_path='KDDTrain+.txt'):
  """
  Processes raw data string, prepares it for prediction, and returns the prediction.

  The record is scaled with the mean and standard deviation of the training
  file and one-hot encoded against the training file's category columns, so
  the feature vector has the layout the models were trained on. Fitting a new
  scaler or building dummies from the single input row cannot give that layout.

  The prediction is made with the module-level ``binary_svm_model`` /
  ``multi_svm_model`` and decoded with ``le1_classes`` / ``le2_classes``.

  Args:
    raw_data_string: A string representing the raw data, like a line from
      KDDTrain+.txt: the 41 features, optionally followed by the label and the
      difficulty level (41, 42 or 43 comma-separated fields). Only the
      features are used.
    model_type: "binary" or "multi" depending on the model to be used.
    train_path: path of the raw training file the reference statistics and
      one-hot columns are derived from.

  Returns:
    A prediction (class label) based on the input raw data, or the string
    "Invalid model type." when model_type is neither "binary" nor "multi".

  Raises:
    ValueError: if the record does not have 41 to 43 fields, or a selected
      numeric field cannot be parsed as a number.
  """
  columns = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label","difficulty_level"]
  n_features = 41
  cat_col = ['protocol_type','service','flag']

  # numeric attributes in the column order each model was trained with
  if model_type == "binary":
    selected = ['count','srv_serror_rate','serror_rate','dst_host_serror_rate','dst_host_srv_serror_rate',
                'logged_in','dst_host_same_srv_rate','dst_host_srv_count','same_srv_rate']
  elif model_type == "multi":
    selected = ['count','logged_in','srv_serror_rate','serror_rate','dst_host_serror_rate',
                'dst_host_same_srv_rate','dst_host_srv_serror_rate','dst_host_srv_count','same_srv_rate']
  else:
    return "Invalid model type."

  # Convert raw data string to a list (split by commas)
  raw_data_list = [field.strip() for field in raw_data_string.strip().split(',')]
  if not n_features <= len(raw_data_list) <= len(columns):
    raise ValueError("expected %d to %d comma-separated fields, got %d"
                     % (n_features, len(columns), len(raw_data_list)))
  df_input = pd.DataFrame([raw_data_list[:n_features]], columns=columns[:n_features])

  mean, std, dummy_columns = _training_reference(train_path, columns, selected, cat_col)

  # Normalize numeric features with the training statistics
  numeric = df_input[selected].apply(pd.to_numeric)
  scaled = (numeric - mean[selected]) / std[selected]

  # One-hot encode categorical features, aligned to the training columns;
  # a category that never occurs in the training file yields all zeros in its group
  df_categorical = pd.get_dummies(df_input[cat_col], columns=cat_col)
  df_categorical = df_categorical.reindex(columns=dummy_columns, fill_value=0).astype(float)

  input_for_model = scaled.join(df_categorical).to_numpy(dtype=float)

  if model_type == "binary":
    model, classes = binary_svm_model, le1_classes
  else:
    model, classes = multi_svm_model, le2_classes
  prediction = model.predict(input_for_model)
  predicted_class = classes[prediction[0]]
  return predicted_class

# Example usage: one comma-separated record whose last field is the label
raw_data = "0,tcp,systat,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,239,20,1.00,1.00,0.00,0.00,0.08,0.07,0.00,255,20,0.08,0.08,0.00,0.00,1.00,1.00,0.00,0.00,anomaly"
prediction = process_raw_data(raw_data, "binary")
print("Prediction:", prediction)

In [None]:
def preprocess_input(raw_data, std_scaler, encoder):
    """Turn one raw comma-separated record into a single-row feature DataFrame.

    Args:
        raw_data: one record as a comma-separated string holding the 41 features
            in raw-file order, optionally followed by the label and the
            difficulty level (41, 42 or 43 fields).
        std_scaler: fitted scaler exposing ``transform``. It must have been
            fitted on the columns listed in ``numeric_columns`` below, in that
            order.
        encoder: fitted one-hot style encoder exposing ``transform`` and
            ``get_feature_names_out`` for 'protocol_type', 'service' and 'flag'.
            Its output may be dense or sparse.

    Returns:
        A one-row DataFrame: the scaled numeric columns, then the encoded
        categorical columns, then the placeholder label columns
        'intrusion', 'abnormal', 'normal' and 'label'.

    Raises:
        ValueError: if the record does not have 41 to 43 fields.
        TypeError: if ``encoder`` lacks ``transform`` or ``get_feature_names_out``
            (for example a LabelEncoder).
    """
    # Raw record layout; the names must follow the field order of the raw file,
    # otherwise values end up under the wrong column
    raw_columns = [
        'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
        'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
        'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
        'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
        'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
        'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
        'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
        'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
        'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label', 'difficulty_level'
    ]
    n_features = 41

    if not (hasattr(encoder, 'transform') and hasattr(encoder, 'get_feature_names_out')):
        raise TypeError(
            "encoder must be a fitted one-hot style encoder providing transform() "
            "and get_feature_names_out(); got %s" % type(encoder).__name__)

    # Split the input string and create a DataFrame
    features = [field.strip() for field in raw_data.strip().split(',')]
    if not n_features <= len(features) <= len(raw_columns):
        raise ValueError("expected %d to %d comma-separated fields, got %d"
                         % (n_features, len(raw_columns), len(features)))
    input_df = pd.DataFrame([features], columns=raw_columns[:len(features)])

    # Numeric columns to scale; each name appears once, since a repeated name
    # would select the same column twice
    numeric_columns = [
        'count', 'srv_serror_rate', 'serror_rate', 'dst_host_serror_rate',
        'dst_host_srv_serror_rate', 'logged_in', 'dst_host_same_srv_rate',
        'dst_host_srv_count', 'same_srv_rate', 'dst_host_count',
        'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'diff_srv_rate'
    ]

    # Convert numeric columns to numeric type (unparseable values become NaN)
    numeric_values = input_df[numeric_columns].apply(pd.to_numeric, errors='coerce')

    # Normalize numeric data
    scaled = std_scaler.transform(numeric_values)
    numeric_df = pd.DataFrame(np.asarray(scaled), columns=numeric_columns)

    # One-hot encode categorical features
    categorical_features = ['protocol_type', 'service', 'flag']
    encoded_categorical = encoder.transform(input_df[categorical_features])
    if hasattr(encoded_categorical, 'toarray'):
        # sparse output: densify before building the DataFrame
        encoded_categorical = encoded_categorical.toarray()
    encoded_columns = encoder.get_feature_names_out(categorical_features)

    # Create a DataFrame for the encoded categorical features
    encoded_df = pd.DataFrame(encoded_categorical, columns=encoded_columns)

    # Combine the normalized numeric data with encoded categorical data
    processed_data = pd.concat([numeric_df, encoded_df], axis=1)

    # Set placeholders for the label attributes; modify as needed
    processed_data['intrusion'] = 'normal'  # Placeholder; modify as needed
    processed_data['abnormal'] = 0  # Placeholder
    processed_data['normal'] = 1  # Placeholder
    processed_data['label'] = 'normal'  # Placeholder; modify as needed

    return processed_data

# Example input
raw_input = "0,tcp,private,REJ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,229,10,0.00,0.00,1.00,1.00,0.04,0.06,0.00,255,10,0.04,0.06,0.00,0.00,0.00,0.00,1.00,1.00,neptune,21"
# Use your pre-fitted std_scaler and encoder here
# NOTE(review): le1 is the LabelEncoder fitted on the binary labels, while
# preprocess_input calls encoder.get_feature_names_out() for the categorical
# features - confirm which fitted one-hot encoder should be passed instead
processed_output = preprocess_input(raw_input, std_scaler, le1)
print(processed_output)