In [7]:
# 1. Data Preparation
import pandas as pd
import json

# Function to load JSON file into a dictionary
def load_json_as_dict(json_file_path):
    with open(json_file_path, mode='r', encoding='utf-8') as json_file:
        data = json.load(json_file)
    return data

# Path to the JSON file
json_file_path = "nmap_commands_1.json"

# Load the JSON file as a dictionary
nmap_dataset = load_json_as_dict(json_file_path)

# Convert the dictionary to a pandas DataFrame
df = pd.DataFrame(list(nmap_dataset.items()), columns=['Command', 'Description'])

# Check data types
print(df.dtypes)

# Convert 'Description' to string if it's not already
df['Description'] = df['Description'].apply(lambda x: json.dumps(x) if isinstance(x, dict) else str(x))

# Check data after conversion
print(df.head())

Command        object
Description    object
dtype: object
                                 Command  \
0                    -iL <inputfilename>   
1                        -iR <num hosts>   
2  --exclude <host1[,host2][,host3],...>   
3           --excludefile <exclude_file>   
4                                    -sL   

                                         Description  
0  {"Description": "Input from list of hosts/netw...  
1  {"Description": "Choose random targets", "Deta...  
2  {"Description": "Exclude hosts/networks", "Det...  
3  {"Description": "Exclude list from file", "Det...  
4  {"Description": "List Scan - simply list targe...  


In [9]:
# 2. Text Preprocessing

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords and non-alphabetic tokens, and lemmatize the tokens
    cleaned_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.isalpha() and token.lower() not in stop_words]
    return ' '.join(cleaned_tokens)

# Apply preprocessing to the 'Description' column
df['Processed_Description'] = df['Description'].apply(preprocess_text)

# Check the resulting DataFrame
print(df.head())

                                 Command  \
0                    -iL <inputfilename>   
1                        -iR <num hosts>   
2  --exclude <host1[,host2][,host3],...>   
3           --excludefile <exclude_file>   
4                                    -sL   

                                         Description  \
0  {"Description": "Input from list of hosts/netw...   
1  {"Description": "Choose random targets", "Deta...   
2  {"Description": "Exclude hosts/networks", "Det...   
3  {"Description": "Exclude list from file", "Det...   
4  {"Description": "List Scan - simply list targe...   

                               Processed_Description  
0  description input list detail specify file con...  
1  description choose random target detail select...  
2  description exclude detail exclude specific ho...  
3  description exclude list file detail exclude h...  
4  description list scan simply list target scan ...  


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OMEN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\OMEN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\OMEN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
# 3. Vectorization

from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
# Fit and transform the processed descriptions
X = vectorizer.fit_transform(df['Processed_Description'])

print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1149 stored elements and shape (116, 347)>
  Coords	Values
  (0, 63)	0.08122997999862391
  (0, 140)	0.37882101923746075
  (0, 161)	0.598296557266506
  (0, 64)	0.08122997999862391
  (0, 294)	0.17047023423337693
  (0, 98)	0.32251668762447383
  (0, 47)	0.37882101923746075
  (0, 125)	0.26621235601148696
  (0, 178)	0.32251668762447383
  (0, 262)	0.1840400341892543
  (1, 63)	0.06716434804092647
  (1, 64)	0.06716434804092647
  (1, 33)	0.31322507751096756
  (1, 230)	0.6264501550219351
  (1, 312)	0.5578316150562505
  (1, 269)	0.29390309867168113
  (1, 292)	0.17356074207482236
  (1, 186)	0.2147395042772139
  (1, 264)	0.16320731216415893
  (2, 63)	0.08856967879868427
  (2, 64)	0.08856967879868427
  (2, 125)	0.29026651077075016
  (2, 178)	0.3516583339131581
  (2, 262)	0.20066934295093097
  (2, 87)	0.775140496604744
  :	:
  (112, 46)	0.19511677353623427
  (112, 139)	0.6087393647505877
  (113, 63)	0.061188526342934736
  (113, 64)	0.061188

In [36]:
# 4. Model Training

from sklearn.linear_model import LogisticRegression

try:
    # Initialize the model
    model = LogisticRegression()

    # Train the model
    model.fit(X, df['Command'])

    # If the model trains successfully, print a success message
    print("Model training successful.")

except Exception as e:
    # If an error occurs, print the error message
    print(f"An error occurred: {e}")

Model training successful.


In [38]:
# 5. Prediction

def predict_nmap_command(query):
    # Preprocess the query
    processed_query = preprocess_text(query)
    # Transform the query using the same vectorizer
    query_vector = vectorizer.transform([processed_query])
    # Predict the command
    predicted_command = model.predict(query_vector)
    return predicted_command[0]

# Example usage

# Define a function to log user interactions (placeholder implementation)
def log_data(user_query, predicted_output, feedback):
    print(f"Logged: Query - {user_query}, Predicted Output - {predicted_output}, Feedback - {feedback}")

# Example pseudocode for the feedback loop

while True:
    user_query = input("Please enter your query (type 'exit' to stop): ")
    
    if user_query.lower() == 'exit':
        print("Exiting...")
        break
    
    # Use your NLP model to predict an Nmap command based on user_query
    predicted_output = predict_nmap_command(user_query)
    
    print(f"Predicted Nmap command: {predicted_output}")
    
    feedback = input("Was this output correct? Type 'yes', 'no', or 'close': ")
    
    # Process feedback
    if feedback == 'yes':
        # Reinforce correct prediction in your model
        try:
            model.fit(X, df['Command'])
        except Exception as e:
            print(f"An error occurred during reinforcement: {e}")
            
    elif feedback == 'no':
        # Update model to avoid similar incorrect predictions
        try:
            model.fit(X, df['Command'])
        except Exception as e:
            print(f"An error occurred during adjustment: {e}")
            
    elif feedback == 'close':
        # Fine-tune parameters or adjust training data
        try:
            model.fit(X, df['Command'])
        except Exception as e:
            print(f"An error occurred during fine-tuning: {e}")
    
    # Log user query, predicted output, and feedback
    log_data(user_query, predicted_output, feedback)

print("Program terminated.")

Please enter your query (type 'exit' to stop):  discover hosts


Predicted Nmap command: -Pn


Was this output correct? Type 'yes', 'no', or 'close':  yes


Logged: Query - discover hosts, Predicted Output - -Pn, Feedback - yes


Please enter your query (type 'exit' to stop):  exit


Exiting...
Program terminated.


In [34]:
# 6. Model Evaluation

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, df['Command'], test_size=0.2, random_state=42)

# Train the model on the training set
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred, zero_division=1))

                                               precision    recall  f1-score   support

                          --data <hex string>       0.00      1.00      0.00       0.0
                          --disable-keepalive       0.00      1.00      0.00       0.0
            --dns-servers <serv1[,serv2],...>       1.00      0.00      0.00       1.0
                 --excludefile <exclude_file>       0.00      1.00      0.00       0.0
                           --follow-redirects       1.00      0.00      0.00       1.0
                        --host-timeout <time>       0.00      1.00      0.00       0.0
         --http-auth-cred <username:password>       1.00      0.00      0.00       1.0
         --http2-max-concurrent-streams <num>       1.00      0.00      0.00       1.0
                --http2-max-frame-size <size>       1.00      0.00      0.00       1.0
          --http2-max-header-list-size <size>       0.00      1.00      0.00       0.0
                       --ip-options <optio