In [None]:
%pip install numpy pandas seaborn matplotlib scikit-learn lightgbm xgboost tabulate

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the training set from a CSV in the working directory. The bare `df`
# on the last line renders the frame as this cell's output.
df = pd.read_csv('Train_data.csv')
df 

In [None]:
# Preview the first five rows (5 is the default, written out here).
df.head(n=5)

In [None]:
# Preview the last five rows (5 is the default, written out here).
df.tail(n=5)


In [None]:
# df.shape is a (rows, columns) tuple giving the dimensions of the DataFrame.
df.shape

In [None]:
# Print each column's name, non-null count and dtype, plus memory usage.
df.info()

In [None]:
# Number of missing values in each column (isna is the same method as isnull).
df.isna().sum()

In [None]:
# Smallest value in each column (axis=0 is the default, written out here).
df.min(axis=0)

In [None]:
# Largest value in each column (axis=0 is the default, written out here).
df.max(axis=0)

In [None]:
# Count rows that repeat an earlier row exactly (the first occurrence is not counted).
df.duplicated(keep='first').sum()

In [None]:
# Non-null entries per column (axis=0 is the default, written out here).
df.count(axis=0)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Bar chart of how many rows fall into each value of the target column.
plt.figure(figsize=(5, 3))
sns.countplot(data=df, x='class')

In [None]:
# Separate the predictors from the label column (no encoding happens here yet).
X = df.drop(columns='class')  # features: every column except the target
Y = df['class']               # target: the 'class' column

In [None]:

# Expand the three categorical columns into indicator columns. drop_first=True
# omits the first level of each column, so that level is represented by all
# of its indicators being 0.
X_encoded = pd.get_dummies(
    data=X,
    columns=['protocol_type', 'service', 'flag'],
    drop_first=True,
)

In [None]:
# Map the class labels to integers. The fitted encoder is kept so that
# predictions can be turned back into label names with inverse_transform.
label_encoder = LabelEncoder().fit(Y)
y_encoded = label_encoder.transform(Y)

In [None]:

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    random_state=42,
    test_size=0.3,
)


In [None]:
# Build and train the three candidate classifiers on the training split
# (fit() returns the estimator itself, so construction and training chain).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score each model on the held-out split: predict, then compare with y_test.
knn_pred = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_pred)

logreg_pred = logreg.predict(X_test)
logreg_accuracy = accuracy_score(y_test, logreg_pred)

decision_tree_pred = decision_tree.predict(X_test)
decision_tree_accuracy = accuracy_score(y_test, decision_tree_pred)

In [None]:
# Report the test accuracy of each model, one per line.
print(
    f"KNN Accuracy: {knn_accuracy}",
    f"Logistic Regression Accuracy: {logreg_accuracy}",
    f"Decision Tree Accuracy: {decision_tree_accuracy}",
    sep="\n",
)

In [None]:
# Compare the test accuracies of the three models in a bar chart.
models = ['KNN', 'Logistic Regression', 'Decision Tree']
accuracy = [knn_accuracy, logreg_accuracy, decision_tree_accuracy]

plt.figure(figsize=(5, 3))
plt.bar(x=models, height=accuracy, color=['blue', 'red', 'green'])
plt.title('Accuracy of different models')
plt.xlabel('Models')
plt.ylabel('Accuracy')
# The y-axis starts at 0.7, so bar heights show differences, not proportions.
plt.ylim(bottom=0.7, top=1)
plt.show()

In [None]:
# The decision tree is used as the final model because it had the highest
# accuracy in the comparison. It is trained again here under the name `model`.
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Training split: predictions, accuracy and per-class report.
# Note: `accuracy` is rebound from the list used for the bar chart to a float.
predictions = model.predict(X_train)
accuracy = accuracy_score(y_true=y_train, y_pred=predictions)
print(f"Accuracy score on training dataset: {accuracy}")
print(classification_report(y_true=y_train, y_pred=predictions))


# Test split: the same metrics on the rows the model has not seen.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy score on test dataset: {accuracy}")
print(classification_report(y_true=y_test, y_pred=predictions))

In [None]:
# Example: Predict the class for a new input
def predict_user_input():
    """Prompt for one connection record on stdin and print its predicted class.

    Thirteen features are read interactively (see ``prompts`` below). The
    categorical ones are one-hot encoded, the frame is aligned to
    ``X_train.columns`` and passed to ``model``; the numeric prediction is
    decoded with ``label_encoder`` and printed.

    Uses the notebook-level names ``X``, ``X_train``, ``model`` and
    ``label_encoder``, so the cells that define them must have run first.

    Every training column that is not prompted for is filled with 0, i.e. the
    prediction is made for a record whose remaining features are all zero.

    Returns:
        None. The result is printed.

    Raises:
        ValueError: if a numeric field is not an integer, or a categorical
            field holds a value that does not occur in the training data.
    """
    # Sample inputs noted by the original author (first 13 fields of dataset
    # rows, with the label of the full row):
    #   0,tcp,ftp_data,SF,491,0,0,0,0,0,0,0,0         -> normal
    #   0,tcp,remote_job,S0,0,0,0,0,0,0,0,0,0         -> anomaly
    #   0,tcp,finger,S0,0,0,0,0,0,0,0,0,0             -> anomaly
    #   38,tcp,ftp,SF,1441,4152,0,0,0,30,0,1,0        -> normal
    categorical_columns = ['protocol_type', 'service', 'flag']

    # (feature name, prompt text, parser), in the order the questions are asked.
    prompts = [
        ('duration', "Enter the duration: ", int),
        ('protocol_type', "Enter the protocol type: ", str),
        ('service', "Enter the service: ", str),
        ('flag', "Enter the flag: ", str),
        ('src_bytes', "Enter the source bytes: ", int),
        ('dst_bytes', "Enter the destination bytes: ", int),
        ('land', "Enter the land: ", int),
        ('wrong_fragment', "Enter the wrong fragment: ", int),
        ('urgent', "Enter the urgent: ", int),
        ('hot', "Enter the hot: ", int),
        ('num_failed_logins', "Enter the number of failed logins: ", int),
        ('logged_in', "Enter the logged in: ", int),
        ('num_compromised', "Enter the number of compromised: ", int),
    ]

    record = {}
    for feature, prompt, parse in prompts:
        # Strip stray whitespace so " tcp " still matches the training level.
        value = parse(input(prompt).strip())

        if feature in categorical_columns:
            # An unseen level has no indicator column and would silently be
            # treated as the baseline level, so reject it explicitly.
            known_values = set(X[feature].astype(str))
            if value not in known_values:
                raise ValueError(
                    f"Unknown {feature} {value!r}; expected one of: "
                    f"{', '.join(sorted(known_values))}"
                )

        record[feature] = [value]

    # Single-row frame holding the answers.
    input_data = pd.DataFrame(record)

    # Encode WITHOUT drop_first: a one-row frame has exactly one level per
    # categorical column, and drop_first=True would drop that level, leaving
    # no indicator columns at all (every categorical answer would be ignored).
    input_data_encoded = pd.get_dummies(
        input_data, columns=categorical_columns, drop_first=False
    )

    # Align to the training layout. Indicators for the baseline levels that
    # training dropped are discarded here, and every column that was not
    # supplied is created and set to 0.
    input_data_encoded = input_data_encoded.reindex(columns=X_train.columns, fill_value=0)

    # Predict with the trained model and decode back to the original label.
    prediction = model.predict(input_data_encoded)
    predicted_class = label_encoder.inverse_transform(prediction)

    print(f"Predicted value for network attack: {predicted_class[0]}")

# Run the interactive prediction once. The function reads its fields with
# input(), so execution waits here until every prompt has been answered.
predict_user_input()
