# In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV
import re


# Load the training set and discard every row that contains a missing value.
df = pd.read_parquet(
    "/kaggle/input/bind-reverse-shell-detection-challenge/train.parquet"
).dropna()


# Matches nc / ncat / netcat only as standalone words, so that unrelated
# tokens which merely contain the letters "nc" (rsync, functools, ...) are
# not counted.
_NETCAT_RE = re.compile(r'\b(?:nc|ncat|netcat)\b')


def extract_features(shell_str: str) -> dict:
    """Derive boolean lexical indicators from a shell command string.

    Args:
        shell_str: The raw command text to inspect.

    Returns:
        A dict mapping feature name to bool, with the keys (in this order)
        ``has_nc``, ``has_bash_i``, ``has_tcp``, ``has_bind_flag`` and
        ``has_redirect``.
    """
    return {
        'has_nc': _NETCAT_RE.search(shell_str) is not None,
        'has_bash_i': 'bash -i' in shell_str,
        'has_tcp': '/dev/tcp' in shell_str,
        # Deliberately loose substring heuristic: '-l' also fires on option
        # clusters such as '-la', and already covers '--listen', which is
        # spelled out only for readability.
        'has_bind_flag': ('-l' in shell_str) or ('--listen' in shell_str),
        'has_redirect': any(
            token in shell_str for token in ('>&', '<&', '> /dev')
        ),
    }


# One dict of hand-crafted indicators per training command.
feature_dicts = df['shell'].apply(extract_features)
# Reuse the Series index so each feature row keeps the same label as its
# source row in `df`; after dropna() that index may have gaps, and a fresh
# 0..n-1 index would misalign any later index-based join with `df`.
# NOTE(review): in the visible code df_features is not fed into the model.
df_features = pd.DataFrame(feature_dicts.tolist(), index=feature_dicts.index)


# Training target.
y = df["label"]

# TF-IDF over 1- to 3-grams, capped at 1000 features; max_df=0.95 is the
# document-frequency ceiling passed to the vectorizer.
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 3), min_df=1, max_df=0.95)
X_text = vectorizer.fit_transform(df["shell"])

# fit() returns the estimator itself, so construction and training are chained.
classifier = LogisticRegression(random_state=42, max_iter=1000).fit(X_text, y)


df_test = pd.read_parquet("/kaggle/input/bind-reverse-shell-detection-challenge/test.parquet")
# The submission below is built from every row of df_test, so rows with a
# missing command cannot simply be dropped; score them as empty text instead
# of handing a missing value to the vectorizer.
shells_test = df_test["shell"].fillna("").tolist()

# Reuse the vocabulary and IDF weights learned on the training data.
X_test_text = vectorizer.transform(shells_test)
predictions = classifier.predict(X_test_text)

# Attach the predictions and write the two-column submission file.
df_test["label"] = predictions
submission = df_test[["ID", "label"]]
submission.to_csv("./submission.csv", index=False)

# Report (rows, columns) of the submission.
print(submission.shape)

# Output: (8654, 2)
