## Imports

In [81]:
import polars as pl
from dotenv import load_dotenv
import polars.selectors as cs
from strip_markdown import strip_markdown
from google import genai
import os

## Environment Variables

In [28]:
# Load the variables declared in the local secrets file, then read the
# Gemini API key from the "gem" entry (None when that entry is not set).
load_dotenv(dotenv_path="secret.env")
GEMINI_API_KEY = os.environ.get("gem")

# Dataset Details

In [23]:
df = pl.read_csv("data/Project_Phase1.csv")


In [24]:
df.shape

(14036, 31)

In [20]:
df.schema

Schema([('syn_error_rate', Float64),
        ('connection_time', Int64),
        ('destination_same_source_port_rate', Float64),
        ('destination_different_server_rate', Float64),
        ('connection_status', String),
        ('connection_count', Int64),
        ('destination_server_different_host_rate', Float64),
        ('suspicious_activity', Int64),
        ('protocol', String),
        ('server_different_host_rate', Float64),
        ('destination_server_syn_error_rate', Float64),
        ('destination_host_server_count', Int64),
        ('destination_same_server_rate', Float64),
        ('fragment_errors', Int64),
        ('compromised_count', Int64),
        ('source_bytes', Int64),
        ('service_type', String),
        ('destination_syn_error_rate', Float64),
        ('same_server_rate', Float64),
        ('reset_error_rate', Float64),
        ('server_request_count', Int64),
        ('destination_server_reset_error_rate', Float64),
        ('server_reset_error_rate',

In [86]:
df.describe()

statistic,syn_error_rate,connection_time,destination_same_source_port_rate,destination_different_server_rate,connection_status,connection_count,destination_server_different_host_rate,suspicious_activity,protocol,server_different_host_rate,destination_server_syn_error_rate,destination_host_server_count,destination_same_server_rate,fragment_errors,compromised_count,source_bytes,service_type,destination_syn_error_rate,same_server_rate,reset_error_rate,server_request_count,destination_server_reset_error_rate,server_reset_error_rate,server_syn_error_rate,destination_bytes,guest_login,authentication_status,destination_host_count,destination_reset_error_rate,different_server_rate,class
str,f64,f64,f64,f64,str,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
"""count""",14036.0,14036.0,14036.0,14036.0,"""14036""",14036.0,14036.0,14036.0,"""14036""",14036.0,14036.0,14036.0,14036.0,14036.0,14036.0,14036.0,"""14036""",14036.0,14036.0,14036.0,14036.0,14036.0,14036.0,14036.0,14036.0,14036.0,14036.0,14036.0,14036.0,14036.0,"""14036"""
"""null_count""",0.0,0.0,0.0,0.0,"""0""",0.0,0.0,0.0,"""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""0"""
"""mean""",0.039788,184.191294,0.122315,0.044255,,28.048019,0.025829,0.221288,,0.122144,0.032572,182.678755,0.783955,0.001567,0.393346,11341.807922,,0.039907,0.94019,0.048265,27.400328,0.048468,0.049172,0.037901,4226.110074,0.012183,0.681177,151.48625,0.050431,0.032923,
"""std""",0.183392,1448.60408,0.257433,0.135233,,63.798968,0.07139,2.327768,,0.266557,0.168483,97.214193,0.348346,0.06751,13.949886,169682.168932,,0.180893,0.211918,0.211091,60.297916,0.20034,0.211871,0.179243,68334.655301,0.109706,0.466037,101.76511,0.203809,0.151932,
"""min""",0.0,0.0,0.0,0.0,"""OTH""",1.0,0.0,0.0,"""icmp""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""IRC""",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""anomaly"""
"""25%""",0.0,0.0,0.0,0.0,,1.0,0.0,0.0,,0.0,0.0,93.0,0.65,0.0,0.0,78.0,,0.0,1.0,0.0,2.0,0.0,0.0,0.0,72.0,0.0,0.0,43.0,0.0,0.0,
"""50%""",0.0,0.0,0.01,0.0,,5.0,0.0,0.0,,0.0,0.0,254.0,1.0,0.0,0.0,229.0,,0.0,1.0,0.0,6.0,0.0,0.0,0.0,347.0,0.0,1.0,168.0,0.0,0.0,
"""75%""",0.0,0.0,0.08,0.02,,15.0,0.03,0.0,,0.1,0.0,255.0,1.0,0.0,0.0,323.0,,0.0,1.0,0.0,18.0,0.0,0.0,0.0,1924.0,0.0,1.0,255.0,0.0,0.0,
"""max""",1.0,41476.0,1.0,1.0,"""SF""",511.0,1.0,77.0,"""udp""",1.0,1.0,255.0,1.0,3.0,884.0,7665876.0,"""whois""",1.0,1.0,1.0,511.0,1.0,1.0,1.0,5131424.0,1.0,1.0,255.0,1.0,1.0,"""normal"""


## User Defined Functions (UDF)

In [69]:
def skimmer(df: pl.DataFrame) -> dict:
    """Build a compact per-column summary of *df*.

    String (``pl.Utf8``) columns are summarised by their distinct values;
    every other column is summarised by its ``[min, max]`` pair. Each entry
    also records the column's polars dtype.

    Returns:
        A dict keyed by column name. Each value is either
        ``{"col_unique_values": [...], "data_type": dtype}`` or
        ``{"col_values_range": [min, max], "data_type": dtype}``.
    """

    def _summarise(series):
        # One summary entry for a single column.
        kind = series.dtype
        if kind == pl.Utf8:
            return {
                "col_unique_values": series.unique().to_list(),
                "data_type": kind,
            }
        return {
            "col_values_range": [series.min(), series.max()],
            "data_type": kind,
        }

    return {name: _summarise(df[name]) for name in df.columns}
skimmer(df)

{'syn_error_rate': {'col_values_range': [0.0, 1.0], 'data_type': Float64},
 'connection_time': {'col_values_range': [0, 41476], 'data_type': Int64},
 'destination_same_source_port_rate': {'col_values_range': [0.0, 1.0],
  'data_type': Float64},
 'destination_different_server_rate': {'col_values_range': [0.0, 1.0],
  'data_type': Float64},
 'connection_status': {'col_unique_values': ['SF',
   'RSTO',
   'REJ',
   'RSTOS0',
   'S0',
   'S2',
   'RSTR',
   'S3',
   'OTH',
   'S1'],
  'data_type': String},
 'connection_count': {'col_values_range': [1, 511], 'data_type': Int64},
 'destination_server_different_host_rate': {'col_values_range': [0.0, 1.0],
  'data_type': Float64},
 'suspicious_activity': {'col_values_range': [0, 77], 'data_type': Int64},
 'protocol': {'col_unique_values': ['tcp', 'udp', 'icmp'],
  'data_type': String},
 'server_different_host_rate': {'col_values_range': [0.0, 1.0],
  'data_type': Float64},
 'destination_server_syn_error_rate': {'col_values_range': [0.0, 1.0],


In [None]:
def TheAnalyzer(QuickSchema: dict) -> str:
    """Ask Gemini to explain a dataset schema summary and guess its origin.

    Args:
        QuickSchema: Per-column summary of the dataset (for example the
            dict produced by ``skimmer``). It is interpolated verbatim into
            the prompt.

    Returns:
        The model's answer with markdown formatting removed by
        ``strip_markdown``.
    """
    client = genai.Client(api_key=GEMINI_API_KEY)

    response = client.models.generate_content(
        model="gemini-2.5-pro-exp-03-25",
        # Use the argument rather than recomputing skimmer(df) from the
        # notebook globals, so the function describes what the caller passed.
        contents=f"Explain the schema of this dataset {QuickSchema} , also can u get which dataset it origins from approximately",
    )
    return strip_markdown(response.text)
print(TheAnalyzer(skimmer(df)))

## Data Description

## Syntax Testing

In [65]:
df["protocol"].unique()

protocol
str
"""icmp"""
"""tcp"""
"""udp"""


In [49]:
df.select(cs.string()).unique()

connection_status,protocol,service_type,class
str,str,str,str
"""SF""","""tcp""","""finger""","""normal"""
"""S0""","""tcp""","""exec""","""anomaly"""
"""S0""","""tcp""","""telnet""","""normal"""
"""REJ""","""tcp""","""Z39_50""","""anomaly"""
"""SF""","""tcp""","""IRC""","""normal"""
…,…,…,…
"""S0""","""tcp""","""auth""","""anomaly"""
"""S3""","""tcp""","""ftp_data""","""normal"""
"""REJ""","""tcp""","""iso_tsap""","""anomaly"""
"""SF""","""tcp""","""auth""","""normal"""


In [83]:
html = strip_markdown("#your_text_string\nt")
print(html)

your_text_string
t
