In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('server-log.txt', delim_whitespace=True)

# Convert 'Start-Date' and 'Start-Time' to datetime format
df['Start-Time'] = pd.to_datetime(df['Start-Date'] + ' ' + df['Start-Time'])

# Extract hours, minutes, and seconds from 'Start-Time'
df['Hour'] = df['Start-Time'].dt.hour
df['Minute'] = df['Start-Time'].dt.minute
df['Second'] = df['Start-Time'].dt.second

# Process 'Duration' to extract the last two digits as seconds
df['Duration-secs'] = df['Duration'].apply(lambda x: int(x[-2:]))


In [2]:
# Identify attacks that occurred between 8 am and 12 pm
attacks_df = df[(df['Hour'] >= 8) & (df['Hour'] < 12)]


In [3]:
# Identify unusual values (e.g., suspicious services)
suspicious_services = ['http', 'ntp/u', 'eco/i', 'smtp/u', 'scan/u', 'finger', 'domain/u', 'stats']

# Add any other suspicious services to this list
suspicious_attacks = attacks_df[attacks_df['Service'].isin(suspicious_services)]


# Combine Source-IP and Destination-IP to form IP pairs
attacks_df['IP-Pair'] = attacks_df['Source-IP'] + ' -> ' + attacks_df['Distination-IP']

# Set a threshold for suspicious activity
threshold = 100  # This can be adjusted as needed

# Count occurrences of each IP pair
ip_pair_counts = attacks_df['IP-Pair'].value_counts()

# Identify suspicious IP pairs
suspicious_ip_pairs = ip_pair_counts[ip_pair_counts > threshold]

# Filter the original dataframe to include only rows with suspicious IP pairs
suspicious_activities_df = attacks_df[attacks_df['IP-Pair'].isin(suspicious_ip_pairs.index)]

# Group by IP pairs and count occurrences
suspicious_ip_pair_counts = suspicious_activities_df.groupby('IP-Pair').size().reset_index(name='Count')

# Merge the counts back into the suspicious activities dataframe, keeping only one row per IP-Pair
unique_suspicious_activities = pd.merge(suspicious_activities_df.drop_duplicates(subset='IP-Pair'), 
                                        suspicious_ip_pair_counts, 
                                        on='IP-Pair')

# Display the unique suspicious activities with their counts
print("\nUnique Suspicious Activities with Counts:")
print(unique_suspicious_activities)
print(ip_pair_counts)


Unique Suspicious Activities with Counts:
       No  Start-Date          Start-Time  Duration Service Source-Port  \
0    2672  06/03/2014 2014-06-03 08:17:02  00:00:01    http        2594   
1    3588  06/03/2014 2014-06-03 08:43:43  00:00:01    http        9909   
2    3797  06/03/2014 2014-06-03 08:47:27  00:00:01    http       11334   
3    4095  06/03/2014 2014-06-03 08:50:31  00:00:01    http       14447   
4    4554  06/03/2014 2014-06-03 08:53:28  00:00:01    http       19130   
5    4899  06/03/2014 2014-06-03 08:56:00  00:00:01    http       23579   
6    6572  06/03/2014 2014-06-03 09:10:33  00:00:01    http        8253   
7    6680  06/03/2014 2014-06-03 09:12:06  00:00:01    http        8806   
8    7167  06/03/2014 2014-06-03 09:20:21  00:00:01    http       12989   
9    7389  06/03/2014 2014-06-03 09:23:47  00:00:01    http       15458   
10   8568  06/03/2014 2014-06-03 09:41:59  00:00:01    http       27209   
11   9319  06/03/2014 2014-06-03 10:10:46  00:00:01    ht

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attacks_df['IP-Pair'] = attacks_df['Source-IP'] + ' -> ' + attacks_df['Distination-IP']


In [4]:
import numpy as np

def calculate_entropy(counts):
    total_count = sum(counts)
    probabilities = [count / total_count for count in counts]
    entropy = -sum(p * np.log2(p) for p in probabilities if p > 0)
    return entropy

def detect_anomaly(current_counts, baseline_entropy, threshold=0.5):
    current_entropy = calculate_entropy(current_counts)
    if abs(current_entropy - baseline_entropy) > threshold:
        return True, current_entropy
    return False, current_entropy

# Example data
baseline_entropy = 3.5
threshold = 0.5

is_anomaly, current_entropy = detect_anomaly(ip_pair_counts, baseline_entropy, threshold)
if is_anomaly:
    print(f"Anomaly detected! Current entropy: {current_entropy}")
else:
    print(f"Traffic is normal. Current entropy: {current_entropy}")


Anomaly detected! Current entropy: 8.835617644292986
