In [12]:
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


In [7]:

# Load the raw train/test splits. The last column of train.csv is treated as
# the label column and test.csv is expected to share the same column names.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Site name -> integer class id used as the classifier target.
label_mapping = {
    'ChatGPT': 0,
    'Reddit': 1,
    'Wikipedia': 2,
    'LinkedIn': 3
}

label_col = train_df.columns[-1]
feature_cols = train_df.columns[:-1]

# Encode labels. Assign by column name rather than through `.iloc[:, -1] = ...`:
# on pandas >= 2.0 the iloc form writes into the existing (object) column in
# place, so the column would never become numeric.
train_df[label_col] = train_df[label_col].map(label_mapping)
test_df[label_col] = test_df[label_col].map(label_mapping)

# Any value absent from label_mapping (or missing in the CSV) is NaN after map().
unmapped_train = train_df[label_col].isna().sum()
unmapped_test = test_df[label_col].isna().sum()
print(f"NaN values in train labels: {unmapped_train}")
print(f"NaN values in test labels: {unmapped_test}")
if unmapped_train or unmapped_test:
    # Fail loudly: mean-imputing a class id would silently invent labels.
    raise ValueError(
        f"Found labels that are missing or not in label_mapping "
        f"(train: {unmapped_train}, test: {unmapped_test})"
    )

# Coerce every feature column to numeric; unparseable entries become NaN.
for col in feature_cols:
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
    test_df[col] = pd.to_numeric(test_df[col], errors='coerce')

# Impute missing feature values with the TRAINING means for both splits, so no
# test-set statistics are used and both splits get the same fill values.
# fillna() with a Series only touches the columns named in its index, so the
# label column is left alone.
feature_means = train_df[feature_cols].mean()
train_df = train_df.fillna(feature_means)
test_df = test_df.fillna(feature_means)

# Show the resulting dtypes as a sanity check before modelling.
print(f"Data types in train_df:\n{train_df.dtypes}")

# Store labels as int64 class ids.
train_df[label_col] = train_df[label_col].astype('int64')
test_df[label_col] = test_df[label_col].astype('int64')

NaN values in train labels: 0
NaN values in test labels: 0
Data types in train_df:
Packet Count                    int64
Total Length                    int64
Avg Interval (s)              float64
Max Interval (s)              float64
Min Interval (s)              float64
Avg Length (bytes)            float64
Max Length (bytes)              int64
Min Length (bytes)              int64
Most Common Length (bytes)      int64
Label                           int64
dtype: object


In [8]:
# Separate each split into its feature matrix and its target vector.
X_train, y_train = train_df.drop(columns=['Label']), train_df['Label']
X_test, y_test = test_df.drop(columns=['Label']), test_df['Label']

# Random forest with fixed hyper-parameters and a fixed seed.
rf = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=123)
rf.fit(X_train, y_train)

# Predict class ids for the held-out split.
y_pred = rf.predict(X_test)

# Overall accuracy, reported as a percentage.
acc = accuracy_score(y_test, y_pred)
print(f"Random Forest Test Accuracy: {acc*100:.2f}%")

# Support-weighted F1 across classes, reported as a percentage.
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Random Forest Test F1 Score: {f1*100:.2f}%")


Random Forest Test Accuracy: 90.91%
Random Forest Test F1 Score: 90.35%


In [None]:

# --- Figure styling shared by the charts in this notebook ---
colors = {
    'bar_lin': 'rgba(70, 130, 180, 0.8)',
    'bar_log': 'rgba(210, 105, 30, 0.8)',
    'grid': '#d9d9d9',
    'text': '#333333',
    'background': '#ffffff',
    'axis_line': '#cccccc',
    'heatmap': 'Blues'  # Or 'Greys', 'YlGnBu', etc.
}
font_family = "Arial, sans-serif"

# Base layout passed to go.Figure(layout=...) below.
base_layout = go.Layout(
    font=dict(family=font_family, size=12, color=colors['text']),
    title_font_size=18,
    plot_bgcolor=colors['background'],
    paper_bgcolor=colors['background'],
    yaxis=dict(showgrid=True, gridcolor=colors['grid'], gridwidth=1, showline=False, ticks='outside', tickcolor=colors['axis_line'], zeroline=False),
    xaxis=dict(showgrid=False, showline=True, linecolor=colors['axis_line'], linewidth=1, ticks='outside', tickcolor=colors['axis_line'], zeroline=False),
    legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1, bgcolor='rgba(0,0,0,0)', bordercolor=colors['background']),
    margin=dict(l=80, r=40, t=80, b=60)
)

# Basic metrics
rf_accuracy = accuracy_score(y_test, y_pred)
rf_f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)

# Per-class report keyed by site name. `label_mapping` (site name -> class id)
# comes from the data-loading cell. Passing `labels=` pins the row order and
# keeps `target_names` the right length even when a class is missing from
# y_test or shows up only in y_pred.
class_ids = sorted(label_mapping.values())
id_to_name = {class_id: site for site, class_id in label_mapping.items()}
website_names = [id_to_name[class_id] for class_id in class_ids]
report_rf_dict = classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=website_names,
    output_dict=True,
    zero_division=0,
)

# One entry per model so further models can be appended to the same chart.
models = ['Random Forest']
accuracies = [rf_accuracy]
f1_macros = [rf_f1_macro]

fig_metrics = go.Figure(layout=base_layout)

fig_metrics.add_trace(go.Bar(
    x=models,
    y=accuracies,
    name='Accuracy',
    marker_color=colors['bar_lin'],
    text=[f'{score:.3f}' for score in accuracies],
    textposition='auto',
    width=0.3
))

fig_metrics.add_trace(go.Bar(
    x=models,
    y=f1_macros,
    name='Macro F1-Score',
    marker_color=colors['bar_log'],
    text=[f'{score:.3f}' for score in f1_macros],
    textposition='auto',
    width=0.3
))

fig_metrics.update_layout(
    title_text='Random Forest Performance',
    xaxis_title='Model',
    yaxis_title='Score',
    yaxis_range=[0, 1.05],  # scores are in [0, 1]; leave headroom above the bars
    yaxis_tickformat=".2f",
    barmode='group',
    legend_title_text='Metric'
)

fig_metrics.show()


In [24]:
import numpy as np
from sklearn.metrics import confusion_matrix
import plotly.figure_factory as ff

# === Step 1: Site name → integer class id
label_mapping = dict(ChatGPT=0, Reddit=1, Wikipedia=2, LinkedIn=3)

# === Step 2: Reverse lookup (class id → site name) for display labels
index_to_label = dict((class_id, site) for site, class_id in label_mapping.items())

# === Step 3: Confusion matrix computed over the numeric class ids
numeric_labels = [*index_to_label]  # [0, 1, 2, 3]
cm = confusion_matrix(y_test, y_pred, labels=numeric_labels)

# === Step 4: Site names in the same order as the ids passed to confusion_matrix
website_names = list(map(index_to_label.__getitem__, numeric_labels))

# === Step 5: Create the heatmap
def create_confusion_matrix_heatmap(cm, labels, title, color_scale='Blues'):
    """Build an annotated heatmap figure for a confusion matrix.

    Args:
        cm: Grid of values, iterated row by row; passed as ``z`` and also
            stringified cell by cell to form the annotations.
        labels: Tick labels, used for both the x and the y axis.
        title: Figure title.
        color_scale: Colorscale forwarded to ``ff.create_annotated_heatmap``.

    Returns:
        The figure produced by ``ff.create_annotated_heatmap`` after the
        layout and axis updates below have been applied.
    """
    # Annotation text mirrors the grid, one string per cell.
    cell_text = []
    for row in cm:
        cell_text.append([str(count) for count in row])

    heatmap = ff.create_annotated_heatmap(
        z=cm,
        x=labels,
        y=labels,
        annotation_text=cell_text,
        colorscale=color_scale,
        showscale=True,
    )

    layout_options = {
        'title_text': title,
        'xaxis_title': "Predicted Label",
        'yaxis_title': "True Label",
        'xaxis_side': "bottom",
        'yaxis_autorange': 'reversed',
        'xaxis': {'tickangle': -45},
        'margin': {'l': 100, 'r': 50, 't': 100, 'b': 100},
        'font': {'family': 'Arial', 'size': 12, 'color': 'black'},
        'plot_bgcolor': 'white',
        'paper_bgcolor': 'white',
    }
    heatmap.update_layout(**layout_options)

    # Force both axes to categorical so labels are not parsed as numbers/dates.
    for set_axis in (heatmap.update_xaxes, heatmap.update_yaxes):
        set_axis(type='category')
    return heatmap

# === Step 6: Plot the figure
# Build the Random Forest confusion-matrix figure, then render it.
fig_cm = create_confusion_matrix_heatmap(
    cm=cm,
    labels=website_names,
    title='Confusion Matrix: Random Forest',
    color_scale='Blues',
)
fig_cm.show()



