In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Paths are relative to the current working directory (where the notebook runs).
train_path = 'train.csv'
test_path = 'test.csv'

# --- Load Data ---
try:
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() ends the session without a
    # traceback, whereas an exception stops this cell and names the missing file.
    raise FileNotFoundError(
        f"Could not load the data ({err}). Make sure '{train_path}' and "
        f"'{test_path}' are in the current working directory."
    ) from err
else:
    print("Successfully loaded train.csv and test.csv")

# --- Prepare Data ---
# Separate features (X) from the target column ('Label') for both splits.
X_train = train_df.drop(columns=['Label'])
y_train_labels = train_df['Label']
X_test = test_df.drop(columns=['Label'])
y_test_labels = test_df['Label']

# Impute missing values with per-column means learned from the TRAINING set only.
# Using the test set's own means would derive preprocessing statistics from test
# data and would be inconsistent with the scaler, which is fitted on training
# data alone.
train_feature_means = X_train.mean()
X_train = X_train.fillna(train_feature_means)
X_test = X_test.fillna(train_feature_means)

# Encode Labels: turn the website names (strings) into integer class codes.
# The encoder is fitted on the training labels and reused for the test labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_labels)
y_test = label_encoder.transform(y_test_labels)

# Class names in encoder order; kept for the reports and plots later on.
website_names = label_encoder.classes_

encoded_mapping = dict(zip(website_names, label_encoder.transform(website_names)))
print(f"\nLabels observed: {website_names}")
print(f"Encoded labels mapping: {encoded_mapping}")

# Scale Features: statistics come from the training features only and are
# then applied unchanged to the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# --- Linear Regression ---
# Regressing on integer class codes is not a sound classifier; it is kept
# only for the baseline comparison with the C400 report.
print("\n--- Linear Regression (Illustrative, Not Recommended for Classification) ---")
lin_reg = LinearRegression().fit(X_train_scaled, y_train)

# Turn the continuous outputs into class codes: round to the nearest integer,
# then clamp into the range of valid encoded labels.
y_pred_lin_raw = lin_reg.predict(X_test_scaled)
highest_class_code = len(label_encoder.classes_) - 1
y_pred_lin = np.clip(np.round(y_pred_lin_raw).astype(int), 0, highest_class_code)

# zero_division=0 is passed to the F1 score and the report, as in the report's setup.
lin_accuracy = accuracy_score(y_test, y_pred_lin)
lin_f1_macro = f1_score(y_test, y_pred_lin, average='macro', zero_division=0)

print(f"Linear Regression Test Accuracy (approx): {lin_accuracy:.2f}")
print(f"Linear Regression Test Macro F1 (approx): {lin_f1_macro:.2f}")
print("\nClassification Report (Linear Regression - Rounded):")
print(
    classification_report(
        y_test,
        y_pred_lin,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
)


# --- Logistic Regression ---
# This is a standard baseline for classification problems.
print("\n--- Logistic Regression ---")
# `multi_class` is deliberately not passed: 'auto' is already the default, and
# the parameter is deprecated in recent scikit-learn releases (FutureWarning
# from 1.5 onwards), so spelling it out only adds a warning or a future failure.
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train)

y_pred_log = log_reg.predict(X_test_scaled)

log_accuracy = accuracy_score(y_test, y_pred_log)
# Macro F1 is the headline metric used in the C400 report.
log_f1_macro = f1_score(y_test, y_pred_log, average='macro', zero_division=0)

print(f"Logistic Regression Test Accuracy: {log_accuracy:.2f}") # Compare with C400 report's 0.83
print(f"Logistic Regression Test Macro F1: {log_f1_macro:.2f}")   # Compare with C400 report's 0.84
print("\nClassification Report (Logistic Regression):")
# zero_division=0 is passed to the report as well.
print(classification_report(y_test, y_pred_log, target_names=label_encoder.classes_, zero_division=0))

Successfully loaded train.csv and test.csv

Labels observed: ['ChatGPT' 'LinkedIn' 'Reddit' 'Wikipedia']
Encoded labels mapping: {'ChatGPT': np.int64(0), 'LinkedIn': np.int64(1), 'Reddit': np.int64(2), 'Wikipedia': np.int64(3)}

--- Linear Regression (Illustrative, Not Recommended for Classification) ---
Linear Regression Test Accuracy (approx): 0.55
Linear Regression Test Macro F1 (approx): 0.58

Classification Report (Linear Regression - Rounded):
              precision    recall  f1-score   support

     ChatGPT       0.67      1.00      0.80         2
    LinkedIn       0.60      0.50      0.55         6
      Reddit       0.25      0.40      0.31         5
   Wikipedia       0.83      0.56      0.67         9

    accuracy                           0.55        22
   macro avg       0.59      0.61      0.58        22
weighted avg       0.62      0.55      0.56        22


--- Logistic Regression ---
Logistic Regression Test Accuracy: 0.91
Logistic Regression Test Macro F1: 0.87

C





In [4]:
import plotly.graph_objects as go
import plotly.figure_factory as ff
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
import numpy as np
import pandas as pd

# Linear Regression - overall metrics plus the per-class report as a dict
# (output_dict=True) so individual F1 scores can be looked up by class name.
lin_accuracy = accuracy_score(y_test, y_pred_lin)
lin_f1_macro = f1_score(y_test, y_pred_lin, average='macro', zero_division=0)
report_lin_dict = classification_report(y_test, y_pred_lin, target_names=website_names, zero_division=0, output_dict=True)

# Logistic Regression - refit only if the model or its predictions are missing
# from the namespace.
# NOTE(review): LogisticRegression is not imported in this cell; the fallback
# relies on the import made in the earlier modelling cell.
if 'log_reg' not in locals() or 'y_pred_log' not in locals():
    print("Re-running Logistic Regression fitting/prediction...")
    # `multi_class` is omitted: 'auto' is the default and the parameter is
    # deprecated in recent scikit-learn releases.
    log_reg = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
    log_reg.fit(X_train_scaled, y_train)
    y_pred_log = log_reg.predict(X_test_scaled)

log_accuracy = accuracy_score(y_test, y_pred_log)
log_f1_macro = f1_score(y_test, y_pred_log, average='macro', zero_division=0)
report_log_dict = classification_report(y_test, y_pred_log, target_names=website_names, zero_division=0, output_dict=True)


# --- Style Definitions (NYT Inspired) ---
font_family = "Arial, sans-serif"

# Palette used by the figures below.
colors = dict(
    bar_lin='rgba(70, 130, 180, 0.8)',
    bar_log='rgba(210, 105, 30, 0.8)',
    grid='#d9d9d9',
    text='#333333',
    background='#ffffff',
    axis_line='#cccccc',
    heatmap='Blues',  # Or 'Greys', 'YlGnBu', etc.
)

# Layout elements shared by the figures; each figure adds its own titles,
# ranges and other overrides on top of this.
base_layout = go.Layout(
    font=dict(family=font_family, size=12, color=colors['text']),
    title_font_size=18,
    plot_bgcolor=colors['background'],
    paper_bgcolor=colors['background'],
    # Horizontal grid lines only; no axis line on the y side.
    yaxis=dict(
        showgrid=True,
        gridcolor=colors['grid'],
        gridwidth=1,
        showline=False,
        ticks='outside',
        tickcolor=colors['axis_line'],
        zeroline=False,
    ),
    # No vertical grid; a thin baseline along the x axis.
    xaxis=dict(
        showgrid=False,
        showline=True,
        linecolor=colors['axis_line'],
        linewidth=1,
        ticks='outside',
        tickcolor=colors['axis_line'],
        zeroline=False,
    ),
    # Horizontal legend anchored just above the plot area, right-aligned.
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=1.02,
        xanchor='right',
        x=1,
        bgcolor='rgba(0,0,0,0)',
        bordercolor=colors['background'],
    ),
    margin=dict(l=80, r=40, t=80, b=60),
)


# --- Visualization 1: Bar Chart of Overall Metrics ---

models = ['Linear Regression', 'Logistic Regression']
accuracies = [lin_accuracy, log_accuracy]
f1_macros = [lin_f1_macro, log_f1_macro]

fig_metrics = go.Figure(layout=base_layout)

# One bar series per metric, each with its own colour, so the two models sit
# side by side within every metric group.
for metric_name, metric_values, bar_color in (
    ('Accuracy', accuracies, colors['bar_lin']),
    ('Macro F1-Score', f1_macros, colors['bar_log']),
):
    fig_metrics.add_trace(
        go.Bar(
            x=models,
            y=metric_values,
            name=metric_name,
            marker_color=bar_color,
            text=[f'{value:.3f}' for value in metric_values],  # value labels on the bars
            textposition='auto',
            width=0.3,
        )
    )

fig_metrics.update_layout(
    barmode='group',  # bars side by side rather than stacked
    title_text='Overall Model Performance Comparison',
    legend_title_text='Metric',
    xaxis_title='Model',
    yaxis_title='Score',
    yaxis_range=[0, 1.05],  # scores live in [0, 1]; small headroom above
    yaxis_tickformat=".2f",
)

fig_metrics.show()


# --- Visualization 2: Bar Chart of Per-Class F1-Scores ---

# Pull each class's F1 out of the dict-form classification reports, in
# website_names order.
f1_lin_per_class = [report_lin_dict[site]['f1-score'] for site in website_names]
f1_log_per_class = [report_log_dict[site]['f1-score'] for site in website_names]

fig_f1_class = go.Figure(layout=base_layout)

# One bar series per model, grouped by class along the x axis.
for model_name, class_scores, bar_color in (
    ('Linear Regression', f1_lin_per_class, colors['bar_lin']),
    ('Logistic Regression', f1_log_per_class, colors['bar_log']),
):
    fig_f1_class.add_trace(
        go.Bar(
            x=website_names,
            y=class_scores,
            name=model_name,
            marker_color=bar_color,
            text=[f'{score:.2f}' for score in class_scores],
            textposition='outside',  # value labels above the bars
        )
    )

fig_f1_class.update_layout(
    barmode='group',
    title_text='Per-Class F1-Score Comparison',
    legend_title_text='Model',
    xaxis_title='Website Label (Class)',
    xaxis_tickangle=-45,  # slanted tick labels to avoid overlap
    yaxis_title='F1-Score',
    yaxis_range=[0, 1.05],
    yaxis_tickformat=".2f",
)

fig_f1_class.show()



# --- Visualization 3: Confusion Matrix Heatmaps ---

def create_confusion_matrix_heatmap(cm, labels, title, color_scale):
    """Build a styled, annotated heatmap figure for a confusion matrix.

    Args:
        cm: Confusion matrix as an iterable of rows. It supplies the heatmap
            values, and every cell is rendered with ``str`` as its annotation.
        labels: Class names used as tick labels on both axes (x is titled
            "Predicted Label", y is titled "True Label").
        title: Figure title.
        color_scale: Colorscale handed to the heatmap.

    Returns:
        The figure created by ``ff.create_annotated_heatmap`` with the shared
        ``base_layout`` styling and the confusion-matrix overrides applied.
    """
    # Annotations must be strings. No font_colors argument is passed, so the
    # annotation colours are whatever figure_factory chooses by default.
    cell_text = [[str(count) for count in row] for row in cm]

    fig = ff.create_annotated_heatmap(
        z=cm,
        x=list(labels),
        y=list(labels),
        annotation_text=cell_text,
        colorscale=color_scale,
        showscale=True,
    )

    # Shared styling goes on first so the specific settings below take precedence.
    fig.update_layout(base_layout)
    fig.update_layout(
        title_text=title,
        xaxis_title="Predicted Label",
        yaxis_title="True Label",
        xaxis_side="bottom",
        yaxis_autorange='reversed',  # first class at the top, as in a standard CM
        margin=dict(l=100, r=50, t=100, b=100),  # extra room for the axis labels
        xaxis=dict(tickangle=-45),  # slanted ticks for long class names
        showlegend=False,  # the legend styling from base_layout is not wanted here
    )

    # Force categorical axes on both sides.
    for set_axis_type in (fig.update_xaxes, fig.update_yaxes):
        set_axis_type(type='category')

    return fig

# Calculate confusion matrices.
# `labels=` pins the row/column order to the encoded form of website_names, so
# the matrix cells line up with the tick labels passed to the heatmap helper.
# (No existence check on website_names is needed here: it is already used
# earlier in this cell, and label_encoder is required below in any case.)
cm_labels_numeric = label_encoder.transform(website_names)
cm_lin = confusion_matrix(y_test, y_pred_lin, labels=cm_labels_numeric)
cm_log = confusion_matrix(y_test, y_pred_log, labels=cm_labels_numeric)


# Create and show heatmaps
fig_cm_lin = create_confusion_matrix_heatmap(
    cm_lin, website_names, 'Confusion Matrix: Linear Regression', colors['heatmap']
)
fig_cm_log = create_confusion_matrix_heatmap(
    cm_log, website_names, 'Confusion Matrix: Logistic Regression', colors['heatmap']
)

fig_cm_lin.show()
fig_cm_log.show()

In [6]:
import plotly.graph_objects as go
import numpy as np

# This cell relies on variables defined by earlier cells:
# y_test: The true numeric labels for the test set
# y_pred_lin: The predicted numeric labels from Linear Regression (rounded)
# y_pred_log: The predicted numeric labels from Logistic Regression
# website_names: The array of class names (e.g., ['ChatGPT', 'LinkedIn', ...])


def _plot_predictions_vs_actual(y_pred, y_true, class_names, title):
    """Overlay predicted and true class codes, one point per test sample.

    Args:
        y_pred: Predicted numeric class codes, plotted as the 'prediction' trace.
        y_true: True numeric class codes, plotted as the dashed 'real_values' trace.
        class_names: Class names used as y-axis tick text at positions
            0..len(class_names)-1.
        title: Figure title.

    Returns:
        A ``go.Figure`` with both traces and the layout applied. The caller
        decides when to show it.
    """
    fig = go.Figure()

    # Only `y` is given for each trace, so the x axis is the sample's position.
    fig.add_trace(go.Scatter(
        y=y_pred,
        mode='lines+markers',
        name='prediction',
        line=dict(color='navy', width=2),
        marker=dict(size=5)
    ))
    fig.add_trace(go.Scatter(
        y=y_true,
        mode='lines+markers',
        name='real_values',
        line=dict(color='lightcoral', width=2, dash='dash'), # Dashed to tell it apart from predictions
        marker=dict(size=5),
        opacity=0.8
    ))

    fig.update_layout(
        title=title,
        xaxis_title='Sample Index',
        yaxis_title='Website Label',
        yaxis=dict(
            tickmode='array',
            tickvals=np.arange(len(class_names)), # Numeric positions (0, 1, 2, ...)
            ticktext=class_names # Text labels ('ChatGPT', 'LinkedIn', ...)
        ),
        legend_title_text='Legend',
        template='plotly_white' # Or another template like 'plotly_dark', 'ggplot2'
    )
    return fig


# --- Create Plotly Plot for Linear Regression ---
fig_lin_plotly = _plot_predictions_vs_actual(
    y_pred_lin, y_test, website_names, 'Linear Regression Multi-class Classification'
)
fig_lin_plotly.show()


# --- Create Plotly Plot for Logistic Regression ---
fig_log_plotly = _plot_predictions_vs_actual(
    y_pred_log, y_test, website_names, 'Logistic Regression Multi-class Classification'
)
fig_log_plotly.show()