In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Paths to the train/test CSV files. These are bare filenames, so they are
# resolved against the notebook's current working directory.
train_path = 'train.csv'
test_path = 'test.csv'

# --- Load Data ---
try:
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
except FileNotFoundError as err:
    # Raise instead of calling exit(): in a notebook exit() ends the kernel
    # session, whereas an exception stops the run with a readable traceback
    # and leaves the kernel available for a retry.
    raise FileNotFoundError(
        f"Could not find '{train_path}' and/or '{test_path}'. "
        "Make sure both files are in the current working directory."
    ) from err

print("Successfully loaded train.csv and test.csv")

# --- Prepare Data ---
# Separate the features (X) from the target column ('Label') for both splits
X_train = train_df.drop(columns=['Label'])
y_train_labels = train_df['Label']
X_test = test_df.drop(columns=['Label'])
y_test_labels = test_df['Label']

# Impute missing values with the per-column means of the TRAINING set.
# The same statistics are applied to the test set so both splits go through
# identical preprocessing (matching how the scaler below is fitted on train
# only) and no test-set statistics influence the evaluation.
train_feature_means = X_train.mean()
X_train = X_train.fillna(train_feature_means)
X_test = X_test.fillna(train_feature_means)

# Encode Labels: convert website names (strings) to integer class ids
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_labels)
y_test = label_encoder.transform(y_test_labels)  # Reuse the encoder fitted on training data

print(f"\nLabels observed: {label_encoder.classes_}")
# A class's encoded id is its position in classes_, so enumerate() yields the
# same mapping as transform() while printing plain ints instead of np.int64(...).
print(f"Encoded labels mapping: {dict((name, code) for code, name in enumerate(label_encoder.classes_))}")

# Website names, in encoded-id order, kept for plotting later
website_names = label_encoder.classes_

# Scale Features: fit on the training split only, then apply to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # Reuse the scaler fitted on training data


# --- Linear Regression ---
# Linear Regression is not ideal for classification: it treats the encoded
# class ids as points on a numeric scale. It is implemented here only as the
# baseline comparison from the C400 report.
print("\n--- Linear Regression (Illustrative, Not Recommended for Classification) ---")
lin_reg = LinearRegression()
lin_reg.fit(X_train_scaled, y_train)

# Turn the continuous predictions into class ids: round to the nearest integer,
# then clip so every prediction is a valid encoded label.
y_pred_lin_raw = lin_reg.predict(X_test_scaled)
y_pred_lin = np.round(y_pred_lin_raw).astype(int)
y_pred_lin = np.clip(y_pred_lin, 0, len(label_encoder.classes_) - 1)

lin_accuracy = accuracy_score(y_test, y_pred_lin)
# zero_division=0 scores a class with no predictions as 0 instead of warning
lin_f1_macro = f1_score(y_test, y_pred_lin, average='macro', zero_division=0)

print(f"Linear Regression Test Accuracy (approx): {lin_accuracy:.2f}")
print(f"Linear Regression Test Macro F1 (approx): {lin_f1_macro:.2f}")
print("\nClassification Report (Linear Regression - Rounded):")
# Pass the full set of encoded ids as `labels`: without it, classification_report
# raises ValueError when a class appears in neither y_test nor the predictions,
# because the number of observed classes no longer matches target_names.
print(classification_report(
    y_test,
    y_pred_lin,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
    zero_division=0,
))


# --- Logistic Regression ---
# This is a standard baseline for classification problems.
print("\n--- Logistic Regression ---")
# `multi_class` is intentionally not passed: 'auto' was only restating the
# default, and the parameter is deprecated in scikit-learn 1.5+ (passing it
# emits a FutureWarning and will eventually be rejected).
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train)

y_pred_log = log_reg.predict(X_test_scaled)

log_accuracy = accuracy_score(y_test, y_pred_log)
# Macro F1 score, as used in the C400 report
log_f1_macro = f1_score(y_test, y_pred_log, average='macro', zero_division=0)

print(f"Logistic Regression Test Accuracy: {log_accuracy:.2f}") # Compare with C400 report's 0.83
print(f"Logistic Regression Test Macro F1: {log_f1_macro:.2f}")   # Compare with C400 report's 0.84
print("\nClassification Report (Logistic Regression):")
# Pass the full set of encoded ids as `labels` so the report does not raise
# ValueError when a class is absent from both y_test and the predictions.
print(classification_report(
    y_test,
    y_pred_log,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
    zero_division=0,
))

Successfully loaded train.csv and test.csv

Labels observed: ['ChatGPT' 'LinkedIn' 'Reddit' 'Wikipedia']
Encoded labels mapping: {'ChatGPT': np.int64(0), 'LinkedIn': np.int64(1), 'Reddit': np.int64(2), 'Wikipedia': np.int64(3)}

--- Linear Regression (Illustrative, Not Recommended for Classification) ---
Linear Regression Test Accuracy (approx): 0.55
Linear Regression Test Macro F1 (approx): 0.58

Classification Report (Linear Regression - Rounded):
              precision    recall  f1-score   support

     ChatGPT       0.67      1.00      0.80         2
    LinkedIn       0.60      0.50      0.55         6
      Reddit       0.25      0.40      0.31         5
   Wikipedia       0.83      0.56      0.67         9

    accuracy                           0.55        22
   macro avg       0.59      0.61      0.58        22
weighted avg       0.62      0.55      0.56        22


--- Logistic Regression ---
Logistic Regression Test Accuracy: 0.91
Logistic Regression Test Macro F1: 0.87

C





In [8]:
import plotly.graph_objects as go


def plot_predicted_vs_actual(actual_names, predicted_names, model_name,
                             trace_label, predicted_color, category_order):
    """Build a figure overlaying predicted and actual labels per test sample.

    Args:
        actual_names: True label names, one per test sample.
        predicted_names: Predicted label names, in the same sample order.
        model_name: Model name used as the prefix of the figure title.
        trace_label: Short model name shown in the predicted trace's legend entry.
        predicted_color: Line color of the predicted trace.
        category_order: Label names in the order they should appear on the y-axis.

    Returns:
        The configured ``go.Figure``; the caller decides when to show it.
    """
    fig = go.Figure()
    # Actual values: muted grey so the predicted trace stands out on top
    fig.add_trace(go.Scatter(
        y=actual_names,
        mode='lines+markers',
        name='Actual Values',
        line=dict(color='lightgrey', width=2),
        marker=dict(symbol='x', size=8)
    ))
    # Predicted values
    fig.add_trace(go.Scatter(
        y=predicted_names,
        mode='lines+markers',
        name=f'Predicted Values ({trace_label})',
        line=dict(color=predicted_color, width=2),
        marker=dict(symbol='circle', size=8)
    ))
    fig.update_layout(
        title=f'{model_name}: Predicted vs. Actual Website Labels (Test Set)',
        xaxis_title='Test Sample Index',
        yaxis_title='Website Label',
        # Pin the categorical axis to a fixed order instead of first-seen order
        yaxis=dict(categoryorder='array', categoryarray=category_order),
        legend_title='Legend'
    )
    return fig


# --- Plotting Linear Regression Results ---
# Inverse transform numeric labels back to website names for plotting
y_test_names_lin = label_encoder.inverse_transform(y_test)
y_pred_names_lin = label_encoder.inverse_transform(y_pred_lin)

fig_lin = plot_predicted_vs_actual(
    y_test_names_lin,
    y_pred_names_lin,
    model_name='Linear Regression',
    trace_label='Linear Reg',
    predicted_color='royalblue',
    category_order=website_names,
)
fig_lin.show()

# --- Logistic Regression ---
# Same fit and evaluation as the earlier Logistic Regression section, repeated
# here so this cell prints the metrics next to the plot.
print("\n--- Logistic Regression ---")
# `multi_class` is intentionally not passed: 'auto' was only restating the
# default, and the parameter is deprecated in scikit-learn 1.5+.
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train)

y_pred_log = log_reg.predict(X_test_scaled)

log_accuracy = accuracy_score(y_test, y_pred_log)
log_f1_macro = f1_score(y_test, y_pred_log, average='macro', zero_division=0)

print(f"Logistic Regression Test Accuracy: {log_accuracy:.2f}")
print(f"Logistic Regression Test Macro F1: {log_f1_macro:.2f}")
print("\nClassification Report (Logistic Regression):")
# Pass the full set of encoded ids as `labels` so the report does not raise
# ValueError when a class is absent from both y_test and the predictions.
print(classification_report(
    y_test,
    y_pred_log,
    labels=np.arange(len(website_names)),
    target_names=website_names,
    zero_division=0,
))


# --- Plotting Logistic Regression Results ---
# Inverse transform numeric labels back to website names for plotting
y_test_names_log = label_encoder.inverse_transform(y_test)
y_pred_names_log = label_encoder.inverse_transform(y_pred_log)

fig_log = plot_predicted_vs_actual(
    y_test_names_log,
    y_pred_names_log,
    model_name='Logistic Regression',
    trace_label='Logistic Reg',
    predicted_color='darkorange',
    category_order=website_names,
)
fig_log.show()





--- Logistic Regression ---
Logistic Regression Test Accuracy: 0.91
Logistic Regression Test Macro F1: 0.87

Classification Report (Logistic Regression):
              precision    recall  f1-score   support

     ChatGPT       0.67      1.00      0.80         2
    LinkedIn       1.00      1.00      1.00         6
      Reddit       1.00      0.60      0.75         5
   Wikipedia       0.90      1.00      0.95         9

    accuracy                           0.91        22
   macro avg       0.89      0.90      0.87        22
weighted avg       0.93      0.91      0.90        22





