In [None]:
# Cell 1: imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Cell 2: load dataset (replace path with local CSV or Kaggle link)
# Example: breast_cancer.csv (scikit-learn or Kaggle)
DATA_PATH = "data/breast_cancer_wisconsin.csv"
df = pd.read_csv(DATA_PATH)
# Inspect. Both values are printed explicitly: a bare `df.head()` expression is
# only rendered when it is the last statement of a notebook cell, and is
# silently discarded anywhere else (including when run as a script).
print(df.shape)
print(df.head())

# Cell 3: basic preprocessing
# Drop the ID column if present, encode the target, and handle missing values.
if 'id' in df.columns:
    df = df.drop(columns=['id'])

# Encode the 'diagnosis' column: 'M' -> 1 (positive class), 'B' -> 0.
# Any other value maps to NaN and is removed by the row-wise dropna below.
if 'diagnosis' in df.columns:
    df['target'] = df['diagnosis'].map({'M': 1, 'B': 0})
    df = df.drop(columns=['diagnosis'])
elif 'target' not in df.columns:
    # Fail here with an actionable message instead of an opaque KeyError
    # from the later df.drop(columns=['target']).
    raise KeyError(
        "Expected a 'diagnosis' or 'target' column; found: "
        + ", ".join(map(str, df.columns))
    )

# Fill/clean missing
# Remove columns that are completely empty first (e.g. an "Unnamed: N" column
# created by trailing commas in the CSV). Otherwise the row-wise dropna would
# discard every row, because each row has a NaN in that column.
df = df.dropna(axis=1, how='all')
df = df.dropna()

# Features and labels
X = df.drop(columns=['target'])
y = df['target']

# Cell 4: train/test split
# Split BEFORE scaling so the scaler's mean/std are estimated from the
# training rows only; fitting on the full dataset would leak test-set
# statistics into preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features: fit on the training split, then apply the same
# transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so that any later cell referring to the full scaled matrix still works;
# it uses the training-fitted scaler.
X_scaled = scaler.transform(X)

# Cell 5: train RandomForest
# Build a 200-tree forest with a fixed seed, using all available cores, and
# fit it on the training split (fit() returns the fitted estimator itself).
clf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Cell 6: evaluate (accuracy + F1)
# Hard predictions drive the metrics; the second predict_proba column
# (probability of the positive class) is kept for the priority mapping.
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# Headline scores
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("Accuracy:", acc)
print("F1-score:", f1)

# Per-class breakdown and confusion matrix
report = classification_report(y_test, y_pred)
print(report)
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", cm)

# Cell 7: Map to 'issue priority' (example mapping)
# High priority if the model's positive-class probability is >= 0.85,
# medium if 0.6 <= prob < 0.85, else low.
def prob_to_priority(p, *, high=0.85, medium=0.6):
    """Map a probability to a priority label.

    Args:
        p: Probability of the positive class.
        high: Inclusive lower bound for the "high" label (default 0.85).
        medium: Inclusive lower bound for the "medium" label (default 0.6).

    Returns:
        "high" if ``p >= high``, "medium" if ``medium <= p < high``,
        otherwise "low".

    Raises:
        ValueError: If ``medium`` is greater than ``high``, which would make
            the "medium" band unreachable.
    """
    if medium > high:
        raise ValueError("medium threshold must not exceed high threshold")
    if p >= high:
        return "high"
    if p >= medium:
        return "medium"
    return "low"

# One priority label per test sample, in the same order as y_proba.
priority_labels = list(map(prob_to_priority, y_proba))

# For evaluation, map true labels to a priority baseline (example)
# This is synthetic — ideally you have true issue priority labels to evaluate.
