
!["process"](etl_process_styled.gif)


In [33]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# -------------------------
# 1. Spark session
# -------------------------
spark = SparkSession.builder.getOrCreate()

# Load the CSV: column names come from the header row, types are inferred.
df = spark.read.csv("bcw_data.csv", header=True, inferSchema=True)

# Drop columns that are not features: the "id" column and every column whose
# name starts with "_c" (Spark's default naming for columns without a header).
bad_cols = [c for c in df.columns if c.startswith("_c") or c == "id"]
df = df.drop(*bad_cols)

# -------------------------
# 2. Label encoding
# -------------------------
from pyspark.sql.functions import when, col

# "M" becomes +1, every other diagnosis value becomes -1; the textual column
# is then removed so that only numeric columns remain.
df = df.withColumn("label", when(col("diagnosis") == "M", 1).otherwise(-1))
df = df.drop("diagnosis")

# -------------------------
# 3. Features + standardisation
# -------------------------
# Every remaining column except the label is used as a feature.
feature_cols = [c for c in df.columns if c != "label"]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_vec")
scaler = StandardScaler(inputCol="features_vec", outputCol="features", withMean=True, withStd=True)

df = assembler.transform(df)
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed further down, so the test rows contribute to the
# mean/std used for scaling. Confirm this is acceptable for the evaluation.
df = scaler.fit(df).transform(df).select("label", "features")

# -------------------------
# 4. Convert to NumPy for the sequential perceptron
# -------------------------
# Collect the rows ONCE and derive both arrays from the same Row objects.
# Collecting features and labels through two separate Spark actions would run
# the whole pipeline twice and would rely on both jobs returning the rows in
# the same order to keep each label aligned with its feature vector.
rows = df.collect()
X = np.array([row["features"].toArray() for row in rows])
y = np.array([row["label"] for row in rows])

# Train/test split: 20% held out, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -------------------------
# 5. Implémentation perceptron
# -------------------------
def perceptron_train(X, y, lr=0.01, epochs=100):
    """Train a perceptron with the mistake-driven update rule.

    Samples are visited sequentially in the order given. Whenever a sample
    satisfies ``y_i * (w . x_i + b) <= 0`` the parameters are moved by
    ``lr * y_i * x_i`` (weights) and ``lr * y_i`` (bias).

    Args:
        X: Array-like of shape (n_samples, n_features). Lists are accepted
            and converted with ``np.asarray``.
        y: Array-like of labels, one per row of ``X``. The update rule is
            written for labels in {-1, +1}. Rows and labels are paired with
            ``zip``, so surplus entries on either side are ignored.
        lr: Step size applied to every update.
        epochs: Maximum number of passes over the data.

    Returns:
        Tuple ``(w, b)``: the weight vector (float array of length
        n_features) and the bias (float).
    """
    # Accept plain Python sequences as well as NumPy arrays.
    X = np.asarray(X)
    y = np.asarray(y)

    w = np.zeros(X.shape[1])
    b = 0.0  # float from the start so the return type never depends on updates
    for _ in range(epochs):
        mistakes = 0
        for xi, yi in zip(X, y):
            if yi * (np.dot(w, xi) + b) <= 0:
                w += lr * yi * xi
                b += lr * yi
                mistakes += 1
        # A pass without any update leaves (w, b) untouched, so every further
        # pass would be identical: stopping here yields the same result.
        if mistakes == 0:
            break
    return w, b

def perceptron_predict(X, w, b):
    """Predict -1/+1 labels from a trained perceptron.

    Args:
        X: Array-like of samples, shape (n_samples, n_features).
        w: Weight vector of length n_features.
        b: Bias term.

    Returns:
        Array with the sign of ``X . w + b`` for each sample. A score of
        exactly zero is assigned to the positive class (+1).
    """
    scores = np.dot(X, w) + b
    labels = np.sign(scores)
    # np.sign returns 0 for a score of exactly 0, which is not a valid class
    # label; map such boundary samples to +1 and leave everything else as is.
    return np.where(labels == 0, 1.0, labels)

# Fit the perceptron on the training split: 20 passes, step size 0.01.
w, b = perceptron_train(X_train, y_train, epochs=20, lr=0.01)

# Label the held-out samples with the learned weights and bias.
y_pred = perceptron_predict(X_test, w=w, b=b)

# -------------------------
# 6. Accuracy
# -------------------------
# Fraction of held-out samples whose predicted label equals the true label.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"✅ Accuracy du Perceptron : {acc*100:.2f}%")


✅ Accuracy du Perceptron : 96.49%
