<a href="https://colab.research.google.com/github/Danushika06/Dpt/blob/main/spark_%2B_kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1
!pip install -q pyspark openml

from google.colab import drive
drive.mount('/content/drive')

# --- Replaces Kaggle download with OpenML mirror (same schema) ---
import openml, pandas as pd, os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean, when, count, isnan
from pyspark.ml.feature import VectorAssembler, StandardScaler

# Download dataset from OpenML and save as creditcard.csv (schema-compatible)
d = openml.datasets.get_dataset(1597)  # "Credit Card Fraud" (ULB)
X, y, _, _ = d.get_data(target=d.default_target_attribute)
ccf = pd.concat([X, y.rename("Class")], axis=1)

path = "/content/openml_ccf"          # keep a folder "path" like KaggleHub returned
os.makedirs(path, exist_ok=True)
file_path = f"{path}/creditcard.csv"  # same filename used downstream
ccf.to_csv(file_path, index=False)
print("📁 Path to dataset files:", path)

# Create (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("DataPreprocessingChallenge")
    .getOrCreate()
)
print("✅ Spark session created successfully!")

# Load the CSV written above: first row is the header, column types are inferred.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
data = csv_reader.csv(file_path)
print("✅ Dataset loaded successfully.")

# Quick look at the inferred schema and the first few rows.
data.printSchema()
data.show(5)

# Count missing values
# NaN can only exist in floating-point columns, and isnan() on other types
# (e.g. timestamp, date, boolean) fails analysis with a type-mismatch error.
# So: null-or-NaN for float/double columns, null-only for everything else.
float_types = {'float', 'double'}
missing_exprs = [
    count(
        when(col(c).isNull() | isnan(col(c)), c)
        if t in float_types
        else when(col(c).isNull(), c)
    ).alias(c)
    for c, t in data.dtypes
]
missing_counts = data.select(missing_exprs)
print("🔍 Missing value count per column:")
missing_counts.show()

# Fill missing numeric columns with mean
# Spark simple type names of all integer / floating-point column types, so
# bigint, float, etc. are handled too (not only 'int' and 'double').
numeric_types = {'tinyint', 'smallint', 'int', 'bigint', 'float', 'double'}
numeric_cols = [c for c, t in data.dtypes if t in numeric_types]
# NOTE(review): if the label column "Class" is inferred as numeric it is
# mean-imputed and cast like any feature — confirm that is intended.

# Cast to double BEFORE imputing: fillna() casts the replacement value to the
# column's own type, so filling an integer column with its mean truncates it.
for column in numeric_cols:
    data = data.withColumn(column, col(column).cast("double"))

if numeric_cols:
    # One aggregation for all columns instead of one Spark job per column.
    # NaN is excluded explicitly: the average of a column containing NaN is
    # NaN, which would leave that column's missing values unfilled.
    mean_row = data.select([
        mean(when(~isnan(col(c)), col(c))).alias(c) for c in numeric_cols
    ]).first()
    # A column with no valid value has a null mean; nothing to fill it with.
    fill_values = {
        c: mean_row[c] for c in numeric_cols if mean_row[c] is not None
    }
    if fill_values:
        data = data.fillna(fill_values)

# Fill missing categorical columns (if any)
categorical_cols = [c for c, t in data.dtypes if t == 'string']
if categorical_cols:
    data = data.fillna('Unknown', subset=categorical_cols)

print("✅ Missing values handled.")

data.printSchema()
print("✅ Data types standardized.")

# Remove exact duplicate rows and report how many were dropped.
# Row count before de-duplication.
before = data.count()
# dropDuplicates() with no column subset compares entire rows.
data = data.dropDuplicates()
# Row count after de-duplication; the difference is the number removed.
after = data.count()

print(f"🧹 Removed {before - after} duplicate rows.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Persist the current Spark DataFrame 'data' as Parquet, replacing any
# previous output at the same location.
output_parquet_path = "/content/openml_ccf_parquet"
parquet_writer = data.write.mode("overwrite")
parquet_writer.parquet(output_parquet_path)
print(f"✅ Data saved as Parquet at: {output_parquet_path}")

# Usage hint — on later runs the Parquet copy can be loaded instead of the CSV:
# data = spark.read.parquet(output_parquet_path)
# print("✅ Data loaded from Parquet.")
# data.show(5)

In [None]:
# =========================================
# Download and Preprocess Credit Card Fraud Data
# =========================================

# 1️⃣ Import libraries
import os
import openml
# pandas is used below for pd.concat; this cell imports everything else it
# needs, so import it here too instead of relying on an earlier cell.
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import col

# -------------------------------
# 2️⃣ Download dataset (OpenML, same schema)
# -------------------------------
d = openml.datasets.get_dataset(1597)  # "Credit Card Fraud"
X, y, _, _ = d.get_data(target=d.default_target_attribute)
# Re-attach the target to the features as a column named "Class".
df_ = pd.concat([X, y.rename("Class")], axis=1)

# Write the combined table to creditcard.csv inside its own folder.
data_path = "/content/openml_ccf_2"
os.makedirs(data_path, exist_ok=True)
csv_file = os.path.join(data_path, "creditcard.csv")
df_.to_csv(csv_file, index=False)
print("Dataset downloaded to:", data_path)

# -------------------------------
# 3️⃣ Initialize Spark
# -------------------------------
spark = (
    SparkSession.builder
    .appName("CreditCardFraud")
    .getOrCreate()
)

# -------------------------------
# 4️⃣ Load CSV into Spark DataFrame
# -------------------------------
# Header row supplies the column names; column types are inferred.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
data = csv_reader.csv(csv_file)
print("=== Raw Data Sample ===")
data.show(5)

# -------------------------------
# 5️⃣ Handle Missing Values
# -------------------------------
# Pick numeric columns by Spark's simple type name from data.dtypes.
# Comparing str(field.dataType) with 'IntegerType'/'DoubleType' is fragile:
# on recent PySpark the string form is 'IntegerType()', so nothing matched and
# the fill below silently did nothing.
numeric_types = ('tinyint', 'smallint', 'int', 'bigint', 'float', 'double')
numeric_cols = [c for c, t in data.dtypes if t in numeric_types]
# Replace missing numeric values with 0 (skipped when there is nothing to fill).
if numeric_cols:
    data = data.fillna({c: 0 for c in numeric_cols})

# -------------------------------
# 6️⃣ Remove Duplicates
# -------------------------------
# No column subset: rows are compared in full.
data = data.dropDuplicates()

# -------------------------------
# 7️⃣ Feature Engineering
# -------------------------------
# 'TransactionHour': 'Time' divided by 3600, wrapped modulo 24, truncated to int.
hour_expr = ((col("Time") / 3600) % 24).cast("int")
data = data.withColumn("TransactionHour", hour_expr)

# -------------------------------
# 8️⃣ Assemble Features
# -------------------------------
# Every column except 'Time' and 'Class' goes into the feature vector.
excluded_cols = ['Time', 'Class']
feature_cols = [name for name in data.columns if name not in excluded_cols]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
assembled = assembler.transform(data)

# -------------------------------
# 9️⃣ Standardize Features
# -------------------------------
# Center (withMean) and scale to unit standard deviation (withStd).
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withMean=True,
    withStd=True,
)
scaler_model = scaler.fit(assembled)
scaled_data = scaler_model.transform(assembled)

# Show a few standardized vectors without truncating them.
scaled_data.select("scaled_features").show(5, truncate=False)

# -------------------------------
# 10️⃣ Keep Spark running
# -------------------------------
# Do NOT call spark.stop() here: `scaled_data` is still transformed, shown
# and written to CSV in the following cell, and any Spark action fails once
# the session has been stopped. Call spark.stop() manually only after the
# last cell that uses Spark has finished.


In [None]:
from pyspark.sql.functions import floor, when, col

# Column expressions for the two engineered features.
# Transaction_Hour: 'Time' divided by 3600, rounded down.
hour_expr = floor(col("Time") / 3600)

# Amount_Category: below 10 -> "Low", below 100 -> "Medium", otherwise "High".
amount = col("Amount")
category_expr = (
    when(amount < 10, "Low")
    .when(amount < 100, "Medium")
    .otherwise("High")
)

scaled_data = (
    scaled_data
    .withColumn("Transaction_Hour", hour_expr)
    .withColumn("Amount_Category", category_expr)
)

# Preview the source columns next to the derived ones.
preview_cols = ["Time", "Transaction_Hour", "Amount", "Amount_Category"]
scaled_data.select(*preview_cols).show(5)
print("✅ Feature engineering completed.")

# The vector columns are removed before writing the CSV output.
vector_cols = ("features", "scaled_features")
clean_data = scaled_data.drop(*vector_cols)

# Destination of the CSV output.
output_path = "/content/cleaned_creditcard_data.csv"

# Write with a header row, replacing any previous output at that path.
csv_writer = clean_data.write.mode("overwrite").option("header", True)
csv_writer.csv(output_path)

print(f"✅ Cleaned dataset saved successfully at: {output_path}")

# Final recap printed for the reader, one line per step.
# NOTE(review): the saved dataframe comes from the pipeline that fills numeric
# gaps with 0 and drops the scaled vector before writing — confirm the wording
# of the bullets below still matches what was actually saved.
summary_lines = [
    "📊 Data Preprocessing Summary",
    "- Missing values handled (numeric: mean, categorical: 'Unknown')",
    "- Data types standardized to double",
    "- Duplicates removed",
    "- Features normalized using StandardScaler",
    "- Engineered features: Transaction_Hour, Amount_Category",
    "✅ Dataset ready for downstream analytics or ML tasks. 🎯",
]
for summary_line in summary_lines:
    print(summary_line)
