In [0]:
# Prerequisite: LightGBM must be available in the notebook environment (e.g. `%pip install lightgbm`).

In [0]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import statistics
from sklearn.model_selection import train_test_split
from pyspark.sql.functions import floor, col
from pyspark.ml.feature import StringIndexer
from sklearn.metrics import accuracy_score
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import VectorAssembler
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import pickle

In [0]:
# Get (or create) the Spark session and read the raw transaction CSV from S3.
# The first row is used as the header and column types are inferred by Spark.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("ReadData")
    .getOrCreate()
)
df = spark.read.csv(
    "s3://hacknyu-fraud-detection-dataset/train_transaction.csv",
    header=True,
    inferSchema=True,
)

In [0]:
# Bucket TransactionDT into whole days by dividing by the number of seconds in a day.
# NOTE(review): this presumes TransactionDT is an offset expressed in seconds — confirm.
seconds_per_day = 24 * 60 * 60
df = df.withColumn(
    "TransactionDay",
    floor(col("TransactionDT") / seconds_per_day),
)

In [0]:
# D1n = TransactionDay - D1 (day index of the transaction minus the D1 value).
df = df.withColumn("D1n", df["TransactionDay"] - df["D1"])

In [0]:
# D3n = TransactionDay - D3 (day index of the transaction minus the D3 value).
df = df.withColumn("D3n", df["TransactionDay"] - df["D3"])

In [0]:
# Keep only the identifier, the label and the columns used downstream.
selected_columns = [
    "TransactionID",
    "isFraud",
    "TransactionAmt",
    "card1",
    "addr1",
    "D1n",
    "D3n",
    "dist1",
    "P_emaildomain",
]
df1 = df.select(*selected_columns)

In [0]:
# Render the selected columns with the notebook environment's `display` helper for a visual check.
display(df1)

In [0]:
# Replace missing purchaser e-mail domains with the placeholder 'Unknown'
# so the column can be indexed in the next step.
df1 = df1.fillna("Unknown", subset=["P_emaildomain"])

In [0]:
# Encode the e-mail domain string as a numeric index column ("email_encoded").
# NOTE(review): the fitted indexer is not saved anywhere in this notebook; any
# consumer that scores new data presumably needs the same mapping — confirm.
indexer = StringIndexer(inputCol="P_emaildomain", outputCol="email_encoded")
indexer_model = indexer.fit(df1)
df1 = indexer_model.transform(df1)

# Show a few rows to sanity-check the domain -> index mapping.
df1.select("P_emaildomain", "email_encoded").show(5)

In [0]:
# Final column set: same as before, with the raw e-mail domain replaced by its index.
model_columns = [
    "TransactionID",
    "isFraud",
    "TransactionAmt",
    "card1",
    "addr1",
    "D1n",
    "D3n",
    "dist1",
    "email_encoded",
]
df1 = df1.select(*model_columns)
display(df1)


In [0]:
# Switch to the `default` schema, then persist the cleaned frame as a managed
# table, replacing any existing contents.
spark.sql("USE default")
df1.write.saveAsTable("cleaned_data", mode="overwrite")

In [0]:
# Read the cleaned table back. From here on `df` refers to the cleaned data,
# no longer to the raw CSV frame.
# NOTE(review): the write above uses the unqualified name "cleaned_data" after
# `USE default`, while this read is fully qualified with the
# `real_time_fraud_detection` catalog — confirm both resolve to the same table.
cleaned_table = "real_time_fraud_detection.default.cleaned_data"
df = spark.table(cleaned_table)
df.show(n=5)

In [0]:
# Convert the cleaned Spark DataFrame into a pandas DataFrame (df2) for the
# scikit-learn / LightGBM steps that follow.
df2 = df.toPandas()

In [0]:
# Bare expression: the notebook renders the pandas DataFrame as this cell's output.
df2

In [0]:
# Print a summary of the pandas frame (columns, dtypes, non-null counts).
df2.info()

In [0]:
# --- Split label from features -------------------------------------------------
# NOTE(review): X keeps every non-label column, which includes TransactionID.
# An identifier is usually not a meaningful feature; it is left in place here
# because removing it would change the feature set the saved model expects.
target = 'isFraud'
y = df2[target]
X = df2.drop(columns=[target])

# --- Hold out 20% of the rows (fixed seed for reproducibility) ------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Wrap both splits as LightGBM datasets --------------------------------------
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# --- Booster configuration: binary classifier evaluated with log-loss -----------
params = dict(
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    verbose=-1,
)

# --- Train for 100 rounds; the held-out split is the evaluation set -------------
model = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[test_data],
)

# --- Score the held-out rows and threshold the probabilities at 0.5 -------------
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int).tolist()

In [0]:
# Fraction of held-out rows whose thresholded prediction matches the true label.
# NOTE(review): if the fraud class is rare, accuracy alone can look high even
# for a model that rarely predicts fraud — consider also checking recall/AUC.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_binary)
print(f"Accuracy: {accuracy:.4f}")

In [0]:
# Build a frame of the held-out feature rows labelled with the *predicted* class.
#
# `assign` returns a new DataFrame, so X_test itself is never modified. The
# previous form (`pd.DataFrame(X_test)` followed by column assignment) does not
# copy the data; depending on the pandas version the new column could also
# appear on X_test (breaking a later `model.predict(X_test)`) or raise a
# SettingWithCopyWarning.
# y_pred_binary is aligned positionally with the rows of X_test.
df3 = X_test.assign(isFraud=y_pred_binary)

In [0]:
# Stack the prediction-labelled rows (df3) underneath the existing data (df2)
# and renumber the index from 0.
# NOTE(review): this rebinds df2, so re-running the cell appends df3 again.
# NOTE(review): the appended rows reuse TransactionIDs already present in df2
# and carry predicted rather than true labels — confirm this is intended.
frames_to_stack = [df2, df3]
df2 = pd.concat(frames_to_stack, axis=0, ignore_index=True)

In [0]:
# Bare expression: display df2 after the predicted rows were appended.
df2

In [0]:
# Convert the combined pandas frame back into a Spark DataFrame so it can be
# written out as a table.
df2_spark = spark.createDataFrame(df2)

In [0]:
# Replace the `cleaned_data` table in the `default` schema with the combined frame.
# The table is dropped first and then re-created by the overwrite-mode write.
# NOTE(review): between the DROP and the write the table does not exist —
# confirm no concurrent reader depends on it.
spark.sql("USE default")
spark.sql("DROP TABLE IF EXISTS cleaned_data")
df2_spark.write.saveAsTable("cleaned_data", mode="overwrite")

In [0]:
import pickle
import boto3

# Serialize the trained LightGBM booster with pickle and upload it to S3.
#
# bucket_name : destination S3 bucket
# model_file  : local file name the pickle is written to (current working dir)
# s3_key      : object key inside the bucket
#
# NOTE(review): s3_key starts with "/", and S3 treats that slash as part of the
# key, so the object lands at "<bucket>//lightgbm_model.pkl". The key is left
# unchanged so that any existing consumer reading this exact key keeps working;
# confirm whether the leading slash is intentional.
bucket_name = "hacknyu-fraud-detection-dataset"
model_file = "lightgbm_model.pkl"
s3_key = "/lightgbm_model.pkl"

# Save LightGBM model locally.
# NOTE(review): consumers must only unpickle this file from a trusted bucket —
# pickle.load can execute arbitrary code.
with open(model_file, "wb") as f:
    pickle.dump(model, f)

# Upload to S3 (credentials are resolved by boto3's default provider chain).
s3 = boto3.client("s3")
s3.upload_file(model_file, bucket_name, s3_key)

# Report the object's actual location: the key that was uploaded, not the
# local file name (the two differ by the leading slash).
print(f"✅ Model uploaded to s3://{bucket_name}/{s3_key}")
