<a href="https://colab.research.google.com/github/21BCS9692/Assignment/blob/main/credit-card-fraud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv('creditcard.csv')

# Basic info.
# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in print() emitted an extra "None" line after the report.
df.info()
print(df.describe())

# Check for missing values (per-column NaN counts)
print(df.isnull().sum())

# Class distribution: raw counts first, then the same counts as a bar chart.
print(df['Class'].value_counts())
fig, ax = plt.subplots()
sns.countplot(x='Class', data=df, ax=ax)
ax.set_title('Class Distribution')
plt.show()

In [None]:
# Feature engineering
#
# Adds six derived columns to `df` in place: hour, day, tx_freq_1h,
# tx_freq_24h, avg_amount_1h and max_amount_1h.

# Create time-based features.
# `Time` is divided by 3600 and by 3600 * 24, i.e. it is treated as a count of
# seconds. The arithmetic is done on the whole column at once instead of a
# per-row lambda; the resulting values are the same.
elapsed = df['Time']
df['hour'] = np.floor(elapsed / 3600) % 24
df['day'] = np.floor(elapsed / (3600 * 24))

# One grouper per key, reused for every aggregate that shares it.
amount_by_hour = df.groupby('hour')['Amount']
amount_by_day = df.groupby('day')['Amount']

# NOTE(review): `hour` is taken modulo 24, so the "*_1h" columns below pool all
# rows that share an hour-of-day across every day in the data, not one specific
# 60-minute window. Confirm this is the intended meaning of the "_1h" suffix.
# NOTE(review): these aggregates are computed over the whole frame, before any
# train/test split, so rows that later land in the test set contribute to them.

# Transaction frequency features
df['tx_freq_1h'] = amount_by_hour.transform('count')
df['tx_freq_24h'] = amount_by_day.transform('count')

# Amount statistics
df['avg_amount_1h'] = amount_by_hour.transform('mean')
df['max_amount_1h'] = amount_by_hour.transform('max')

In [None]:
# Build the training split this cell operates on.
# No earlier cell defines X_train / y_train, so on a fresh kernel the mask
# below failed with NameError. The split uses the same test_size and
# random_state as the other train_test_split calls in this notebook.
# It is not stratified here because y has not been checked for NaN yet.
from sklearn.model_selection import train_test_split

X = df.drop(columns='Class')
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Drop rows with NaN in either X_train or y_train.
# The mask is True for a row when any feature is NaN or its label is NaN.
nan_mask = X_train.isna().any(axis=1) | y_train.isna()
X_train_clean = X_train[~nan_mask]
y_train_clean = y_train[~nan_mask]

print(f"Removed {nan_mask.sum()} rows with NaN values")
print(f"New shapes - X: {X_train_clean.shape}, y: {y_train_clean.shape}")

In [None]:
# Count +/-inf entries among the numeric columns of the cleaned training set.
# np.isinf needs a numeric array, hence the dtype filter before to_numpy().
numeric_train = X_train_clean.select_dtypes(include=[np.number])
X_np = numeric_train.to_numpy()
inf_total = np.isinf(X_np).sum()
print(f"Infinite values in X_train: {inf_total}")

In [None]:
# Report the per-column dtypes of the features and the Python type of the target.
print("\nData types:")
feature_dtypes = X_train_clean.dtypes
print(feature_dtypes)
target_type = type(y_train_clean)
print(f"\ny_train type: {target_type}")

In [None]:
# Keep only the numeric feature columns and cast the target to int.
numeric_kinds = [np.number]
X_train_final = X_train_clean.select_dtypes(include=numeric_kinds)
y_train_final = y_train_clean.astype(int)  # target must be integer-typed

# Guard: neither the features nor the target may still contain NaN.
assert X_train_final.isna().sum().sum() == 0
assert y_train_final.isna().sum() == 0

In [None]:
# Validate the final training inputs before resampling.
# X_train_final / y_train_final are built by the preceding cell; rebuilding
# them here was redundant, so this cell only checks them.

# Verify no nulls remain
assert not X_train_final.isna().any().any(), "X_train_final still contains NaN"
assert not y_train_final.isna().any(), "y_train_final still contains NaN"

# Verify every feature value is finite. isna() does not flag +/-inf, and the
# SMOTE troubleshooting text in this notebook asks for exactly this check.
features_as_float = X_train_final.to_numpy(dtype=float)
assert np.isfinite(features_as_float).all(), "X_train_final contains +/-inf"

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class with SMOTE.
# On failure the troubleshooting hints are printed and the exception is then
# re-raised: X_res / y_res would be undefined, so swallowing the error only
# moved the crash to the next cell as an unrelated NameError.
try:
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X_train_final, y_train_final)
except Exception as e:
    print(f"\nSMOTE failed with error: {str(e)}")
    print("Troubleshooting steps:")
    print("1. Ensure all values are finite (run np.isfinite(X_train_final.to_numpy()).all())")
    print("2. Verify y has exactly 2 classes:", np.unique(y_train_final))
    raise  # keep the original traceback visible
else:
    # Only reached when fit_resample returned normally.
    print("\nSMOTE successful!")
    print(f"Resampled shapes - X: {X_res.shape}, y: {y_res.shape}")
    print("Class distribution:", pd.Series(y_res).value_counts())

In [None]:
# Summarise the resampled set: total NaN cells in X, NaN labels in y, and the
# number of rows per class.
print("Final validation:")
x_nan_total = X_res.isna().sum().sum()
print("NaN in X:", x_nan_total)
y_nan_total = y_res.isna().sum()
print("NaN in y:", y_nan_total)
per_class = y_res.value_counts()
print("Class balance:", per_class)

In [None]:
# Scatter the first two feature columns of the resampled set, coloured by label.
first_feature = X_res.iloc[:, 0]
second_feature = X_res.iloc[:, 1]
fig, ax = plt.subplots()
ax.scatter(first_feature, second_feature, c=y_res)
ax.set_title("Resampled Data")
plt.show()

In [None]:
import pandas as pd
import numpy as np

# Reload the CSV, coerce every column to a numeric dtype (values that cannot
# be parsed become NaN), then discard every row that has at least one NaN.
df = (
    pd.read_csv('creditcard.csv', engine='python')
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

# Features are everything except the label; the label is cast to int.
X = df.drop(columns='Class')
y = df['Class'].astype(int)

print(f"Final clean data: {X.shape[0]} samples")
print("Class balance:\n", y.value_counts())

In [None]:
from sklearn.utils import resample

# Split data first (avoid data leakage): oversampling is applied to the
# training part only, so no duplicated fraud row can end up in the test set.
from sklearn.model_selection import train_test_split

# stratify=y keeps the class ratio the same in both parts, so the number of
# rare positive rows in the test set does not depend on the luck of the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Manual oversampling (works when SMOTE fails)
fraud = X_train[y_train == 1]
non_fraud = X_train[y_train == 0]

fraud_upsampled = resample(fraud,
                          replace=True,  # sample with replacement: required to grow the class
                          n_samples=len(non_fraud),  # match majority class size
                          random_state=42)

# ignore_index=True gives X_res a fresh 0..n-1 index. Without it X_res kept the
# original (and, after upsampling, duplicated) row labels while y_res was
# numbered 0..n-1, so any label-aligned operation such as X_res[y_res == 1]
# paired the wrong rows or failed.
X_res = pd.concat([non_fraud, fraud_upsampled], ignore_index=True)
y_res = pd.Series([0]*len(non_fraud) + [1]*len(fraud_upsampled), name='Class')
assert len(X_res) == len(y_res)

print("\nResampling successful!")
print(f"New class balance:\n{y_res.value_counts()}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Fit a random forest on the resampled training data. fit() returns the
# estimator itself, so construction and training are chained.
model = RandomForestClassifier(class_weight='balanced', random_state=42).fit(X_res, y_res)

# Score the untouched test split and print the per-class report.
y_pred = model.predict(X_test)
print("\nModel Performance:")
performance_report = classification_report(y_test, y_pred)
print(performance_report)

In [None]:
# Quick checks on the resampled data: NaN totals, feature dtypes, label values.
print("\nSanity Checks:")
nan_cells_in_x = X_res.isna().sum().sum()
print("NaN in X_res:", nan_cells_in_x)
nan_labels_in_y = y_res.isna().sum()
print("NaN in y_res:", nan_labels_in_y)
print("X dtypes:\n", X_res.dtypes)
distinct_labels = np.unique(y_res)
print("Unique classes in y:", distinct_labels)

In [None]:
# Write the resampled features and labels to CSV, without the index column.
for resampled_part, csv_path in ((X_res, 'X_resampled.csv'), (y_res, 'y_resampled.csv')):
    resampled_part.to_csv(csv_path, index=False)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# End-to-end pipeline: load -> engineer features -> clean -> split ->
# oversample the training part -> train a random forest -> evaluate -> save.

# 1. Data Loading and Initial Exploration
df = pd.read_csv('creditcard.csv')
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() added a stray "None" line.
df.info()
print(df.describe())
print(df.isnull().sum())
print(df['Class'].value_counts())

# 2. Feature Engineering
# `Time` is treated as seconds (divided by 3600 and 3600 * 24). Column-wise
# arithmetic replaces the per-row lambda; the values are the same.
df['hour'] = np.floor(df['Time'] / 3600) % 24
df['day'] = np.floor(df['Time'] / (3600 * 24))
amount_by_hour = df.groupby('hour')['Amount']
amount_by_day = df.groupby('day')['Amount']
# NOTE(review): `hour` is modulo 24, so the "*_1h" aggregates pool rows sharing
# an hour-of-day across all days; they are also computed before the split, so
# test rows contribute to them. Confirm both are intended.
df['tx_freq_1h'] = amount_by_hour.transform('count')
df['tx_freq_24h'] = amount_by_day.transform('count')
df['avg_amount_1h'] = amount_by_hour.transform('mean')
df['max_amount_1h'] = amount_by_hour.transform('max')

# 3. Data Cleaning
df = df.dropna()
X = df.drop(columns='Class')
y = df['Class'].astype(int)

# 4. Train-Test Split (THIS MUST COME BEFORE ANY X_train/y_train OPERATIONS)
# stratify=y keeps the class ratio identical in both parts, which matters
# because the positive class is rare.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Handle Missing Values (Now we can use X_train/y_train)
# After the dropna() in step 3 this mask is expected to be all False; it is
# kept so X_train_clean / y_train_clean stay defined for the steps below.
nan_mask = X_train.isna().any(axis=1) | y_train.isna()
X_train_clean = X_train[~nan_mask]
y_train_clean = y_train[~nan_mask]

# 6. Resampling
try:
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X_train_clean, y_train_clean)
except Exception as e:
    print(f"SMOTE failed: {e}")
    # Manual resampling fallback.
    # `resample` is not among this cell's imports; importing it here keeps the
    # fallback from failing with NameError when the cell runs on its own.
    from sklearn.utils import resample

    fraud = X_train_clean[y_train_clean == 1]
    non_fraud = X_train_clean[y_train_clean == 0]
    fraud_upsampled = resample(fraud, replace=True, n_samples=len(non_fraud), random_state=42)
    # ignore_index=True renumbers X_res 0..n-1 so its index lines up with the
    # freshly built y_res instead of carrying duplicated original row labels.
    X_res = pd.concat([non_fraud, fraud_upsampled], ignore_index=True)
    y_res = pd.Series([0]*len(non_fraud) + [1]*len(fraud_upsampled), name='Class')

# 7. Model Training
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_res, y_res)

# 8. Evaluation (on the untouched, non-resampled test split)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# 9. Save Results
X_res.to_csv('X_resampled.csv', index=False)
y_res.to_csv('y_resampled.csv', index=False)