In [None]:
import pandas as pd

# Load the uploaded dataset
df = pd.read_csv("creditcard.csv")

# Show column dtypes / non-null counts, then a preview of the first rows.
# These are two separate statements on purpose: `df.info(), df.head()` builds a
# tuple, so the cell output becomes "(None, <plain-text frame>)" instead of the
# rich table rendering of the last expression.
df.info()
df.head()


In [None]:
from sklearn.preprocessing import StandardScaler

# Drop duplicate rows. The explicit .copy() gives df_cleaned its own data, so
# the column assignment below does not raise SettingWithCopyWarning and `df`
# is guaranteed to stay untouched.
df_cleaned = df.drop_duplicates().copy()

# Normalize 'Time' and 'Amount' using StandardScaler.
# Output columns follow the input column order: Time -> norm_time, Amount -> norm_amount.
scaler = StandardScaler()
df_cleaned[['norm_time', 'norm_amount']] = scaler.fit_transform(df_cleaned[['Time', 'Amount']])

# Drop original 'Time' and 'Amount' columns (optional for modeling)
df_cleaned = df_cleaned.drop(columns=['Time', 'Amount'])

# Display the updated dataframe structure
df_cleaned.info()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load your dataset
data = pd.read_csv('creditcard.csv')

# Drop the label and 'Time'. 'Amount' is kept and standardized together with
# the remaining feature columns below.
features = data.drop(['Class', 'Time'], axis=1)

# Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Apply PCA (2 components for 2D plot). random_state pins the result when
# scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=2, random_state=42)
pca_features = pca.fit_transform(scaled_features)

# Apply KMeans. n_init is set explicitly because its default differs between
# scikit-learn releases; pinning it keeps the clustering reproducible.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
clusters = kmeans.fit_predict(pca_features)

# Create a DataFrame for visualization.
# NOTE: the 'V1'/'V2' columns here hold the two principal components computed
# above, not columns of the input file; the names are kept for compatibility.
df_plot = pd.DataFrame(pca_features, columns=['V1', 'V2'])
df_plot['Cluster'] = clusters
df_plot['Class'] = data['Class'].values  # Optional: to compare with true labels

# Plot using seaborn; label the axes as principal components so the figure is
# not mistaken for a plot of raw input columns.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df_plot, x='V1', y='V2', hue='Cluster', palette='Set2', ax=ax)
ax.set_xlabel('Principal component 1')
ax.set_ylabel('Principal component 2')
ax.set_title('KMeans Clusters on PCA Features')
plt.show()



In [None]:
pip install seaborn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set style for plots
sns.set(style="whitegrid")

# Step 2: Exploratory Data Analysis (EDA)

# Count of fraudulent vs. non-fraudulent transactions (indexed by class label)
class_counts = df_cleaned['Class'].value_counts()

# Percentage of fraud cases (.loc makes the label-based lookup of class 1 explicit)
fraud_percentage = (class_counts.loc[1] / class_counts.sum()) * 100

# Plot class distribution.
# The x variable is also passed as `hue` because newer seaborn releases
# deprecate `palette` without `hue`; dodge=False keeps one full-width bar per
# class on releases that would otherwise offset hue levels.
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(
    x=class_counts.index,
    y=class_counts.values,
    hue=class_counts.index,
    palette=["skyblue", "salmon"],
    dodge=False,
    ax=ax,
)

# The hue legend only repeats the x tick labels, so drop it when one was drawn.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()

ax.set_title(f"Transaction Class Distribution\n(Fraudulent: {fraud_percentage:.4f}%)")
ax.set_xticks([0, 1])
ax.set_xticklabels(['Legitimate (0)', 'Fraudulent (1)'])
ax.set_ylabel("Number of Transactions")
ax.set_xlabel("Transaction Type")
fig.tight_layout()
plt.show()

In [None]:
pip install xgboost

In [None]:
pip install imblearn

In [None]:
# Final Code for Credit Card Fraud Detection Project
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE

# Load the dataset
df = pd.read_csv("creditcard.csv")

# Remove duplicates (.copy() so the column assignment below works on an
# independent frame and does not raise SettingWithCopyWarning)
df = df.drop_duplicates().copy()

# Normalize 'Time' and 'Amount'
# NOTE: the scaler is fitted on all rows, so its mean/std also reflect rows
# that later land in the test split.
scaler = StandardScaler()
df[['norm_time', 'norm_amount']] = scaler.fit_transform(df[['Time', 'Amount']])
df = df.drop(['Time', 'Amount'], axis=1)

# Define features and label
X = df.drop('Class', axis=1)
y = df['Class']

# Train/test split BEFORE resampling, stratified so both parts keep the
# original class ratio. The test set therefore contains only real transactions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Handle class imbalance using SMOTE on the training part only. Resampling the
# whole dataset before splitting would put synthetic points interpolated from
# test rows into the training data and score the models on synthetic samples.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Initialize models
# (`use_label_encoder` is omitted: it is a deprecated XGBoost argument.)
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss'),
}

# Fit on the balanced training data, evaluate on the untouched test split
for name, model in models.items():
    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(X_test)
    y_score = model.predict_proba(X_test)[:, 1]  # probability of class 1
    print(f"\n{name} Results:")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print("ROC AUC:", roc_auc_score(y_test, y_score))



In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, RocCurveDisplay

# Evaluate the Random Forest fitted in the model-comparison loop. It is looked
# up from `models` so this cell runs top-to-bottom on a fresh kernel and the
# estimator matches the feature columns of the X_test currently in scope.
rf_model = models['Random Forest']

y_pred = rf_model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# ROC curve of the same estimator on the same test split
RocCurveDisplay.from_estimator(rf_model, X_test, y_test)
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load data
data = pd.read_csv('creditcard.csv')

# Define features and target
X = data.drop(['Class', 'Time'], axis=1)
y = data['Class']

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data (stratified on the label)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, stratify=y, random_state=42)

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Get feature importances; X_scaled is a plain array, so the names come from X
importances = rf_model.feature_importances_
feature_names = X.columns

# Create a DataFrame. Sorting ascending and taking the tail keeps the 10
# largest importances with the largest last, which barh draws at the top.
feat_imp = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feat_imp = feat_imp.sort_values(by='Importance', ascending=True).tail(10)  # Top 10 features

# Plot on an explicit Axes: DataFrame.plot opens its own figure when no `ax`
# is given, which would leave a separate empty figure and ignore the figsize.
fig, ax = plt.subplots(figsize=(8, 6))
feat_imp.plot(kind='barh', x='Feature', y='Importance', legend=False, ax=ax)
ax.set_title('Top Feature Importances from Random Forest')
ax.set_xlabel('Importance')
fig.tight_layout()
plt.show()

