In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [6]:
# Step 1: Load the dataset
df = pd.read_csv('creditcard.csv')

# Inspect the first few rows of the dataset
print(df.head())

# Step 2: Preprocess the data
# The label column of this file is 'Class' (see the column listing in the
# notebook output); there is no 'Risk' column, so indexing by 'Risk' raises
# a KeyError. All remaining columns are used as features.
# NOTE(review): 'Class' is presumably a binary 0/1 label -- confirm against
# the dataset description.
TARGET_COLUMN = 'Class'
if TARGET_COLUMN not in df.columns:
    raise KeyError(
        f"Target column {TARGET_COLUMN!r} not found; "
        f"available columns: {list(df.columns)}"
    )

# A row without a label cannot be used for supervised training, so drop it
# rather than inventing a label by imputation.
df = df.dropna(subset=[TARGET_COLUMN])

# Separate features and target variable *before* imputing/encoding so that
# neither step can alter or rename the target column.
X = df.drop(columns=TARGET_COLUMN)
y = df[TARGET_COLUMN]

# Fill missing numeric feature values with the column median.
# numeric_only=True keeps this working if a non-numeric column is present
# (DataFrame.median raises TypeError on such columns in pandas >= 2.0).
# Reassigning instead of inplace=True keeps the cell safe to re-run.
X = X.fillna(X.median(numeric_only=True))

# Encode categorical feature columns, if any exist
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and test sets (80% train, 20% test).
# stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 3: Feature scaling (standardize the features)
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Train a logistic regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Step 5: Make predictions and evaluate the model
y_pred = model.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Confusion matrix and classification report (per-class precision/recall
# complement the single accuracy figure)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("Classification Report:")
print(classification_report(y_test, y_pred))

# (Optional) Step 6: Fine-tune the model or try other algorithms (e.g., Random Forest, XGBoost)

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')
