# Data Detective Platform - Exploratory Analysis

This notebook demonstrates the key features of the Data Detective Platform, showcasing data analysis, anomaly detection, and debugging capabilities.

In [None]:
# Import required libraries
from collections import Counter

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import IsolationForest

# Set up plotting
# Global figure defaults for every plot below: apply matplotlib's bundled
# 'seaborn-v0_8' style sheet first, then set seaborn's "husl" colour palette.
# NOTE(review): keep this order - the style sheet presumably sets its own colour
# cycle, so calling it after set_palette would likely override the palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 1. Data Loading and Initial Exploration

In [None]:
# Load sample datasets
# The three CSVs are read in order (employee, transaction, anomaly); the
# relative paths are resolved against the kernel's working directory.
employee_df, transaction_df, anomaly_df = map(
    pd.read_csv,
    (
        '../data/sample_employee_data.csv',
        '../data/sample_transaction_data.csv',
        '../data/sample_anomaly_data.csv',
    ),
)

# Print the (rows, columns) shape of each frame as a quick load check.
for _caption, _frame in (
    ("Employee Data Shape:", employee_df),
    ("Transaction Data Shape:", transaction_df),
    ("Anomaly Data Shape:", anomaly_df),
):
    print(_caption, _frame.shape)

In [None]:
# Basic data exploration
# For each frame, print a heading followed by DataFrame.info(), which writes
# its summary straight to stdout.
for _title, _frame in (
    ("\nEmployee Data Info:", employee_df),
    ("\nTransaction Data Info:", transaction_df),
):
    print(_title)
    _frame.info()

## 2. Data Quality Assessment (Sanity Checking)

In [None]:
# Check for missing values
# Per-column null counts: employee frame first, then a rule of 50 '='
# characters padded by blank lines, then the transaction frame.
print("Missing Values:")
print(
    employee_df.isna().sum(),
    "\n" + "="*50 + "\n",
    transaction_df.isna().sum(),
    sep="\n",
)

In [None]:
# Check for duplicates
# duplicated() marks rows that repeat an earlier row across all columns;
# summing the boolean mask gives the number of such rows per frame.
print("Duplicate Rows:")
for _label, _frame in (
    ("Employees:", employee_df),
    ("Transactions:", transaction_df),
):
    print(_label, _frame.duplicated().sum())

In [None]:
# Statistical summary
# describe() output for the two columns examined in this notebook:
# employee 'salary' and transaction 'amount'.
for _heading, _column in (
    ("\nEmployee Salary Statistics:", employee_df['salary']),
    ("\nTransaction Amount Statistics:", transaction_df['amount']),
):
    print(_heading)
    print(_column.describe())

## 3. Anomaly Detection

In [None]:
# Prepare data for anomaly detection
# Use every numeric column as a feature, EXCEPT the columns this cell writes
# back into employee_df. Without the exclusion, re-running the cell would pick
# up the integer 'anomaly_score' column from the previous run and feed the old
# labels to the model as a feature, changing the result on every execution.
derived_cols = ['anomaly_score', 'anomaly']
numeric_cols = (
    employee_df
    .drop(columns=derived_cols, errors='ignore')  # no-op on the first run
    .select_dtypes(include=[np.number])
    .columns
)
X = employee_df[numeric_cols]

# Fit Isolation Forest
# contamination=0.1 sets the expected share of outliers to 10%;
# random_state is fixed so repeated runs give the same result.
clf = IsolationForest(contamination=0.1, random_state=42)

# NOTE: despite its name, 'anomaly_score' stores the labels returned by
# fit_predict (-1 = outlier, 1 = inlier), not a continuous score. The column
# name is kept unchanged for compatibility with other cells.
employee_df['anomaly_score'] = clf.fit_predict(X)
employee_df['anomaly'] = employee_df['anomaly_score'] == -1

print("Anomalies detected:", employee_df['anomaly'].sum())
print("\nAnomalous records:")
# Last expression of the cell: rendered as a table of the flagged rows.
employee_df[employee_df['anomaly']]

In [None]:
# Visualize anomalies
# --- Data preparation (done up front so the plotting calls stay short) ---
# Parse the time-series timestamps into datetimes for the time axis.
anomaly_df['timestamp'] = pd.to_datetime(anomaly_df['timestamp'])
# Points whose absolute value exceeds 15 are highlighted as anomalies.
anomalies = anomaly_df[anomaly_df['value'].abs() > 15]
# Red for rows flagged by the Isolation Forest cell, blue for the rest.
point_colors = employee_df['anomaly'].map({True: 'red', False: 'blue'})

plt.figure(figsize=(12, 6))

# Left panel: age vs. salary, coloured by the anomaly flag.
plt.subplot(1, 2, 1)
plt.scatter(
    employee_df['age'],
    employee_df['salary'],
    c=point_colors,
    alpha=0.6,
)
plt.xlabel('Age')
plt.ylabel('Salary')
plt.title('Employee Data - Anomalies Highlighted')

# Right panel: the time series as a blue line with anomalous points in red.
plt.subplot(1, 2, 2)
plt.plot(anomaly_df['timestamp'], anomaly_df['value'], 'b-', alpha=0.7)
plt.scatter(anomalies['timestamp'], anomalies['value'], color='red', s=50)
plt.xlabel('Time')
plt.ylabel('Value')
plt.title('Time Series Anomalies')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

## 4. Data Flow Visualization

In [None]:
# Create a sample data flow graph
# Each pair is a directed edge (upstream stage, downstream stage).
flow_edges = [
    ('Data Source', 'ETL Process'),
    ('ETL Process', 'Data Warehouse'),
    ('Data Warehouse', 'Analytics'),
    ('Analytics', 'Reports'),
    ('Data Warehouse', 'API'),
    ('API', 'Applications'),
]
G = nx.DiGraph()
G.add_edges_from(flow_edges)

# Visualize the graph
# The layout seed is fixed so node positions are the same on every run.
plt.figure(figsize=(10, 8))
pos = nx.spring_layout(G, seed=42)
draw_options = dict(
    with_labels=True,
    node_color='lightblue',
    node_size=2000,
    font_size=10,
    font_weight='bold',
    arrows=True,
    arrowsize=20,
    edge_color='gray',
)
nx.draw(G, pos, **draw_options)
plt.title('Sample Data Flow Architecture')
plt.show()

## 5. Transaction Analysis

In [None]:
# Analyze transaction patterns
# Parse timestamps and derive the hour of day for each transaction.
transaction_df['timestamp'] = pd.to_datetime(transaction_df['timestamp'])
transaction_df['hour'] = transaction_df['timestamp'].dt.hour

# Check for suspicious transactions
# A row is flagged when its amount is above 1000 or negative.
suspicious = transaction_df[
    (transaction_df['amount'] > 1000) |
    (transaction_df['amount'] < 0)
]

print("Suspicious Transactions:")
print(suspicious)

# Transaction volume by hour
# size() counts rows per hour. The previous ['amount'].count() skipped rows
# whose amount is missing, so it undercounted the "number of transactions"
# whenever the amount column contained nulls.
plt.figure(figsize=(10, 6))
transaction_df.groupby('hour').size().plot(kind='bar')
plt.xlabel('Hour of Day')
plt.ylabel('Number of Transactions')
plt.title('Transaction Volume by Hour')
plt.show()

## 6. Log Analysis Simulation

In [None]:
# Simulate log analysis
# Inline sample log; each line has the form "<date> <time> <LEVEL> <message...>".
log_data = """
2023-09-01 10:15:23 INFO User login successful
2023-09-01 10:16:45 WARNING Invalid password attempt
2023-09-01 10:17:12 INFO Data export completed
2023-09-01 10:18:33 ERROR Database connection failed
2023-09-01 10:19:01 INFO API request processed
2023-09-01 10:20:15 WARNING Rate limit exceeded
"""

# Drop blank lines (the triple-quoted literal starts and ends with a newline).
log_lines = [line.strip() for line in log_data.splitlines() if line.strip()]

# The level is the third whitespace-separated field; lines with fewer than
# three fields are skipped. Each line is split only once.
log_levels = [
    fields[2]
    for fields in (line.split() for line in log_lines)
    if len(fields) >= 3
]

# Count log levels (Counter is imported in the notebook's top import cell)
level_counts = Counter(log_levels)

print("Log Level Distribution:")
for level, count in level_counts.items():
    print(f"{level}: {count}")

# Visualize log levels
# Materialise the dict views as lists so matplotlib receives plain sequences.
plt.figure(figsize=(8, 6))
plt.bar(list(level_counts.keys()), list(level_counts.values()))
plt.xlabel('Log Level')
plt.ylabel('Count')
plt.title('Log Level Distribution')
plt.show()

## Summary

This notebook demonstrates the core capabilities of the Data Detective Platform:

1. **Data Quality Assessment**: Checking for missing values, duplicates, and statistical anomalies
2. **Anomaly Detection**: Using machine learning to identify outliers in datasets
3. **Data Flow Visualization**: Mapping data movement through systems
4. **Transaction Analysis**: Identifying suspicious financial patterns
5. **Log Analysis**: Processing and analyzing system logs for issues

The platform integrates all these capabilities into an interactive web application for comprehensive data debugging and analysis.