In [None]:
# Jane Street Market Data Analysis
# Import necessary libraries

import os
import subprocess
import warnings
import zipfile

import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
# Suppress every warning category for the rest of the session
warnings.filterwarnings('ignore')

# Lift pandas' display limits (column count, total width, cell width)
for option_name in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(option_name, None)

# Plot theme and colour palette applied to every figure below
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")


In [None]:
# Create the data directory (if needed) and download the competition archive into it.
# Note: requires the Kaggle CLI to be installed and configured with your API key.
data_dir = "data"
if os.path.isdir(data_dir):
    print(f"Directory already exists: {data_dir}")
else:
    # exist_ok avoids a failure if the directory appears between the check and the call
    os.makedirs(data_dir, exist_ok=True)
    print(f"Created directory: {data_dir}")

# Single definition of the CLI invocation; reused for the manual-fallback hint below
download_cmd = [
    "kaggle", "competitions", "download",
    "-c", "jane-street-real-time-market-data-forecasting",
    "-p", data_dir,
]

try:
    print("Downloading Jane Street Market Data...")
    # Passed as an argument list (no shell), output captured as text for reporting
    result = subprocess.run(download_cmd, capture_output=True, text=True)

    if result.returncode == 0:
        print("Data downloaded successfully!")
        print(f"Files saved to: {data_dir}")
    else:
        # Fall back to stdout in case the CLI wrote its diagnostics there
        print(f"Error downloading data: {result.stderr or result.stdout}")
        print("Please make sure you have:")
        print("1. Kaggle CLI installed (pip install kaggle)")
        print("2. API key configured (~/.kaggle/kaggle.json)")
        print("3. Accepted the competition rules")
except (OSError, subprocess.SubprocessError) as e:
    # OSError covers the executable being missing or not runnable;
    # anything else is unexpected and is allowed to surface.
    print(f"Error: {e}")
    print("Please run the following command in terminal:")
    print(" ".join(download_cmd))


In [None]:
# Unpack the downloaded competition archive into the data directory
zip_path = os.path.join(data_dir, "jane-street-real-time-market-data-forecasting.zip")

if not os.path.exists(zip_path):
    print(f"Zip file not found at: {zip_path}")
    print("Please ensure the data has been downloaded first.")
else:
    print("Extracting data from zip file...")
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(data_dir)
    print("Data extracted successfully!")

# Report each regular file directly inside the data directory with its size in MB
print("\nFiles in data directory:")
if not os.path.exists(data_dir):
    print("Data directory not found.")
else:
    bytes_per_mb = 1024 * 1024
    for entry in os.listdir(data_dir):
        entry_path = os.path.join(data_dir, entry)
        if not os.path.isfile(entry_path):
            # Sub-directories are not listed
            continue
        size_mb = os.path.getsize(entry_path) / bytes_per_mb
        print(f"  {entry} ({size_mb:.2f} MB)")


In [None]:
# Load the main dataset and print a structural overview.
#
# Selection rule: the first CSV (in sorted order) whose name contains "train";
# otherwise the first CSV in sorted order. Sorting makes the choice deterministic,
# since os.listdir() returns entries in arbitrary order.
# NOTE(review): only *.csv files are considered. If the training set ships in another
# format (e.g. parquet), the fallback will load an unrelated CSV - confirm against the
# files listed by the extraction step.

if os.path.isdir(data_dir):
    csv_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.csv'))
else:
    # Missing directory is reported as "no data" below instead of raising
    csv_files = []
print(f"Found CSV files: {csv_files}")

train_data = None
test_data = None  # placeholder; nothing in this cell loads test data

train_file = next((f for f in csv_files if 'train' in f.lower()), None)
if train_file is not None:
    print(f"\nLoading training data from: {train_file}")
    train_data = pd.read_csv(os.path.join(data_dir, train_file))
    print(f"Training data shape: {train_data.shape}")
elif csv_files:
    # No file name mentions "train": fall back to the first CSV
    print(f"\nLoading first available CSV file: {csv_files[0]}")
    train_data = pd.read_csv(os.path.join(data_dir, csv_files[0]))
    print(f"Data shape: {train_data.shape}")

# Display basic information about the dataset
if train_data is not None:
    n_rows, n_cols = train_data.shape

    print("\n" + "="*50)
    print("DATASET OVERVIEW")
    print("="*50)
    print(f"Dataset shape: {train_data.shape}")
    print(f"Number of rows: {n_rows:,}")
    print(f"Number of columns: {n_cols:,}")

    print("\nColumn names:")
    for position, col in enumerate(train_data.columns, start=1):
        print(f"  {position:2d}. {col}")

    print("\nData types:")
    print(train_data.dtypes.value_counts())

    print("\nMemory usage:")
    # deep=True also counts the contents of object columns
    total_mb = train_data.memory_usage(deep=True).sum() / (1024**2)
    print(f"Total memory usage: {total_mb:.2f} MB")
else:
    print("No data loaded. Please check if the data files exist.")


In [None]:
# Show the first/last rows, summary statistics, missing-value counts and a
# reproducible random sample of the loaded dataset.
if train_data is not None:
    print("="*50)
    print("DATA HEAD (First 5 rows)")
    print("="*50)
    display(train_data.head())

    print("\n" + "="*50)
    print("DATA TAIL (Last 5 rows)")
    print("="*50)
    display(train_data.tail())

    print("\n" + "="*50)
    print("BASIC STATISTICS")
    print("="*50)
    display(train_data.describe())

    print("\n" + "="*50)
    print("MISSING VALUES")
    print("="*50)
    missing_values = train_data.isnull().sum()
    missing_percent = (missing_values / len(train_data)) * 100
    missing_df = pd.DataFrame({
        'Column': missing_values.index,
        'Missing_Count': missing_values.values,
        'Missing_Percent': missing_percent.values
    })
    # Keep only columns that actually have gaps, worst first
    missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

    if not missing_df.empty:
        display(missing_df)
    else:
        print("No missing values found!")

    print("\n" + "="*50)
    print("SAMPLE OF RANDOM ROWS")
    print("="*50)
    # sample(5) raises ValueError on frames with fewer than 5 rows, so cap the size;
    # a fixed seed keeps the displayed rows stable across Restart & Run All.
    sample_seed = 42
    sample_size = min(5, len(train_data))
    display(train_data.sample(sample_size, random_state=sample_seed))
else:
    print("No data to display.")


In [None]:
# Correlation analysis: pairwise matrix, strongly correlated pairs, heatmap,
# and a ranking of features by correlation with a likely target column.
if train_data is not None:
    print("="*50)
    print("CORRELATION ANALYSIS")
    print("="*50)

    # Correlation is only computed over numeric columns
    numeric_cols = train_data.select_dtypes(include=[np.number]).columns
    print(f"Found {len(numeric_cols)} numeric columns for correlation analysis")

    if len(numeric_cols) > 1:
        corr_matrix = train_data[numeric_cols].corr()

        print("\nCorrelation Matrix (top 10x10):")
        display(corr_matrix.iloc[:10, :10])

        # Strongly correlated pairs: read the strict upper triangle in one shot
        # (row-major order, i.e. the same order as a nested i < j loop).
        print("\nHighly Correlated Features (|correlation| > 0.7):")
        upper_rows, upper_cols = np.triu_indices(len(corr_matrix.columns), k=1)
        upper_values = corr_matrix.to_numpy()[upper_rows, upper_cols]
        # NaN correlations compare False and are therefore skipped
        is_strong = np.abs(upper_values) > 0.7
        high_corr_pairs = list(zip(
            corr_matrix.columns[upper_rows[is_strong]],
            corr_matrix.columns[upper_cols[is_strong]],
            upper_values[is_strong],
        ))

        if high_corr_pairs:
            high_corr_df = pd.DataFrame(high_corr_pairs, columns=['Feature1', 'Feature2', 'Correlation'])
            high_corr_df = high_corr_df.sort_values('Correlation', key=abs, ascending=False)
            display(high_corr_df.head(20))
        else:
            print("No highly correlated features found (|correlation| > 0.7)")

        # Heatmap; capped at 20x20 so the annotations stay legible
        if len(numeric_cols) > 20:
            corr_subset = corr_matrix.iloc[:20, :20]
            title = "Correlation Heatmap (First 20 features)"
        else:
            corr_subset = corr_matrix
            title = "Correlation Heatmap"

        fig, ax = plt.subplots(figsize=(12, 10))
        sns.heatmap(corr_subset, annot=True, cmap='coolwarm', center=0,
                    square=True, fmt='.2f', cbar_kws={"shrink": .8}, ax=ax)
        ax.set_title(title)
        plt.tight_layout()
        plt.show()

        # Target lookup: an exact name match among the *numeric* columns comes first
        # (a non-numeric column cannot be correlated against).
        target_candidates = ['target', 'label', 'y', 'price', 'return']
        target_col = next((c for c in target_candidates if c in numeric_cols), None)
        target_heading = None

        if target_col is not None:
            target_heading = f"\nCorrelation with target variable '{target_col}':"
        else:
            print("\nNo obvious target variable found. Looking for potential targets...")
            # Multi-character keywords may appear anywhere in the name; single-character
            # ones ('y') must be a whole underscore-separated token, otherwise every
            # column that merely contains the letter would qualify.
            long_keywords = [k for k in target_candidates if len(k) > 1]
            short_keywords = [k for k in target_candidates if len(k) == 1]
            for col in numeric_cols:
                lowered = str(col).lower()
                tokens = lowered.split('_')
                if any(k in lowered for k in long_keywords) or any(k in tokens for k in short_keywords):
                    target_col = col
                    target_heading = f"\nCorrelation with '{col}':"
                    break

        if target_col is not None:
            print(target_heading)
            # The matrix already holds these values, so reuse its column instead of
            # recomputing; drop the target's trivial correlation with itself.
            target_corr = (
                corr_matrix[target_col]
                .drop(labels=target_col)
                .sort_values(key=abs, ascending=False)
            )
            display(target_corr.head(10))
    else:
        print("Not enough numeric columns for correlation analysis")
else:
    print("No data available for correlation analysis")


In [None]:
# Further exploration: feature histograms, categorical summaries and quality checks
if train_data is None:
    print("No data available for additional analysis")
else:
    wide_rule = "="*50
    narrow_rule = "="*30

    print(wide_rule)
    print("ADDITIONAL EXPLORATORY ANALYSIS")
    print(wide_rule)

    # Histograms for up to the first six numeric columns on a 2x3 grid
    numeric_cols = train_data.select_dtypes(include=[np.number]).columns

    if len(numeric_cols) > 0:
        print("\nPlotting distributions for first 6 numeric features...")

        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        axes = axes.ravel()
        plotted_cols = numeric_cols[:6]

        for panel, feature in zip(axes, plotted_cols):
            train_data[feature].hist(bins=30, ax=panel, alpha=0.7)
            panel.set_title(f'Distribution of {feature}')
            panel.set_xlabel(feature)
            panel.set_ylabel('Frequency')

        # Panels without a feature are hidden rather than left blank
        for panel in axes[len(plotted_cols):]:
            panel.set_visible(False)

        plt.tight_layout()
        plt.show()

    # Summarise object-typed columns by cardinality
    categorical_cols = train_data.select_dtypes(include=['object']).columns

    if len(categorical_cols) > 0:
        print(f"\nFound {len(categorical_cols)} categorical columns:")
        for feature in categorical_cols:
            unique_values = train_data[feature].nunique()
            print(f"  {feature}: {unique_values} unique values")
            if unique_values <= 10:
                top_counts = train_data[feature].value_counts().head().to_dict()
                print(f"    Values: {top_counts}")

    print("\n" + narrow_rule)
    print("DATA QUALITY CHECKS")
    print(narrow_rule)

    # Fully duplicated rows
    duplicates = train_data.duplicated().sum()
    print(f"Duplicate rows: {duplicates}")

    # Columns with exactly one distinct value (nunique's default excludes NaN)
    constant_cols = [name for name in train_data.columns if train_data[name].nunique() == 1]

    if not constant_cols:
        print("No constant columns found")
    else:
        print(f"Constant columns (same value for all rows): {constant_cols}")

    # Ten largest memory_usage entries, reported in MB
    print("\nTop 10 columns by memory usage:")
    memory_usage = train_data.memory_usage(deep=True).sort_values(ascending=False)
    bytes_per_mb = 1024**2
    for name, n_bytes in memory_usage.head(10).items():
        print(f"  {name}: {n_bytes / bytes_per_mb:.2f} MB")

    print("\n" + wide_rule)
    print("READY FOR FURTHER ANALYSIS!")
    print(wide_rule)
    print("Data loaded successfully. You can now proceed with:")
    for step in ("1. Feature engineering", "2. Model building",
                 "3. Cross-validation", "4. Predictions"):
        print(step)
