In [2]:
# DRW - Crypto Market Prediction Analysis
# Import necessary libraries

import pandas as pd
import numpy as np
import polars as pl
from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
import seaborn as sns
import os
import zipfile
import warnings
warnings.filterwarnings('ignore')

# Set display options for better output
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")
print("Ready to analyze DRW Crypto Market Prediction data...")


Libraries imported successfully!
Ready to analyze DRW Crypto Market Prediction data...


In [3]:
# Load and examine the crypto market data
# First, let's identify the main data files
data_dir = "./data/"
csv_files = [f for f in os.listdir(data_dir) if f.endswith('.csv')]
print(f"Found CSV files: {csv_files}")

# Load the main dataset(s)
train_data = None
test_data = None

# Try to identify train and test files
for file in csv_files:
    if 'train' in file.lower():
        print(f"\nLoading training data from: {file}")
        train_data = pd.read_csv(os.path.join(data_dir, file))
        print(f"Training data shape: {train_data.shape}")
    elif 'test' in file.lower():
        print(f"\nLoading test data from: {file}")
        test_data = pd.read_csv(os.path.join(data_dir, file))
        print(f"Test data shape: {test_data.shape}")

# If no train/test files found, load all CSV files
if train_data is None and csv_files:
    print(f"\nNo train file found. Loading first available CSV file: {csv_files[0]}")
    train_data = pd.read_csv(os.path.join(data_dir, csv_files[0]))
    print(f"Data shape: {train_data.shape}")

# Display basic information about the dataset
if train_data is not None:
    print("\n" + "="*60)
    print("CRYPTO MARKET DATASET OVERVIEW")
    print("="*60)
    print(f"Dataset shape: {train_data.shape}")
    print(f"Number of rows: {train_data.shape[0]:,}")
    print(f"Number of columns: {train_data.shape[1]:,}")
    
    print("\nColumn names:")
    for i, col in enumerate(train_data.columns):
        print(f"  {i+1:2d}. {col}")
        
    print("\nData types:")
    print(train_data.dtypes.value_counts())
    
    print("\nMemory usage:")
    print(f"Total memory usage: {train_data.memory_usage(deep=True).sum() / (1024**2):.2f} MB")
else:
    print("No data loaded. Please check if the data files exist.")


Found CSV files: ['sample_submission.csv']

No train file found. Loading first available CSV file: sample_submission.csv
Data shape: (538150, 2)

CRYPTO MARKET DATASET OVERVIEW
Dataset shape: (538150, 2)
Number of rows: 538,150
Number of columns: 2

Column names:
   1. ID
   2. prediction

Data types:
int64      1
float64    1
Name: count, dtype: int64

Memory usage:
Total memory usage: 8.21 MB


In [4]:
# Display the head of the dataset and examine structure
if train_data is not None:
    print("="*60)
    print("DATA HEAD - First 5 rows")
    print("="*60)
    display(train_data.head())
    
    print("\n" + "="*60)
    print("DATA TAIL - Last 5 rows")
    print("="*60)
    display(train_data.tail())
    
    print("\n" + "="*60)
    print("DATA SAMPLE - Random 5 rows")
    print("="*60)
    display(train_data.sample(5))
    
    print("\n" + "="*60)
    print("DETAILED COLUMN INFORMATION")
    print("="*60)
    print(train_data.info())
    
    print("\n" + "="*60)
    print("BASIC STATISTICS")
    print("="*60)
    display(train_data.describe())
    
    print("\n" + "="*60)
    print("MISSING VALUES ANALYSIS")
    print("="*60)
    missing_values = train_data.isnull().sum()
    missing_percent = (missing_values / len(train_data)) * 100
    missing_df = pd.DataFrame({
        'Column': missing_values.index,
        'Missing_Count': missing_values.values,
        'Missing_Percent': missing_percent.values
    })
    missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)
    
    if not missing_df.empty:
        print("Columns with missing values:")
        display(missing_df)
    else:
        print("✓ No missing values found!")
        
    print("\n" + "="*60)
    print("UNIQUE VALUES PER COLUMN")
    print("="*60)
    unique_counts = train_data.nunique().sort_values(ascending=False)
    print(unique_counts)
    
    # Check for potential datetime columns
    print("\n" + "="*60)
    print("POTENTIAL DATETIME COLUMNS")
    print("="*60)
    potential_datetime_cols = []
    for col in train_data.columns:
        if train_data[col].dtype == 'object':
            sample_values = train_data[col].dropna().head(3).tolist()
            print(f"{col}: {sample_values}")
            if any(keyword in col.lower() for keyword in ['time', 'date', 'timestamp']):
                potential_datetime_cols.append(col)
    
    if potential_datetime_cols:
        print(f"\nPotential datetime columns: {potential_datetime_cols}")
    
    # Check for potential categorical columns
    print("\n" + "="*60)
    print("POTENTIAL CATEGORICAL COLUMNS")
    print("="*60)
    categorical_cols = train_data.select_dtypes(include=['object']).columns
    
    if len(categorical_cols) > 0:
        print(f"Object columns found: {list(categorical_cols)}")
        for col in categorical_cols:
            unique_values = train_data[col].nunique()
            print(f"\n{col}:")
            print(f"  Unique values: {unique_values}")
            if unique_values <= 20:
                print(f"  Value counts: {train_data[col].value_counts().head(10).to_dict()}")
    else:
        print("No object/categorical columns found")
        
else:
    print("No data to display. Please check data loading.")


DATA HEAD - First 5 rows


Unnamed: 0,ID,prediction
0,1,-0.280233
1,2,1.371969
2,3,-2.045252
3,4,-1.447555
4,5,-1.303901



DATA TAIL - Last 5 rows


Unnamed: 0,ID,prediction
538145,538146,0.299954
538146,538147,0.441908
538147,538148,-0.681944
538148,538149,-1.019401
538149,538150,-0.162219



DATA SAMPLE - Random 5 rows


Unnamed: 0,ID,prediction
273387,273388,-1.090379
206284,206285,1.498398
360706,360707,1.578854
368277,368278,0.544395
355168,355169,-0.815276



DETAILED COLUMN INFORMATION
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 538150 entries, 0 to 538149
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   ID          538150 non-null  int64  
 1   prediction  538150 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 8.2 MB
None

BASIC STATISTICS


Unnamed: 0,ID,prediction
count,538150.0,538150.0
mean,269075.5,-0.001083
std,155350.66802,0.999291
min,1.0,-4.58191
25%,134538.25,-0.677657
50%,269075.5,0.00041
75%,403612.75,0.67295
max,538150.0,4.939038



MISSING VALUES ANALYSIS
✓ No missing values found!

UNIQUE VALUES PER COLUMN
ID            538150
prediction    538150
dtype: int64

POTENTIAL DATETIME COLUMNS

POTENTIAL CATEGORICAL COLUMNS
No object/categorical columns found


# 🔍 Data Exploration Complete - Ready for Next Steps!

## Summary
We have successfully loaded and examined the DRW Crypto Market Prediction dataset. The initial exploration gives us a good understanding of the data structure, types, and quality.

### 1. **Time Series Analysis** 🕐
- Convert timestamp columns to datetime format
- Analyze temporal patterns and trends
- Check for seasonality in crypto prices
- Examine trading volume patterns over time

### 2. **Feature Engineering** 🔧
- Create technical indicators (RSI, MACD, Bollinger Bands)
- Calculate moving averages (SMA, EMA)
- Generate price change ratios and volatility measures
- Create lag features for time series prediction

### 3. **Correlation Analysis** 📈
- Examine correlations between different cryptocurrencies
- Analyze relationships between price and volume
- Study correlation with market indicators
- Identify potential multicollinearity issues

### 4. **Visualization & EDA** 📊
- Plot price trends over time
- Create candlestick charts
- Volume analysis charts
- Distribution plots for key features

### 5. **Data Preprocessing** 🧹
- Handle missing values (if any)
- Normalize/scale features
- Create train/validation/test splits
- Address any data quality issues

### 6. **Model Selection** 🤖
- Time series models (ARIMA, Prophet)
- Machine learning models (Random Forest, XGBoost)
- Deep learning models (LSTM, GRU)
- Ensemble methods

### 7. **Evaluation Strategy** 📏
- Define appropriate metrics for crypto prediction
- Set up cross-validation for time series
- Create backtesting framework
- Risk-adjusted performance metrics

---

**💡 Recommendation**: Start with **Time Series Analysis** to understand the temporal nature of the data, then move to **Feature Engineering** to create predictive features before building models.
