In [5]:
# ============================================================================
# CELL 1: Setup and Imports
# ============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (10, 6)

# Configure pandas display
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

print("‚úÖ Libraries imported successfully!")
print("üìä Ready for data exploration")


NameError: name 'pd' is not defined

In [4]:
# ============================================================================
# CELL 2: Load Data
# ============================================================================

# Load the dataset
data_path = '/app/data/raw/phelps_et_al_2016.xlsx'

try:
    df = pd.read_excel(data_path)
    print(f"‚úÖ Data loaded successfully!")
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {len(df.columns)}")
except FileNotFoundError:
    print("‚ùå Data file not found. Make sure your Excel file is in data/raw/")
    print("Current working directory contents:")
    print(list(Path('/app/data/raw/').glob('*')))
except Exception as e:
    print(f"‚ùå Error loading data: {e}")


‚ùå Error loading data: name 'pd' is not defined


In [3]:
# ============================================================================
# CELL 3: Quick Data Overview
# ============================================================================

print("=" * 60)
print("DATASET OVERVIEW")
print("=" * 60)

print(f"Shape: {df.shape[0]} rows √ó {df.shape[1]} columns")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

print("\n" + "=" * 40)
print("COLUMN NAMES")
print("=" * 40)
for i, col in enumerate(df.columns, 1):
    print(f"{i:2d}. {col}")

print("\n" + "=" * 40)
print("DATA TYPES")
print("=" * 40)
print(df.dtypes.value_counts())

DATASET OVERVIEW


NameError: name 'df' is not defined

In [None]:
# ============================================================================
# CELL 4: First Look at Data
# ============================================================================

print("=" * 60)
print("FIRST 5 ROWS")
print("=" * 60)
display(df.head())

print("\n" + "=" * 60)
print("LAST 5 ROWS")
print("=" * 60)
display(df.tail())

print("\n" + "=" * 60)
print("RANDOM SAMPLE")
print("=" * 60)
display(df.sample(5))


In [None]:
# ============================================================================
# CELL 5: Data Quality Check
# ============================================================================

print("=" * 60)
print("DATA QUALITY ASSESSMENT")
print("=" * 60)

# Missing values
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100

missing_summary = pd.DataFrame({
    'Column': missing_data.index,
    'Missing_Count': missing_data.values,
    'Missing_Percentage': missing_percent.values
}).sort_values('Missing_Count', ascending=False)

print("Missing Values Summary:")
display(missing_summary[missing_summary['Missing_Count'] > 0])

if missing_summary['Missing_Count'].sum() == 0:
    print("‚úÖ No missing values found!")

# Duplicates
duplicate_count = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicate_count}")
if duplicate_count > 0:
    print("‚ö†Ô∏è Found duplicate rows - consider investigating")
else:
    print("‚úÖ No duplicate rows found")


In [None]:
# ============================================================================
# CELL 6: Descriptive Statistics
# ============================================================================

# Separate numerical and categorical columns
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

print("=" * 60)
print("VARIABLE TYPES")
print("=" * 60)
print(f"Numerical variables ({len(numerical_cols)}): {numerical_cols}")
print(f"Categorical variables ({len(categorical_cols)}): {categorical_cols}")

if numerical_cols:
    print("\n" + "=" * 60)
    print("NUMERICAL STATISTICS")
    print("=" * 60)
    display(df[numerical_cols].describe())

if categorical_cols:
    print("\n" + "=" * 60)
    print("CATEGORICAL STATISTICS")
    print("=" * 60)
    for col in categorical_cols[:5]:  # Show first 5 categorical columns
        print(f"\n--- {col} ---")
        print(f"Unique values: {df[col].nunique()}")
        if df[col].nunique() <= 20:  # Only show value counts if not too many unique values
            print("Value counts:")
            display(df[col].value_counts().head(10))
        else:
            print("Too many unique values to display (showing first 10 most common):")
            display(df[col].value_counts().head(10))


In [None]:
# ============================================================================
# CELL 7: Visualizations - Distribution Plots
# ============================================================================

if numerical_cols:
    print("Creating distribution plots for numerical variables...")
    
    # Calculate grid size
    n_cols = min(3, len(numerical_cols))
    n_rows = (len(numerical_cols) + n_cols - 1) // n_cols
    
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 5*n_rows))
    
    # Handle single subplot case
    if len(numerical_cols) == 1:
        axes = [axes]
    elif n_rows == 1:
        axes = axes if len(numerical_cols) > 1 else [axes]
    else:
        axes = axes.flatten()
    
    for i, col in enumerate(numerical_cols):
        if i < len(axes):
            # Create histogram with KDE
            axes[i].hist(df[col].dropna(), bins=30, alpha=0.7, edgecolor='black', density=True)
            
            # Add KDE curve
            from scipy import stats
            try:
                kde_data = df[col].dropna()
                if len(kde_data) > 1:
                    density = stats.gaussian_kde(kde_data)
                    xs = np.linspace(kde_data.min(), kde_data.max(), 100)
                    axes[i].plot(xs, density(xs), 'r-', linewidth=2)
            except:
                pass
            
            axes[i].set_title(f'Distribution of {col}', fontweight='bold')
            axes[i].set_xlabel(col)
            axes[i].set_ylabel('Density')
            axes[i].grid(True, alpha=0.3)
    
    # Hide empty subplots
    for j in range(len(numerical_cols), len(axes)):
        axes[j].set_visible(False)
    
    plt.tight_layout()
    plt.show()

In [None]:
# ============================================================================
# CELL 8: Box Plots for Outlier Detection
# ============================================================================

if numerical_cols:
    print("Creating box plots for outlier detection...")
    
    # Calculate grid size
    n_cols = min(3, len(numerical_cols))
    n_rows = (len(numerical_cols) + n_cols - 1) // n_cols
    
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 5*n_rows))
    
    # Handle single subplot case
    if len(numerical_cols) == 1:
        axes = [axes]
    elif n_rows == 1:
        axes = axes if len(numerical_cols) > 1 else [axes]
    else:
        axes = axes.flatten()
    
    for i, col in enumerate(numerical_cols):
        if i < len(axes):
            bp = axes[i].boxplot(df[col].dropna(), patch_artist=True)
            bp['boxes'][0].set_facecolor('lightblue')
            bp['boxes'][0].set_alpha(0.7)
            
            axes[i].set_title(f'Box Plot: {col}', fontweight='bold')
            axes[i].set_ylabel(col)
            axes[i].grid(True, alpha=0.3)
    
    # Hide empty subplots
    for j in range(len(numerical_cols), len(axes)):
        axes[j].set_visible(False)
    
    plt.tight_layout()
    plt.show()


In [None]:
# ============================================================================
# CELL 9: Correlation Analysis
# ============================================================================

if len(numerical_cols) > 1:
    print("=" * 60)
    print("CORRELATION ANALYSIS")
    print("=" * 60)
    
    # Calculate correlation matrix
    correlation_matrix = df[numerical_cols].corr()
    
    print("Correlation Matrix:")
    display(correlation_matrix.round(3))
    
    # Create correlation heatmap
    plt.figure(figsize=(12, 10))
    
    # Create mask for upper triangle
    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
    
    # Generate heatmap
    sns.heatmap(correlation_matrix, 
                mask=mask,
                annot=True, 
                cmap='coolwarm', 
                center=0, 
                square=True, 
                fmt='.2f',
                cbar_kws={"shrink": .8})
    
    plt.title('Correlation Matrix Heatmap', fontsize=16, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.show()
    
    # Find highly correlated pairs
    high_corr_pairs = []
    for i in range(len(correlation_matrix.columns)):
        for j in range(i+1, len(correlation_matrix.columns)):
            corr_value = correlation_matrix.iloc[i, j]
            if abs(corr_value) > 0.7:  # Threshold for high correlation
                high_corr_pairs.append((
                    correlation_matrix.columns[i], 
                    correlation_matrix.columns[j], 
                    corr_value
                ))
    
    if high_corr_pairs:
        print("\nüîç Highly Correlated Pairs (|r| > 0.7):")
        for col1, col2, corr in high_corr_pairs:
            print(f"  ‚Ä¢ {col1} ‚Üî {col2}: {corr:.3f}")
    else:
        print("\n‚úÖ No highly correlated pairs found (|r| > 0.7)")

In [None]:
# ============================================================================
# CELL 10: Categorical Data Visualization
# ============================================================================

if categorical_cols:
    print("Creating visualizations for categorical variables...")
    
    # Show first 3 categorical columns
    for col in categorical_cols[:3]:
        if df[col].nunique() <= 20:  # Only plot if not too many categories
            
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
            
            # Bar plot
            value_counts = df[col].value_counts()
            value_counts.plot(kind='bar', ax=ax1, color='skyblue', edgecolor='black')
            ax1.set_title(f'Distribution of {col}', fontweight='bold')
            ax1.set_xlabel(col)
            ax1.set_ylabel('Count')
            ax1.tick_params(axis='x', rotation=45)
            ax1.grid(True, alpha=0.3)
            
            # Pie chart (only if <= 10 categories)
            if len(value_counts) <= 10:
                value_counts.plot(kind='pie', ax=ax2, autopct='%1.1f%%', startangle=90)
                ax2.set_title(f'Proportion of {col}', fontweight='bold')
                ax2.set_ylabel('')
            else:
                ax2.text(0.5, 0.5, f'Too many categories\nfor pie chart\n({len(value_counts)} unique values)', 
                        ha='center', va='center', transform=ax2.transAxes)
                ax2.set_xlim(0, 1)
                ax2.set_ylim(0, 1)
                ax2.set_xticks([])
                ax2.set_yticks([])
            
            plt.tight_layout()
            plt.show()

In [None]:
# ============================================================================
# CELL 11: AI Analysis (if Ollama is available)
# ============================================================================

try:
    # Test Ollama integration
    from src.ollama_helper import setup_ollama
    
    print("ü§ñ Connecting to AI assistant...")
    ai = setup_ollama("llama2")
    
    # Create dataset summary for AI
    summary = f"""
    Dataset Analysis Summary:
    - Shape: {df.shape[0]} rows, {df.shape[1]} columns
    - Numerical variables: {len(numerical_cols)} ({', '.join(numerical_cols[:5])})
    - Categorical variables: {len(categorical_cols)} ({', '.join(categorical_cols[:3])})
    - Missing values: {df.isnull().sum().sum()} total
    - Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB
    
    This appears to be research data from Phelps et al. 2016.
    """
    
    print("üß† Getting AI analysis suggestions...")
    response = ai.ask(f"""
    Based on this dataset summary, what are the most important next steps for analysis?
    
    {summary}
    
    Please provide 3-5 specific, actionable recommendations for data analysis.
    """)
    
    if response and response != "Error: Could not get response":
        print("=" * 60)
        print("ü§ñ AI ANALYSIS SUGGESTIONS")
        print("=" * 60)
        print(response)
    else:
        print("‚ö†Ô∏è AI assistant not available - continuing without AI suggestions")
        
except ImportError:
    print("‚ö†Ô∏è Ollama helper not found - run this in Docker for AI features")
except Exception as e:
    print(f"‚ö†Ô∏è AI integration error: {e}")

In [None]:
# ============================================================================
# CELL 12: Summary and Next Steps
# ============================================================================

print("\n" + "=" * 60)
print("üìä EXPLORATION SUMMARY")
print("=" * 60)

print(f"‚úÖ Dataset successfully loaded and explored")
print(f"üìà Found {len(numerical_cols)} numerical and {len(categorical_cols)} categorical variables")
print(f"üîç Data quality: {df.isnull().sum().sum()} missing values, {df.duplicated().sum()} duplicates")

if len(numerical_cols) > 1:
    high_corr_count = len([1 for i in range(len(correlation_matrix.columns)) 
                          for j in range(i+1, len(correlation_matrix.columns)) 
                          if abs(correlation_matrix.iloc[i, j]) > 0.7])
    print(f"üîó Found {high_corr_count} highly correlated variable pairs")

print("\n" + "=" * 40)
print("üéØ RECOMMENDED NEXT STEPS")
print("=" * 40)
print("1. üßπ Handle missing values and outliers")
print("2. üî¨ Perform statistical tests and hypothesis testing") 
print("3. ü§ñ Use AI assistant for advanced analysis suggestions")
print("4. üìä Create publication-ready visualizations")
print("5. üîç Investigate interesting patterns found in the data")
print("6. üìù Document findings and create analysis report")

print(f"\nüéâ Data exploration complete! Ready for advanced analysis.")
print(f"üí° Tip: Use the AI assistant to get specific analysis recommendations!")

In [None]:
# ============================================================================
# CELL 13: Quick Data Access Functions
# ============================================================================

def quick_summary():
    """Quick function to show dataset summary"""
    print(f"Dataset: {df.shape[0]} rows √ó {df.shape[1]} columns")
    print(f"Numerical: {len(numerical_cols)} variables")
    print(f"Categorical: {len(categorical_cols)} variables")
    print(f"Missing values: {df.isnull().sum().sum()}")
    return df.info()

def show_correlations(threshold=0.5):
    """Show correlations above threshold"""
    if len(numerical_cols) > 1:
        corr = df[numerical_cols].corr()
        high_corr = []
        for i in range(len(corr.columns)):
            for j in range(i+1, len(corr.columns)):
                if abs(corr.iloc[i, j]) > threshold:
                    high_corr.append((corr.columns[i], corr.columns[j], corr.iloc[i, j]))
        
        for col1, col2, corr_val in high_corr:
            print(f"{col1} ‚Üî {col2}: {corr_val:.3f}")
    else:
        print("Need at least 2 numerical columns for correlation analysis")

def plot_variable(column_name):
    """Quick plot for any variable"""
    if column_name in numerical_cols:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
        
        # Histogram
        df[column_name].hist(bins=30, ax=ax1, alpha=0.7, edgecolor='black')
        ax1.set_title(f'Distribution of {column_name}')
        ax1.grid(True, alpha=0.3)
        
        # Box plot
        df[column_name].plot(kind='box', ax=ax2)
        ax2.set_title(f'Box Plot of {column_name}')
        ax2.grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.show()
        
        # Summary stats
        print(f"\nSummary for {column_name}:")
        print(df[column_name].describe())
        
    elif column_name in categorical_cols:
        # Bar plot for categorical
        plt.figure(figsize=(10, 5))
        df[column_name].value_counts().plot(kind='bar', color='skyblue', edgecolor='black')
        plt.title(f'Distribution of {column_name}')
        plt.xticks(rotation=45)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()
        
        print(f"\nValue counts for {column_name}:")
        print(df[column_name].value_counts())
    else:
        print(f"Column '{column_name}' not found in dataset")

print("‚úÖ Helper functions defined:")
print("  ‚Ä¢ quick_summary() - Show dataset overview")
print("  ‚Ä¢ show_correlations(threshold=0.5) - Show correlated variables") 
print("  ‚Ä¢ plot_variable('column_name') - Quick plot any variable")
print("\nExample: plot_variable('your_column_name')")