In [6]:
# ============================================================================
# CELL 1: Setup and Imports
# ============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: start from matplotlib's default style, switch the seaborn
# colour cycle to "husl", and make 10x6 the default figure size.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (10, 6)})

# pandas display: both options are set to None (no column cap; width left
# for pandas to work out).
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

print("✅ Libraries imported successfully!\n📊 Ready for data exploration")


✅ Libraries imported successfully!
📊 Ready for data exploration


In [7]:
# ============================================================================
# CELL 2: Load Data
# ============================================================================

# Load the dataset into the notebook-level frame `df`.
data_path = '/app/data/raw/phelps_et_al_2016.xlsx'

# NOTE(review): the outputs recorded in this notebook show every column being
# read as non-numeric (0 numerical variables), with oxide names sitting in the
# first data row and "bdl" markers among the values. The sheet probably needs
# header / placeholder clean-up before numeric analysis -- confirm against the
# workbook before relying on the later plotting cells.
try:
    df = pd.read_excel(data_path)
except FileNotFoundError:
    # Show what *is* in the data directory to make a typo in the file name
    # easy to spot, then re-raise so later cells do not run against a missing
    # (or stale, from a previous run) `df`.
    data_dir = Path(data_path).parent
    print("❌ Data file not found. Make sure your Excel file is in data/raw/")
    print(f"Contents of {data_dir}:")
    print(list(data_dir.glob('*')))
    raise
except Exception as e:
    # Any other read failure (corrupt workbook, missing Excel engine, ...):
    # report it briefly, then let the original traceback surface.
    print(f"❌ Error loading data: {e}")
    raise
else:
    print(f"✅ Data loaded successfully!")
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {len(df.columns)}")


✅ Data loaded successfully!
Dataset shape: (139, 59)
Columns: 59


In [9]:
# ============================================================================
# CELL 3: Quick Data Overview
# ============================================================================

# Section banners reuse two rule widths: 60 for the title, 40 for sub-sections.
title_rule = "=" * 60
section_rule = "=" * 40

print(title_rule)
print("DATASET OVERVIEW")
print(title_rule)

# Headline figures: dimensions and deep memory footprint in MiB.
row_total, column_total = df.shape[0], df.shape[1]
print(f"Shape: {row_total} rows × {column_total} columns")
footprint_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"Memory usage: {footprint_mb:.2f} MB")

# Numbered list of every column label, starting at 1.
print("\n" + section_rule)
print("COLUMN NAMES")
print(section_rule)
for position, column_label in enumerate(df.columns, start=1):
    print(f"{position:2d}. {column_label}")

# How many columns carry each dtype.
print("\n" + section_rule)
print("DATA TYPES")
print(section_rule)
dtype_tally = df.dtypes.value_counts()
print(dtype_tally)

DATASET OVERVIEW
Shape: 139 rows × 59 columns
Memory usage: 0.27 MB

COLUMN NAMES
 1. Unnamed: 0
 2. Unnamed: 1
 3. Major and minor elements as wt %
 4. Unnamed: 3
 5. Unnamed: 4
 6. Unnamed: 5
 7. Unnamed: 6
 8. Unnamed: 7
 9. Unnamed: 8
10. Unnamed: 9
11. Unnamed: 10
12. Unnamed: 11
13. Trace elements as ppm
14. Unnamed: 13
15. Unnamed: 14
16. Unnamed: 15
17. Unnamed: 16
18. Unnamed: 17
19. Unnamed: 18
20. Unnamed: 19
21. Unnamed: 20
22. Unnamed: 21
23. Unnamed: 22
24. Unnamed: 23
25. Unnamed: 24
26. Unnamed: 25
27. Unnamed: 26
28. Unnamed: 27
29. Unnamed: 28
30. Unnamed: 29
31. Unnamed: 30
32. Unnamed: 31
33. Unnamed: 32
34. Unnamed: 33
35. Unnamed: 34
36. Unnamed: 35
37. Unnamed: 36
38. Unnamed: 37
39. Unnamed: 38
40. Unnamed: 39
41. Unnamed: 40
42. Unnamed: 41
43. Unnamed: 42
44. Unnamed: 43
45. Unnamed: 44
46. Unnamed: 45
47. Unnamed: 46
48. Unnamed: 47
49. Unnamed: 48
50. Unnamed: 49
51. Unnamed: 50
52. Unnamed: 51
53. Unnamed: 52
54. Unnamed: 53
55. Unnamed: 54
56. Unnamed: 55


In [11]:
# ============================================================================
# CELL 4: First Look at Data
# ============================================================================

# Preview the frame from three angles. Each section prints a banner and then
# renders the rows through the notebook's rich display.
print("=" * 60)
print("FIRST 5 ROWS")
print("=" * 60)
display(df.head())

print("\n" + "=" * 60)
print("LAST 5 ROWS")
print("=" * 60)
display(df.tail())

print("\n" + "=" * 60)
print("RANDOM SAMPLE")
print("=" * 60)
# Seeded so that a fresh "Restart & Run All" shows the same rows, and capped at
# the frame length because sample() raises when asked for more rows than exist.
display(df.sample(n=min(5, len(df)), random_state=42))


FIRST 5 ROWS


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Major and minor elements as wt %,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Trace elements as ppm,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58
0,N-1 Apollonia-type,,Na2O,MgO,Al2O3,SiO2,P2O5,Cl,K2O,CaO,TiO2,Fe2O3,Li2O,B2O3,V2O5,Cr2O3,MnO,CoO,NiO,CuO,ZnO,GaO,As2O3,Rb2O,SrO,Y2O3,ZrO2,Nb2O3,MoO,Ag,Cd,In,SnO2,Sb2O3,Cs2O,BaO,La2O3,CeO2,PrO2,Nd2O3,Sm2O3,Eu2O3,Gd2O3,Tb2O3,Dy2O3,Ho2O3,Er2O3,Tm2O3,Yb2O3,Lu2O3,HfO2,Ta2O3,WO,Pt,Au,PbO,Bi,ThO2,UO2
1,AH 3746 07,N-1,13.358034,0.696979,3.160218,72.20451,0.146365,0.911894,0.545308,8.298218,0.078151,0.460747,3.201652,241.296722,17.062143,21.850887,168.926657,bdl,5.765742,5.488999,11.389127,3.242035,2.163488,8.084935,510.953597,9.128406,52.646892,1.819799,0.527319,bdl,0.035423,bdl,14.236249,2.749665,bdl,268.177629,7.967658,14.226662,1.83029,7.604406,1.656862,0.384665,1.446338,0.228261,1.459441,0.300216,0.750546,0.106976,0.721028,0.112006,1.24888,0.114674,0.078566,bdl,bdl,4.999577,bdl,1.036871,0.743249
2,AY 2844 05,N-1,15.606332,0.620495,3.17023,70.024774,0.059676,0.910235,0.842892,8.1754,0.061156,0.372845,5.600347,360.141414,12.08553,15.569659,220.038314,1.469219,4.077484,21.778212,9.46052,4.351242,0.532089,13.026409,538.027299,8.585674,46.845947,1.574386,0.888948,0.034791,0.007722,bdl,1.223542,0.573073,0.001259,247.557079,6.943084,13.735333,1.75234,7.126564,1.451786,0.432386,1.411467,0.207732,1.224803,0.254821,0.701286,0.094865,0.701644,0.093889,1.006941,0.086256,0.064416,bdl,bdl,7.441392,bdl,0.853437,0.621768
3,BSH 2885 06,N-1,13.36925,0.669362,2.909701,71.720271,0.183773,0.869842,0.854212,8.694513,0.08444,0.488208,bdl,274.858863,16.697977,26.577672,256.648287,3.836216,5.70565,22.921467,18.077793,4.012232,5.387377,10.946096,512.140306,8.708609,58.542507,2.076806,0.544578,0.161294,0.015522,,12.733951,2.729695,0.038381,233.803277,7.779752,14.791908,1.844127,7.586739,1.506833,0.369467,0.976995,0.219875,1.37614,0.289004,0.750797,0.098386,0.755826,0.100628,1.296362,0.111495,0.083269,bdl,0.009737,45.487579,0.013652,1.011739,0.63834
4,CEA W2S3 01,N-1,17.631055,0.678295,3.196461,66.299834,0.149532,0.989511,0.997076,9.407947,0.066927,0.423264,7.974045,333.126581,13.768782,15.411259,226.575084,3.481904,5.242084,32.561716,10.195833,4.309162,2.919409,13.02872,546.593636,8.817511,49.769999,1.760621,0.498124,0.053004,0.018481,0.008339,1.412118,0.395577,0.090738,253.44607,7.276438,14.61629,1.793779,7.364463,1.550596,0.465141,1.465733,0.209276,1.250613,0.265947,0.700858,0.09754,0.70866,0.096028,1.097847,0.097173,0.046694,0.003722,bdl,28.853129,0.009216,0.915964,0.634884



LAST 5 ROWS


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Major and minor elements as wt %,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Trace elements as ppm,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58
134,NS 6362 04,Outlier,15.296345,0.720333,3.280446,68.458214,0.126589,0.695559,0.553166,9.450209,0.144895,0.875467,13.594119,206.796453,28.404989,35.879259,684.646966,8.527522,10.674284,331.655228,56.869955,4.825193,3.953759,7.622124,418.351881,9.365657,93.598594,3.328008,0.618424,0.428849,0.028414,0.445815,140.023092,13.474392,0.117443,229.615839,9.10713,17.976427,2.091546,8.188089,1.554293,0.456353,1.378536,0.236361,1.309711,0.291056,0.734436,0.10656,0.77164,0.12548,1.892467,0.157496,0.173718,bdl,0.171277,1635.451215,0.258064,1.368741,1.136578
135,NS 6362 10,Outlier,14.544896,0.668266,3.380302,70.597159,0.093951,0.68122,0.469424,8.20455,0.169344,0.907893,14.079003,197.825231,30.113673,30.601354,306.38217,5.395302,8.697162,52.221105,27.292905,4.917015,2.38734,7.734095,371.508085,9.693987,107.491858,3.779707,0.432206,0.201326,0.031532,0.073997,20.379382,5.149246,0.114653,221.419487,9.69106,19.317459,2.242119,8.573691,1.658837,0.460324,1.477648,0.242766,1.380809,0.298232,0.771951,0.105452,0.787315,0.117944,2.143149,0.176685,0.111135,0.001176,bdl,1349.455844,0.406918,1.539191,1.077818
136,RAM 5947 22,Outlier,15.804857,0.66462,2.799984,68.009085,0.148708,0.847578,0.517627,9.395691,0.184701,0.88782,0.198502,255.174368,32.568846,155.940209,1734.625264,17.66536,13.342814,589.448957,91.012384,3.873733,12.665225,7.603415,375.277511,8.795161,152.174535,3.396066,0.822236,1.055249,0.136466,,317.829206,24.7388,0.063843,233.314396,8.457715,16.26771,2.064515,8.071677,1.674072,0.443504,1.302067,0.241067,1.37792,0.286929,0.821661,0.119004,0.783989,0.132497,3.74797,0.1878,0.266321,0.010056,0.389255,3310.624097,0.475215,1.570732,1.242269
137,RAM 5947 03,Outlier,15.594046,0.588171,3.38299,69.729226,0.111766,0.968687,0.318891,7.931244,0.188836,0.946888,9.873233,211.041973,32.345516,35.608908,613.202148,7.157481,9.379853,81.659864,44.180602,4.818541,2.933938,6.251058,346.54844,9.657332,131.573512,4.00877,0.250212,0.180799,0.044892,,42.510326,3.873316,0.076367,213.578529,9.688883,18.764465,2.277961,9.000912,1.784892,0.471268,1.578708,0.255994,1.518883,0.321543,0.903986,0.133033,0.953461,0.130855,3.023667,0.228139,0.101283,0.005359,0.027975,527.404596,0.086669,1.778548,1.243279
138,"† analysis A of vessel body, analysis B of tra...",,,,,,,bdl = below detection limit,,,,,n/a = data not available,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,



RANDOM SAMPLE


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Major and minor elements as wt %,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Trace elements as ppm,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58
0,N-1 Apollonia-type,,Na2O,MgO,Al2O3,SiO2,P2O5,Cl,K2O,CaO,TiO2,Fe2O3,Li2O,B2O3,V2O5,Cr2O3,MnO,CoO,NiO,CuO,ZnO,GaO,As2O3,Rb2O,SrO,Y2O3,ZrO2,Nb2O3,MoO,Ag,Cd,In,SnO2,Sb2O3,Cs2O,BaO,La2O3,CeO2,PrO2,Nd2O3,Sm2O3,Eu2O3,Gd2O3,Tb2O3,Dy2O3,Ho2O3,Er2O3,Tm2O3,Yb2O3,Lu2O3,HfO2,Ta2O3,WO,Pt,Au,PbO,Bi,ThO2,UO2
97,RAM 4740 09,N-3,14.223739,0.487327,2.258602,69.38815,0.092891,1.220892,0.216425,10.855189,0.278296,0.845331,0,273.869233,31.757819,33.299105,230.115523,3.221981,6.622318,3.695838,16.736878,3.325665,2.282771,4.030072,185.608313,8.5624,282.263945,4.835714,0.037042,0.049703,0.034523,,22.89399,0.405443,0.018192,147.324309,8.409129,16.176696,2.014209,7.957601,1.613501,0.340707,1.340738,0.215818,1.348871,0.284618,0.825384,0.12132,0.886776,0.135931,6.147395,0.272278,0.050683,bdl,bdl,19.372443,0.017809,1.961005,0.976061
26,JER 5124 21,N-1,15.358726,0.456536,2.974978,71.950225,0.05389,0.884152,0.565258,7.131955,0.078078,0.413319,11.27783,150.098544,14.549465,103.463736,179.345014,1.581025,4.178157,4.831797,9.475054,4.071499,1.529121,10.779796,453.352393,8.779157,60.563158,2.066699,0.291298,0.028843,bdl,0.006036,0.837102,bdl,0.090676,259.790862,8.375089,16.200727,1.864072,7.255208,1.38047,0.426083,1.14291,0.209834,1.199628,0.258383,0.693182,0.098307,0.640943,0.088706,1.217333,0.104905,0.070528,bdl,bdl,4.847183,0.002213,1.014278,0.732422
129,Egypt I,,Na2O,Mg O,Al2O3,SiO2,P2O5,Cl,K2O,CaO,TiO2,Fe2O3,Li2O,B2O3,V2O5,Cr2O3,MnO,CoO,NiO,CuO,ZnO,GaO,As2O3,Rb2O,SrO,Y2O3,ZrO2,Nb2O3,MoO,Ag,Cd,In,SnO2,Sb2O3,Cs2O,BaO,La2O3,CeO2,PrO2,Nd2O3,Sm2O3,Eu2O3,Gd2O3,Tb2O3,Dy2O3,Ho2O3,Er2O3,Tm2O3,Yb2O3,Lu2O3,HfO2,Ta2O3,WO,Pt,Au,PbO,Bi,ThO2,UO2
25,JER 5124 20,N-1,14.263329,0.512685,3.025518,72.881051,0.045342,0.897863,0.44672,7.30877,0.074156,0.408633,9.535747,117.8838,13.268211,134.245821,177.190293,1.56547,4.099256,5.219272,7.366456,4.134444,3.3308,9.245338,480.427143,8.980667,57.810478,2.011777,0.135175,0.031783,bdl,0.018499,0.694157,bdl,0.077229,254.973149,8.655556,16.408649,1.883572,7.169489,1.36895,0.466987,1.174903,0.217925,1.209719,0.264278,0.674508,0.096988,0.581966,0.098469,1.184283,0.098005,0.050988,bdl,bdl,23.565389,0.002025,0.986764,0.93139


In [12]:
# ============================================================================
# CELL 5: Data Quality Check
# ============================================================================

quality_rule = "=" * 60
print(quality_rule)
print("DATA QUALITY ASSESSMENT")
print(quality_rule)

# Missing values: per-column null count and the same count as a percentage
# of the number of rows.
missing_data = df.isnull().sum()
missing_percent = missing_data.div(len(df)).mul(100)

# One row per column, worst offenders first.
missing_summary = pd.DataFrame(
    {
        'Column': missing_data.index,
        'Missing_Count': missing_data.values,
        'Missing_Percentage': missing_percent.values,
    }
).sort_values('Missing_Count', ascending=False)

print("Missing Values Summary:")
has_missing = missing_summary['Missing_Count'] > 0
display(missing_summary[has_missing])

# Counts are never negative, so "no column has any" is the same as "sum is 0".
if not has_missing.any():
    print("✅ No missing values found!")

# Duplicates: rows that are exact repeats of an earlier row.
duplicate_count = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicate_count}")
print(
    "⚠️ Found duplicate rows - consider investigating"
    if duplicate_count > 0
    else "✅ No duplicate rows found"
)


DATA QUALITY ASSESSMENT
Missing Values Summary:


Unnamed: 0,Column,Missing_Count,Missing_Percentage
31,Unnamed: 31,58,41.726619
15,Unnamed: 15,24,17.266187
1,Unnamed: 1,6,4.316547
8,Unnamed: 8,1,0.719424
4,Unnamed: 4,1,0.719424
5,Unnamed: 5,1,0.719424
6,Unnamed: 6,1,0.719424
10,Unnamed: 10,1,0.719424
9,Unnamed: 9,1,0.719424
11,Unnamed: 11,1,0.719424



Duplicate rows: 0
✅ No duplicate rows found


In [13]:
# ============================================================================
# CELL 6: Descriptive Statistics
# ============================================================================

# Partition the columns by dtype: anything numpy regards as a number goes into
# numerical_cols, everything else into categorical_cols. Later cells read both.
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

stats_rule = "=" * 60

print(stats_rule)
print("VARIABLE TYPES")
print(stats_rule)
print(f"Numerical variables ({len(numerical_cols)}): {numerical_cols}")
print(f"Categorical variables ({len(categorical_cols)}): {categorical_cols}")

if numerical_cols:
    print("\n" + stats_rule)
    print("NUMERICAL STATISTICS")
    print(stats_rule)
    display(df[numerical_cols].describe())

if categorical_cols:
    print("\n" + stats_rule)
    print("CATEGORICAL STATISTICS")
    print(stats_rule)
    # Only the first five categorical columns are summarised.
    for col in categorical_cols[:5]:
        distinct_total = df[col].nunique()
        print(f"\n--- {col} ---")
        print(f"Unique values: {distinct_total}")
        # The table shown is the same either way (ten most frequent values);
        # only the caption changes once a column has more than 20 distinct values.
        caption = (
            "Value counts:"
            if distinct_total <= 20
            else "Too many unique values to display (showing first 10 most common):"
        )
        print(caption)
        display(df[col].value_counts().head(10))


VARIABLE TYPES
Numerical variables (0): []
Categorical variables (59): ['Unnamed: 0', 'Unnamed: 1', 'Major and minor elements as wt %', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Trace elements as ppm', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 40', 'Unnamed: 41', 'Unnamed: 42', 'Unnamed: 43', 'Unnamed: 44', 'Unnamed: 45', 'Unnamed: 46', 'Unnamed: 47', 'Unnamed: 48', 'Unnamed: 49', 'Unnamed: 50', 'Unnamed: 51', 'Unnamed: 52', 'Unnamed: 53', 'Unnamed: 54', 'Unnamed: 55', 'Unnamed: 56', 'Unnamed: 57', 'Unnamed: 58']

CATEGORICAL STATISTI

Unnamed: 0
N-1 Apollonia-type    1
AH 3746 07            1
AY 2844 05            1
BSH 2885 06           1
CEA W2S3 01           1
CEA W2S3 02           1
JER 3835 01           1
JER 3835 02           1
JER 3835 03           1
JER 3835 04           1
Name: count, dtype: int64


--- Unnamed: 1 ---
Unique values: 7
Value counts:


Unnamed: 1
N-1         54
N-3         50
N-2         17
Outlier      5
N-3 (Co)     3
N-3 (Mn)     2
N-4          2
Name: count, dtype: int64


--- Major and minor elements as wt % ---
Unique values: 134
Too many unique values to display (showing first 10 most common):


Major and minor elements as wt %
Na2O         5
13.358034    1
15.606332    1
13.36925     1
17.631055    1
13.097712    1
12.288159    1
12.991899    1
13.56667     1
15.134331    1
Name: count, dtype: int64


--- Unnamed: 3 ---
Unique values: 135
Too many unique values to display (showing first 10 most common):


Unnamed: 3
MgO         3
Mg O        2
0.696979    1
0.669362    1
0.678295    1
0.656585    1
0.620495    1
0.56618     1
0.587784    1
0.573126    1
Name: count, dtype: int64


--- Unnamed: 4 ---
Unique values: 134
Too many unique values to display (showing first 10 most common):


Unnamed: 4
Al2O3       5
3.160218    1
3.17023     1
2.909701    1
3.196461    1
3.068191    1
3.342433    1
3.136955    1
3.105703    1
3.085411    1
Name: count, dtype: int64

In [None]:
# ============================================================================
# CELL 7: Visualizations - Distribution Plots
# ============================================================================

if numerical_cols:
    # scipy is only needed for the KDE overlay; import it once for the whole
    # cell instead of on every loop iteration.
    from scipy import stats

    print("Creating distribution plots for numerical variables...")

    # Grid size: at most three panels per row, as many rows as needed.
    n_cols = min(3, len(numerical_cols))
    n_rows = (len(numerical_cols) + n_cols - 1) // n_cols

    # squeeze=False makes subplots() always hand back a 2-D array, so a single
    # flatten covers the 1-panel, 1-row and multi-row layouts alike.
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                             figsize=(15, 5*n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, numerical_cols):
        kde_data = df[col].dropna()

        # Density-normalised histogram so the KDE curve shares its y-scale.
        ax.hist(kde_data, bins=30, alpha=0.7, edgecolor='black', density=True)

        # KDE overlay. It is best-effort: a column whose values are all equal
        # (singular covariance) or otherwise unusable is reported and skipped
        # rather than aborting the whole figure.
        if len(kde_data) > 1:
            try:
                density = stats.gaussian_kde(kde_data)
                xs = np.linspace(kde_data.min(), kde_data.max(), 100)
                ax.plot(xs, density(xs), 'r-', linewidth=2)
            except (np.linalg.LinAlgError, ValueError) as kde_error:
                print(f"⚠️ Skipping KDE curve for {col}: {kde_error}")

        ax.set_title(f'Distribution of {col}', fontweight='bold')
        ax.set_xlabel(col)
        ax.set_ylabel('Density')
        ax.grid(True, alpha=0.3)

    # Hide the unused panels at the end of the last row.
    for spare_ax in axes[len(numerical_cols):]:
        spare_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# ============================================================================
# CELL 8: Box Plots for Outlier Detection
# ============================================================================

if numerical_cols:
    print("Creating box plots for outlier detection...")

    # Lay the panels out three to a row (fewer if there are fewer columns).
    n_cols = min(3, len(numerical_cols))
    n_rows = (len(numerical_cols) + n_cols - 1) // n_cols

    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 5*n_rows))

    # subplots() returns a bare Axes, a 1-D array or a 2-D array depending on
    # the grid; normalise all three to one flat sequence of Axes.
    axes = np.atleast_1d(axes).ravel()

    # One box plot per numerical column, nulls dropped.
    for panel, col in zip(axes, numerical_cols):
        bp = panel.boxplot(df[col].dropna(), patch_artist=True)
        box_patch = bp['boxes'][0]
        box_patch.set_facecolor('lightblue')
        box_patch.set_alpha(0.7)

        panel.set_title(f'Box Plot: {col}', fontweight='bold')
        panel.set_ylabel(col)
        panel.grid(True, alpha=0.3)

    # Any panels left over in the final row stay blank.
    for unused_panel in axes[len(numerical_cols):]:
        unused_panel.set_visible(False)

    plt.tight_layout()
    plt.show()


In [None]:
# ============================================================================
# CELL 9: Correlation Analysis
# ============================================================================

if len(numerical_cols) > 1:
    corr_rule = "=" * 60
    print(corr_rule)
    print("CORRELATION ANALYSIS")
    print(corr_rule)

    # Pairwise correlations between the numerical columns; a later summary
    # cell reads `correlation_matrix` as well.
    correlation_matrix = df[numerical_cols].corr()

    print("Correlation Matrix:")
    display(correlation_matrix.round(3))

    # Heatmap of the lower triangle only: the mask blanks the diagonal and
    # everything above it.
    plt.figure(figsize=(12, 10))
    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
    heatmap_options = dict(
        mask=mask,
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        fmt='.2f',
        cbar_kws={"shrink": .8},
    )
    sns.heatmap(correlation_matrix, **heatmap_options)

    plt.title('Correlation Matrix Heatmap', fontsize=16, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.show()

    # Collect each unordered pair once (strict upper triangle, row by row)
    # whose |r| is above the 0.7 threshold.
    pair_labels = correlation_matrix.columns
    upper_rows, upper_cols = np.triu_indices(len(pair_labels), k=1)
    high_corr_pairs = [
        (pair_labels[r], pair_labels[c], correlation_matrix.iloc[r, c])
        for r, c in zip(upper_rows, upper_cols)
        if abs(correlation_matrix.iloc[r, c]) > 0.7
    ]

    if not high_corr_pairs:
        print("\n✅ No highly correlated pairs found (|r| > 0.7)")
    else:
        print("\n🔍 Highly Correlated Pairs (|r| > 0.7):")
        for col1, col2, corr in high_corr_pairs:
            print(f"  • {col1} ↔ {col2}: {corr:.3f}")

In [None]:
# ============================================================================
# CELL 10: Categorical Data Visualization
# ============================================================================

if categorical_cols:
    print("Creating visualizations for categorical variables...")

    # Only the first three categorical columns are considered, and of those
    # only the ones with at most 20 distinct values are drawn.
    for col in categorical_cols[:3]:
        if df[col].nunique() > 20:
            continue

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

        # Left panel: frequency bar chart.
        value_counts = df[col].value_counts()
        value_counts.plot(kind='bar', ax=ax1, color='skyblue', edgecolor='black')
        ax1.set_title(f'Distribution of {col}', fontweight='bold')
        ax1.set(xlabel=col, ylabel='Count')
        ax1.tick_params(axis='x', rotation=45)
        ax1.grid(True, alpha=0.3)

        # Right panel: pie chart for up to 10 categories, otherwise a
        # centred notice on an otherwise empty, tick-less axes.
        if len(value_counts) > 10:
            notice = f'Too many categories\nfor pie chart\n({len(value_counts)} unique values)'
            ax2.text(0.5, 0.5, notice,
                     ha='center', va='center', transform=ax2.transAxes)
            ax2.set_xlim(0, 1)
            ax2.set_ylim(0, 1)
            ax2.set_xticks([])
            ax2.set_yticks([])
        else:
            value_counts.plot(kind='pie', ax=ax2, autopct='%1.1f%%', startangle=90)
            ax2.set_title(f'Proportion of {col}', fontweight='bold')
            ax2.set_ylabel('')

        plt.tight_layout()
        plt.show()

In [None]:
# ============================================================================
# CELL 11: AI Analysis (if Ollama is available)
# ============================================================================

try:
    # The helper lives in the project's src package; if it cannot be imported
    # the ImportError branch below prints a hint and the cell ends quietly.
    from src.ollama_helper import setup_ollama

    print("🤖 Connecting to AI assistant...")
    ai = setup_ollama("llama2")

    # Plain-text description of the frame that is embedded in the prompt.
    summary = f"""
    Dataset Analysis Summary:
    - Shape: {df.shape[0]} rows, {df.shape[1]} columns
    - Numerical variables: {len(numerical_cols)} ({', '.join(numerical_cols[:5])})
    - Categorical variables: {len(categorical_cols)} ({', '.join(categorical_cols[:3])})
    - Missing values: {df.isnull().sum().sum()} total
    - Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB
    
    This appears to be research data from Phelps et al. 2016.
    """

    print("🧠 Getting AI analysis suggestions...")
    prompt = f"""
    Based on this dataset summary, what are the most important next steps for analysis?
    
    {summary}
    
    Please provide 3-5 specific, actionable recommendations for data analysis.
    """
    response = ai.ask(prompt)

    # An empty reply or the literal error sentinel both count as "no answer".
    no_answer = (not response) or response == "Error: Could not get response"
    if no_answer:
        print("⚠️ AI assistant not available - continuing without AI suggestions")
    else:
        for line in ("=" * 60, "🤖 AI ANALYSIS SUGGESTIONS", "=" * 60, response):
            print(line)

except ImportError:
    print("⚠️ Ollama helper not found - run this in Docker for AI features")
except Exception as e:
    # Anything else is reported as a warning so the notebook run carries on.
    print(f"⚠️ AI integration error: {e}")

In [None]:
# ============================================================================
# CELL 12: Summary and Next Steps
# ============================================================================

summary_rule = "=" * 60
print("\n" + summary_rule)
print("📊 EXPLORATION SUMMARY")
print(summary_rule)

# Headline numbers, recomputed from the frame and the column lists.
missing_total = df.isnull().sum().sum()
duplicate_total = df.duplicated().sum()
print("✅ Dataset successfully loaded and explored")
print(f"📈 Found {len(numerical_cols)} numerical and {len(categorical_cols)} categorical variables")
print(f"🔍 Data quality: {missing_total} missing values, {duplicate_total} duplicates")

if len(numerical_cols) > 1:
    # Count each unordered pair once: flag |r| > 0.7 everywhere, then keep only
    # the strict upper triangle. Relies on `correlation_matrix` produced by the
    # correlation-analysis cell under the same condition.
    strong_mask = correlation_matrix.abs().to_numpy() > 0.7
    high_corr_count = int(np.triu(strong_mask, k=1).sum())
    print(f"🔗 Found {high_corr_count} highly correlated variable pairs")

steps_rule = "=" * 40
print("\n" + steps_rule)
print("🎯 RECOMMENDED NEXT STEPS")
print(steps_rule)
next_steps = (
    "1. 🧹 Handle missing values and outliers",
    "2. 🔬 Perform statistical tests and hypothesis testing",
    "3. 🤖 Use AI assistant for advanced analysis suggestions",
    "4. 📊 Create publication-ready visualizations",
    "5. 🔍 Investigate interesting patterns found in the data",
    "6. 📝 Document findings and create analysis report",
)
for step in next_steps:
    print(step)

print("\n🎉 Data exploration complete! Ready for advanced analysis.")
print("💡 Tip: Use the AI assistant to get specific analysis recommendations!")

In [None]:
# ============================================================================
# CELL 13: Quick Data Access Functions
# ============================================================================

def quick_summary():
    """Print a short overview of the notebook-level frame.

    Reads the globals ``df``, ``numerical_cols`` and ``categorical_cols``.
    Prints the frame's dimensions, the size of each column list and the total
    number of null cells, then the ``df.info()`` report.

    Returns:
        Whatever ``df.info()`` returns.
    """
    row_total, column_total = df.shape[0], df.shape[1]
    print(f"Dataset: {row_total} rows × {column_total} columns")

    numeric_total = len(numerical_cols)
    print(f"Numerical: {numeric_total} variables")

    categorical_total = len(categorical_cols)
    print(f"Categorical: {categorical_total} variables")

    # Null cells summed over every column.
    missing_total = df.isnull().sum().sum()
    print(f"Missing values: {missing_total}")

    info_result = df.info()
    return info_result

def show_correlations(threshold=0.5):
    """Print every pair of numerical columns correlated more strongly than ``threshold``.

    Reads the notebook-level ``df`` and ``numerical_cols``. Each unordered pair
    is considered once, in column order, and printed as
    ``"<col1> ↔ <col2>: <r>"`` with the coefficient to three decimals.

    Args:
        threshold: A pair is reported when ``abs(r) > threshold`` (strictly).

    Returns:
        None. If fewer than two numerical columns exist, or no pair exceeds
        the threshold, a one-line message says so instead of printing nothing.
    """
    if len(numerical_cols) <= 1:
        print("Need at least 2 numerical columns for correlation analysis")
        return

    corr = df[numerical_cols].corr()
    labels = corr.columns
    coefficients = corr.to_numpy()

    # Strict upper triangle = each unordered pair exactly once, row by row.
    # NaN coefficients (e.g. constant columns) compare False and are skipped.
    upper_rows, upper_cols = np.triu_indices(len(labels), k=1)
    high_corr = [
        (labels[r], labels[c], coefficients[r, c])
        for r, c in zip(upper_rows, upper_cols)
        if abs(coefficients[r, c]) > threshold
    ]

    if not high_corr:
        # Make "nothing found" distinguishable from a silent failure.
        print(f"No variable pairs with |r| > {threshold}")
        return

    for col1, col2, corr_val in high_corr:
        print(f"{col1} ↔ {col2}: {corr_val:.3f}")

def plot_variable(column_name):
    """Plot one column of the notebook-level ``df`` and print a text summary.

    The column is looked up in the globals ``numerical_cols`` and
    ``categorical_cols`` (in that order):

    * numerical   -> histogram and box plot side by side, then ``describe()``
    * categorical -> bar chart of value counts, then the counts themselves
    * neither     -> a "not found" message

    Args:
        column_name: Label of the column to plot.

    Returns:
        None.
    """
    if column_name in numerical_cols:
        series = df[column_name]
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

        # Left: histogram.
        series.hist(bins=30, ax=ax1, alpha=0.7, edgecolor='black')
        ax1.set_title(f'Distribution of {column_name}')
        ax1.grid(True, alpha=0.3)

        # Right: box plot.
        series.plot(kind='box', ax=ax2)
        ax2.set_title(f'Box Plot of {column_name}')
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        print(f"\nSummary for {column_name}:")
        print(series.describe())
        return

    if column_name in categorical_cols:
        series = df[column_name]

        # Frequency bar chart on its own figure.
        plt.figure(figsize=(10, 5))
        series.value_counts().plot(kind='bar', color='skyblue', edgecolor='black')
        plt.title(f'Distribution of {column_name}')
        plt.xticks(rotation=45)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

        print(f"\nValue counts for {column_name}:")
        print(series.value_counts())
        return

    # Not listed in either column list.
    print(f"Column '{column_name}' not found in dataset")

# Tell the reader which helper functions this cell made available.
helper_overview = (
    "✅ Helper functions defined:",
    "  • quick_summary() - Show dataset overview",
    "  • show_correlations(threshold=0.5) - Show correlated variables",
    "  • plot_variable('column_name') - Quick plot any variable",
)
for overview_line in helper_overview:
    print(overview_line)
print("\nExample: plot_variable('your_column_name')")