In [1]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path

# Locate and add the package root
cwd = Path.cwd().resolve()
for parent in [cwd, *cwd.parents]:
    if (parent / "sociopathit").exists():
        ROOT = parent
        break
else:
    raise FileNotFoundError("Could not locate the sociopathit package root.")

# Add to sys.path for imports
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

print(f"Added to sys.path: {ROOT}")

Added to sys.path: C:\Users\alecw\OneDrive - University of Toronto\Directives\GITTYSBURG\sociopathit


# Sociopathit Analysis Modules - Comprehensive Test Suite

This notebook demonstrates and tests all analysis modules in the sociopathit package.

## 📚 Table of Contents

Click to jump to any section:

1. **[Setup & Initialization](#setup)** - Import packages and configure environment
2. **[Regression Models](#regression)** - OLS, Logit, Poisson, Multilevel
3. **[Publication Tables](#pubtable)** - Proportion, Descriptive, Regression tables
4. **[Descriptive Statistics](#descriptive)** - Correlations, Crosstabs, Group summaries
5. **[Causal Inference](#causal)** - Propensity scores, DiD, IV regression
6. **[Panel Data Models](#panel)** - Fixed effects, Random effects, First differences
7. **[Machine Learning](#ml)** - Random Forest, Feature importance
8. **[Stata .dta Files](#stata)** - Working with categorical variables
9. **[Text Analysis](#textanalysis)** - NLP, topic modeling, complexity scores, similarity
10. **[Network Data](#network)** - Edge lists, adjacency matrices, co-occurrence, bipartite networks

---

## 🚀 Quick Start

```python
# Example: Run a regression with categorical variables from .dta
from sociopathit.data.loading import load_stata
from sociopathit.analyses.regress import ols

df = load_stata('survey.dta', convert_categoricals=True)
model = ols(df, 'outcome', ['age', 'income', 'education'], robust=True)
print(model.summary())
```

---

<a id='setup'></a>
## 1. Setup & Initialization

Initialize the environment and import core packages.

<a id='regression'></a>
# 2. Regression Models

Test OLS, Logit, Poisson regression with the `regress` module.

In [2]:
import importlib
from sociopathit.analyses import regress as regress_module

importlib.reload(regress_module)
from sociopathit.analyses.regress import ols, logit, poisson, RegressionModel, compare_models

# Simulate data for regression
np.random.seed(42)
n = 200
df_regress = pd.DataFrame({
    'age': np.random.normal(45, 15, n),
    'income': np.random.normal(50000, 20000, n),
    'education': np.random.choice([12, 14, 16, 18], n),
    'satisfaction': np.random.normal(7, 2, n),
    'employed': np.random.choice([0, 1], n, p=[0.2, 0.8]),
    'count_events': np.random.poisson(3, n),
    'weight': np.random.uniform(0.5, 1.5, n)
})

# Make satisfaction related to predictors
df_regress['satisfaction'] = (
    5 + 
    0.03 * df_regress['age'] + 
    0.00002 * df_regress['income'] + 
    0.2 * df_regress['education'] +
    np.random.normal(0, 1, n)
)

print(df_regress.head())
print(f"\nData shape: {df_regress.shape}")

         age        income  education  satisfaction  employed  count_events  \
0  52.450712  57155.747207         16     10.270672         1             2   
1  42.926035  61215.690527         16      9.912903         1             1   
2  54.715328  71661.024864         12      9.991937         1             0   
3  67.845448  71076.041041         12      9.903556         1             4   
4  41.487699  22446.612641         18     10.416234         1             3   

     weight  
0  1.484670  
1  1.437388  
2  0.543174  
3  0.664815  
4  0.631729  

Data shape: (200, 7)


### 2.1 Setup Test Data

In [3]:
# Test 1: OLS Regression
print("=" * 60)
print("TEST 1: OLS Regression")
print("=" * 60)

model1 = ols(
    df=df_regress,
    outcome='satisfaction',
    inputs=['age', 'income', 'education'],
    robust=True
)

print("\nModel Summary:")
print(model1.summary())

print("\nTidy Results:")
print(model1.get_tidy())

print("\nModel Statistics:")
print(model1.get_stats())

TEST 1: OLS Regression

Model Summary:
                            OLS Regression Results                            
Dep. Variable:           satisfaction   R-squared:                       0.343
Model:                            OLS   Adj. R-squared:                  0.332
Method:                 Least Squares   F-statistic:                     43.58
Date:                Thu, 16 Oct 2025   Prob (F-statistic):           1.27e-21
Time:                        18:49:47   Log-Likelihood:                -272.29
No. Observations:                 200   AIC:                             552.6
Df Residuals:                     196   BIC:                             565.8
Df Model:                           3                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        

### 2.2 OLS & Logistic Regression Tests

In [4]:
# Test 2: Weighted OLS Regression
print("=" * 60)
print("TEST 2: Weighted OLS Regression")
print("=" * 60)

model2 = ols(
    df=df_regress,
    outcome='satisfaction',
    inputs=['age', 'income', 'education'],
    weight='weight',
    robust=True
)

print("\nTidy Results (Weighted):")
print(model2.get_tidy())

TEST 2: Weighted OLS Regression

Tidy Results (Weighted):
        term  estimate  std.error  statistic       p.value  conf.low  \
0      const  5.165550   0.550893   9.376689  6.807718e-21  4.085820   
1        age  0.027861   0.005502   5.063826  4.109257e-07  0.017078   
2     income  0.000015   0.000004   3.976676  6.988537e-05  0.000008   
3  education  0.219408   0.033506   6.548376  5.816620e-11  0.153738   

   conf.high  
0   6.245280  
1   0.038645  
2   0.000022  
3   0.285078  


In [5]:
# Test 3: Logistic Regression
print("=" * 60)
print("TEST 3: Logistic Regression")
print("=" * 60)

model3 = logit(
    df=df_regress,
    outcome='employed',
    inputs=['age', 'education'],
    robust=True
)

print("\nLogistic Regression Results:")
print(model3.get_tidy())

print("\nModel Statistics:")
print(model3.get_stats())

TEST 3: Logistic Regression
Optimization terminated successfully.
         Current function value: 0.495807
         Iterations 5

Logistic Regression Results:
        term  estimate  std.error  statistic   p.value  conf.low  conf.high
0      const  0.949878   1.411842   0.672793  0.501079 -1.817283   3.717038
1        age -0.013852   0.012969  -1.068082  0.285483 -0.039271   0.011567
2  education  0.072462   0.086165   0.840969  0.400366 -0.096418   0.241342

Model Statistics:
{'N': 200, 'AIC': np.float64(204.3227910665525), 'BIC': np.float64(214.2177431661966), 'Log-Likelihood': np.float64(-99.16139553327625), 'Pseudo_R_squared': np.float64(0.009183500482258689)}


In [6]:
# Test 4: Poisson Regression
print("=" * 60)
print("TEST 4: Poisson Regression")
print("=" * 60)

model4 = poisson(
    df=df_regress,
    outcome='count_events',
    inputs=['age', 'income'],
    robust=True
)

print("\nPoisson Regression Results:")
print(model4.get_tidy())

TEST 4: Poisson Regression
         Current function value: 149651730807018435982530780671770624000.000000
         Iterations: 35

Poisson Regression Results:
     term   estimate     std.error     statistic   p.value      conf.low  \
0   const -32.962559  1.658551e+07 -1.987431e-06  0.999998 -3.250703e+07   
1     age  -0.005334  1.996745e+05 -2.671477e-08  1.000000 -3.913549e+05   
2  income   0.000995  4.396441e+01  2.263666e-05  0.999982 -8.616766e+01   

      conf.high  
0  3.250696e+07  
1  3.913549e+05  
2  8.616965e+01  


In [7]:
# Test 5: Model Comparison
print("=" * 60)
print("TEST 5: Model Comparison")
print("=" * 60)

# Create two models with different specifications
model_simple = ols(df_regress, 'satisfaction', 'age')
model_full = ols(df_regress, 'satisfaction', ['age', 'income', 'education'])

comparison = compare_models(model_simple, model_full)
print("\nModel Comparison:")
print(comparison)

TEST 5: Model Comparison

Model Comparison:
     Model                  Inputs    N         AIC         BIC  \
0  Model 1                     age  200  601.672931  608.269566   
1  Model 2  age, income, education  200  552.580030  565.773299   

   Log-Likelihood  R_squared  Adj_R_squared  
0     -298.836465   0.142634       0.138304  
1     -272.290015   0.342529       0.332466  


In [8]:
# Test 6: Predictions
print("=" * 60)
print("TEST 6: Predictions")
print("=" * 60)

predictions = model1.predict()
print(f"\nPredicted values (first 10): {predictions[:10]}")

# New data predictions
new_data = pd.DataFrame({
    'age': [30, 40, 50],
    'income': [40000, 50000, 60000],
    'education': [16, 16, 18]
})
new_predictions = model1.predict(new_data)
print(f"\nPredictions for new data: {new_predictions}")

TEST 6: Predictions

Predicted values (first 10): 0    11.009774
1    10.799499
2    10.407709
3    10.768853
4    10.643722
5    10.332796
6    11.075364
7    11.604443
8     9.774625
9    11.151095
dtype: float64

Predictions for new data: 0    10.134068
1    10.557665
2    11.417297
dtype: float64


In [9]:
# Test 7: VIF (Multicollinearity Check)
print("=" * 60)
print("TEST 7: Variance Inflation Factors")
print("=" * 60)

vif_results = model1.vif()
print("\nVIF Values:")
print(vif_results)
print("\nNote: VIF > 10 indicates potential multicollinearity")

TEST 7: Variance Inflation Factors

VIF Values:
    Variable       VIF
1        age  1.012199
2     income  1.020473
3  education  1.013279

Note: VIF > 10 indicates potential multicollinearity


<a id='pubtable'></a>
# 3. Publication Tables

Create publication-ready tables with the `pubtable` module.

In [10]:
import importlib
from sociopathit.analyses import pubtable as pubtable_module
from IPython.display import HTML, display

importlib.reload(pubtable_module)
from sociopathit.analyses.pubtable import (
    proportion_table, 
    descriptive_table, 
    regression_table,
    save_table
)

# Simulate survey data
np.random.seed(42)
n = 300
df_survey = pd.DataFrame({
    'gender': np.random.choice(['Male', 'Female'], n),
    'education': np.random.choice(['High School', 'Bachelor', 'Graduate'], n),
    'response': np.random.choice(['Yes', 'No', 'Maybe'], n, p=[0.4, 0.4, 0.2]),
    'age': np.random.normal(40, 15, n),
    'income': np.random.normal(55000, 25000, n),
    'satisfaction': np.random.normal(7, 2, n),
    'weight': np.random.uniform(0.8, 1.2, n)
})

print(df_survey.head())
print(f"\nData shape: {df_survey.shape}")

   gender    education response        age         income  satisfaction  \
0    Male  High School       No  30.743644  106059.910568      7.746792   
1  Female  High School    Maybe  21.117369   67129.377631      8.014032   
2    Male     Graduate       No  57.832015   58632.405994      7.396579   
3    Male     Bachelor      Yes  39.380374   97839.349771      6.480788   
4    Male     Bachelor      Yes  42.609495   80253.424981      7.361291   

     weight  
0  1.190396  
1  1.066940  
2  0.804171  
3  0.902165  
4  0.904285  

Data shape: (300, 7)


In [11]:
# Test 1: Proportion Table (One-way)
print("=" * 60)
print("TEST 1: One-Way Proportion Table")
print("=" * 60)

html1 = proportion_table(
    df=df_survey,
    row_var='response',
    title='Distribution of Responses',
    ci=True,
    show_n=True
)

display(HTML(html1))

TEST 1: One-Way Proportion Table


response,%,95% CI,n
Maybe,18.0,"[10.0, 30.3]",54
No,42.0,"[33.7, 50.7]",126
Yes,40.0,"[31.7, 48.9]",120


In [12]:
# Test 2: Proportion Table (Cross-tabulation)
print("=" * 60)
print("TEST 2: Cross-Tabulation Proportion Table")
print("=" * 60)

html2 = proportion_table(
    df=df_survey,
    row_var='response',
    col_var='gender',
    title='Response by Gender',
    decimals=1
)

display(HTML(html2))

TEST 2: Cross-Tabulation Proportion Table


response,Female,Male
Maybe,20.8,15.2
No,38.9,45.0
Yes,40.3,39.7


In [13]:
# Test 3: Weighted Proportion Table
print("=" * 60)
print("TEST 3: Weighted Proportion Table")
print("=" * 60)

html3 = proportion_table(
    df=df_survey,
    row_var='education',
    weight_var='weight',
    title='Education Distribution (Weighted)',
    ci=True,
    show_n=True
)

display(HTML(html3))

TEST 3: Weighted Proportion Table


education,%,95% CI,n
Bachelor,35.1,"[26.6, 44.7]",104
Graduate,31.0,"[22.6, 41.0]",94
High School,33.9,"[25.4, 43.5]",102


In [14]:
# Test 4: Descriptive Statistics Table
print("=" * 60)
print("TEST 4: Descriptive Statistics Table")
print("=" * 60)

html4 = descriptive_table(
    df=df_survey,
    variables=['age', 'income', 'satisfaction'],
    stats=['mean', 'sd', 'min', 'max', 'n'],
    decimals=2,
    title='Descriptive Statistics',
    var_labels={'age': 'Age (years)', 'income': 'Annual Income ($)', 'satisfaction': 'Life Satisfaction'}
)

display(HTML(html4))

TEST 4: Descriptive Statistics Table


Variable,Mean,Sd,Min,Max,N
Age (years),41.73,16.11,-1.77,84.64,300
Annual Income ($),57483.38,26825.21,-10246.04,134184.29,300
Life Satisfaction,7.08,2.04,1.97,12.4,300


In [15]:
# Test 5: Grouped Descriptive Statistics
print("=" * 60)
print("TEST 5: Grouped Descriptive Statistics")
print("=" * 60)

html5 = descriptive_table(
    df=df_survey,
    variables=['age', 'income'],
    group_var='gender',
    stats=['mean', 'sd', 'n'],
    decimals=1,
    title='Demographics by Gender'
)

display(HTML(html5))

TEST 5: Grouped Descriptive Statistics


Variable,Group,Mean,Sd,N
age,Female,42.2,15.4,149
,Male,41.2,16.8,151
income,Female,57561.4,26690.6,149
,Male,57406.4,26957.1,151


In [16]:
# Test 6: Regression Table (Single Model)
print("=" * 60)
print("TEST 6: Regression Table (Single Model)")
print("=" * 60)

# Fit a model
model_pub = ols(df_survey, 'satisfaction', ['age', 'income'])
results_df = model_pub.get_tidy()

html6 = regression_table(
    models=results_df,
    title='OLS Regression: Life Satisfaction',
    show_se=True,
    show_stars=True,
    var_labels={'age': 'Age', 'income': 'Income', 'const': 'Intercept'},
    stats_rows={
        'N': [int(model_pub.get_stats()['N'])],
        'R_squared': [model_pub.get_stats()['R_squared']]
    }
)

display(HTML(html6))

TEST 6: Regression Table (Single Model)


Unnamed: 0,Model 1
Intercept,7.958***
,(0.406)
Age,-0.009
,(0.008)
Income,-0.000*
,(0.000)
N,300.00
R_squared,0.02


In [17]:
# Test 7: Regression Table (Multiple Models)
print("=" * 60)
print("TEST 7: Regression Table (Multiple Models)")
print("=" * 60)

# Fit multiple models
model_a = ols(df_survey, 'satisfaction', 'age')
model_b = ols(df_survey, 'satisfaction', ['age', 'income'])

html7 = regression_table(
    models=[model_a.get_tidy(), model_b.get_tidy()],
    model_names=['Model 1', 'Model 2'],
    title='Regression Models Comparison',
    show_se=True,
    show_stars=True,
    var_labels={'age': 'Age', 'income': 'Income', 'const': 'Intercept'},
    stats_rows={
        'N': [int(model_a.get_stats()['N']), int(model_b.get_stats()['N'])],
        'R_squared': [model_a.get_stats()['R_squared'], model_b.get_stats()['R_squared']]
    }
)

display(HTML(html7))

TEST 7: Regression Table (Multiple Models)


Unnamed: 0,Model 1,Model 2
Intercept,7.495***,7.958***
,(0.348),(0.406)
Age,-0.010,-0.009
,(0.008),(0.008)
Income,,-0.000*
,,(0.000)
N,300.00,300.00
R_squared,0.01,0.02


In [18]:
# Test 8: Regression Table with Confidence Intervals
print("=" * 60)
print("TEST 8: Regression Table with Confidence Intervals")
print("=" * 60)

html8 = regression_table(
    models=model_pub.get_tidy(),
    title='OLS Regression with Confidence Intervals',
    show_se=False,
    show_ci=True,
    show_stars=True,
    var_labels={'age': 'Age', 'income': 'Income', 'const': 'Intercept'}
)

display(HTML(html8))

TEST 8: Regression Table with Confidence Intervals


Unnamed: 0,Model 1
Intercept,7.958***
,"[7.162, 8.754]"
Age,-0.009
,"[-0.025, 0.006]"
Income,-0.000*
,"[-0.000, -0.000]"


# Summary - All Tests Complete ✅

All analysis modules have been successfully tested!

## Modules Tested

### Core Analyses
1. **Regression Module** (`regress.py`)
   - ✅ OLS regression (weighted & unweighted)
   - ✅ Logistic regression
   - ✅ Poisson regression
   - ✅ Model comparison
   - ✅ Predictions & diagnostics (VIF)

2. **Publication Tables Module** (`pubtable.py`)
   - ✅ Proportion tables (one-way & cross-tabulation)
   - ✅ Weighted proportion tables
   - ✅ Descriptive statistics tables
   - ✅ Grouped descriptive statistics
   - ✅ Regression coefficient tables
   - ✅ Confidence intervals & significance stars

### New Modules (2025-10-16)
3. **Descriptive Statistics Module** (`descriptive.py`)
   - ✅ Correlation matrices (Pearson, Spearman, Kendall)
   - ✅ Cross-tabulations with chi-square tests
   - ✅ Effect sizes (Cramér's V, phi)
   - ✅ Grouped summary statistics
   - ✅ Group comparison tests (ANOVA, t-test)
   - ✅ Distribution tests for normality

4. **Causal Inference Module** (`causal.py`)
   - ✅ Propensity score analysis (IPW, matching, regression adjustment)
   - ✅ Balance checking for covariate balance
   - ✅ Difference-in-differences (DiD) analysis
   - ✅ Instrumental variables (2SLS) regression
   - ✅ Regression discontinuity design (RDD)

5. **Panel Data Module** (`panel.py`)
   - ✅ Fixed effects (within) estimation
   - ✅ Random effects (GLS) estimation
   - ✅ First-difference estimation
   - ✅ Hausman specification test
   - ✅ Panel descriptive statistics

6. **Machine Learning Module** (`ml.py`)
   - ✅ Automated preprocessing pipelines
   - ✅ Random Forest (classification & regression)
   - ✅ Cross-validation
   - ✅ Feature importance extraction
   - ✅ Performance metrics

### Data Handling
7. **.dta Files with Categorical Variables**
   - ✅ Loading with `load_stata()`
   - ✅ Ordered categorical preservation
   - ✅ Unordered categorical handling
   - ✅ Variable labels preservation
   - ✅ Integration with all analysis modules

## Quick Navigation
Use the table of contents at the top to jump to any section!

## Dependencies Status
- ✅ **Required**: pandas, numpy, statsmodels
- ✅ **Recommended**: scipy (for descriptive stats)
- ✅ **Optional**: sklearn (for ML), linearmodels (for enhanced panel features)

---
**Package**: sociopathit  
**Test Date**: 2025-10-16  
**Status**: All modules functional and tested

<a id='descriptive'></a>
# 4. Descriptive Statistics

Correlation matrices, crosstabs, grouped summaries with the `descriptive` module.

In [19]:
import importlib
from sociopathit.analyses import descriptive as desc_module

importlib.reload(desc_module)
from sociopathit.analyses.descriptive import (
    correlation_matrix, crosstab, group_summary, 
    distribution_test, describe_by_group, compare_groups
)

# Generate test data
np.random.seed(42)
n = 250
df_desc = pd.DataFrame({
    'age': np.random.normal(40, 12, n),
    'income': np.random.normal(55000, 15000, n),
    'education': np.random.normal(14, 2, n),
    'satisfaction': np.random.normal(7, 1.5, n),
    'region': np.random.choice(['North', 'South', 'East', 'West'], n),
    'gender': np.random.choice(['Male', 'Female'], n),
    'employed': np.random.choice(['Yes', 'No'], n, p=[0.7, 0.3]),
    'weight': np.random.uniform(0.8, 1.2, n)
})

# Add some correlation
df_desc['income'] = df_desc['income'] + df_desc['education'] * 3000
df_desc['satisfaction'] = df_desc['satisfaction'] + df_desc['income'] / 20000

print(df_desc.head())
print(f"\nData shape: {df_desc.shape}")

         age         income  education  satisfaction region  gender employed  \
0  45.960570   83643.805970  15.852355      9.797340   East    Male       No   
1  38.340828  122224.429049  17.818833     11.083695   West    Male      Yes   
2  47.772262  120440.937512  11.202865     11.558237  North  Female      Yes   
3  58.276358  115864.794328  15.125938     14.373702   West  Female      Yes   
4  37.190160   70305.595096  12.698715      9.091181   East  Female       No   

     weight  
0  1.086529  
1  0.828834  
2  0.828503  
3  0.804843  
4  1.182601  

Data shape: (250, 8)


In [20]:
# Test 1: Correlation Matrix
print("=" * 60)
print("TEST 1: Correlation Matrix (Pearson)")
print("=" * 60)

corr_mat, pval_mat = correlation_matrix(
    df_desc, 
    variables=['age', 'income', 'education', 'satisfaction'],
    method='pearson'
)

print("\nCorrelation Matrix:")
print(corr_mat.round(3))
print("\nP-values:")
print(pval_mat.round(3))

TEST 1: Correlation Matrix (Pearson)

Correlation Matrix:
                age  income  education  satisfaction
age           1.000  -0.006      0.027         0.034
income       -0.006   1.000      0.408         0.365
education     0.027   0.408      1.000         0.170
satisfaction  0.034   0.365      0.170         1.000

P-values:
                age  income  education  satisfaction
age           0.000   0.927      0.668         0.596
income        0.927   0.000      0.000         0.000
education     0.668   0.000      0.000         0.007
satisfaction  0.596   0.000      0.007         0.000


In [21]:
# Test 2: Crosstab with Chi-Square
print("=" * 60)
print("TEST 2: Crosstab with Chi-Square Test")
print("=" * 60)

crosstab_result = crosstab(
    df_desc,
    row_var='gender',
    col_var='employed',
    show_chi2=True,
    show_effect_size=True
)

print("\nContingency Table:")
print(crosstab_result['table'])
print(f"\nChi-square: {crosstab_result['chi2']:.3f}")
print(f"P-value: {crosstab_result['p_value']:.3f}")
print(f"Cramér's V: {crosstab_result['cramers_v']:.3f}")

TEST 2: Crosstab with Chi-Square Test

Contingency Table:
employed  No  Yes
gender           
Female    36   89
Male      34   91

Chi-square: 0.020
P-value: 0.888
Cramér's V: 0.009


In [22]:
# Test 3: Group Summary
print("=" * 60)
print("TEST 3: Grouped Descriptive Statistics")
print("=" * 60)

group_stats = group_summary(
    df_desc,
    variables=['income', 'satisfaction'],
    group_var='region',
    stats=['count', 'mean', 'std']
)

print("\nGrouped Statistics by Region:")
print(group_stats)

# Test 4: Compare Groups
print("\n" + "=" * 60)
print("TEST 4: Compare Groups (ANOVA)")
print("=" * 60)

comparison = compare_groups(df_desc, 'income', 'region', test='anova')
print(f"\nTest: {comparison['test']}")
print(f"Statistic: {comparison['statistic']:.3f}")
print(f"P-value: {comparison['p_value']:.3f}")

TEST 3: Grouped Descriptive Statistics

Grouped Statistics by Region:
  region  income_count   income_mean    income_std  satisfaction_count  \
0   East            62  97312.505805  15427.685956                  62   
1  North            62  97465.056178  14765.904839                  62   
2  South            61  98012.944346  16248.920807                  61   
3   West            65  94682.561924  18839.202779                  65   

   satisfaction_mean  satisfaction_std  
0          12.015529          1.697530  
1          11.948252          1.400435  
2          12.151954          1.393657  
3          12.039471          1.580121  

TEST 4: Compare Groups (ANOVA)

Test: One-way ANOVA
Statistic: 0.524
P-value: 0.666


<a id='causal'></a>
# 5. Causal Inference

Propensity scores, DiD, IV regression with the `causal` module.

In [23]:
import importlib
from sociopathit.analyses import causal as causal_module

importlib.reload(causal_module)
from sociopathit.analyses.causal import (
    propensity_score, difference_in_differences, 
    instrumental_variables, regression_discontinuity
)

# Generate causal inference test data
np.random.seed(42)
n = 400

# Propensity score data
df_ps = pd.DataFrame({
    'age': np.random.normal(45, 10, n),
    'income': np.random.normal(50000, 15000, n),
    'education': np.random.normal(14, 2, n),
})

# Treatment assignment based on covariates (selection bias)
ps_true = 1 / (1 + np.exp(-(
    -2 + 0.03*df_ps['age'] + 0.00002*df_ps['income'] + 0.2*df_ps['education']
)))
df_ps['treated'] = (np.random.random(n) < ps_true).astype(int)

# Outcome with treatment effect
treatment_effect = 5.0
df_ps['outcome'] = (
    50 + 
    0.5*df_ps['age'] + 
    0.0001*df_ps['income'] + 
    2*df_ps['education'] +
    treatment_effect*df_ps['treated'] +
    np.random.normal(0, 5, n)
)

print("Propensity Score Data:")
print(df_ps.head())
print(f"Treatment rate: {df_ps['treated'].mean():.2%}")

Propensity Score Data:
         age        income  education  treated     outcome
0  49.967142  26083.585118  15.876568        1  110.760953
1  43.617357  41009.374656  12.967911        1   97.512754
2  51.476885  50078.655496  14.192242        1  113.717388
3  60.230299  50704.708906  13.075449        1  115.727781
4  42.658466  43249.017928  13.131008        1  114.483399
Treatment rate: 96.25%


In [24]:
# Test 1: Propensity Score Analysis with IPW
print("=" * 60)
print("TEST 1: Propensity Score Analysis (IPW)")
print("=" * 60)

ps_model = propensity_score(
    df_ps,
    treatment='treated',
    covariates=['age', 'income', 'education'],
    outcome='outcome'
)

print("\nPropensity scores (first 10):")
print(ps_model.propensity_scores.head(10))

# Estimate ATE
ate_result = ps_model.estimate_ate(method='ipw')
print(f"\nAverage Treatment Effect (IPW):")
print(f"  ATE: {ate_result['ate']:.3f}")
print(f"  SE: {ate_result['se']:.3f}")
print(f"  95% CI: [{ate_result['ci_lower']:.3f}, {ate_result['ci_upper']:.3f}]")
print(f"  P-value: {ate_result['p_value']:.3f}")
print(f"\nTrue treatment effect: {treatment_effect:.3f}")

TEST 1: Propensity Score Analysis (IPW)

Propensity scores (first 10):
0    0.955312
1    0.977163
2    0.956110
3    0.942455
4    0.977343
5    0.973621
6    0.935026
7    0.961538
8    0.963857
9    0.967703
dtype: float64

Average Treatment Effect (IPW):
  ATE: 5.249
  SE: 1.860
  95% CI: [1.604, 8.894]
  P-value: 0.005

True treatment effect: 5.000


In [25]:
# Test 2: Difference-in-Differences
print("=" * 60)
print("TEST 2: Difference-in-Differences")
print("=" * 60)

# Generate DiD data
np.random.seed(42)
n_units = 100
df_did = pd.DataFrame({
    'unit': np.repeat(range(n_units), 2),
    'time': np.tile([0, 1], n_units),
    'treated': np.repeat([0]*50 + [1]*50, 2),
})

# Outcome: parallel trends with DiD effect
did_effect = 8.0
df_did['outcome'] = (
    20 + 
    5*df_did['treated'] +  # Group difference
    3*df_did['time'] +  # Time trend
    did_effect*df_did['treated']*df_did['time'] +  # DiD effect
    np.random.normal(0, 2, len(df_did))
)

did_result = difference_in_differences(
    df_did,
    outcome='outcome',
    treatment='treated',
    time='time',
    unit='unit'
)

print(f"\nDiD Estimate: {did_result['did_estimate']:.3f}")
print(f"SE: {did_result['se']:.3f}")
print(f"P-value: {did_result['p_value']:.3f}")
print(f"95% CI: [{did_result['ci_lower']:.3f}, {did_result['ci_upper']:.3f}]")
print(f"\nTrue DiD effect: {did_effect:.3f}")

TEST 2: Difference-in-Differences

DiD Estimate: 8.344
SE: 0.527
P-value: 0.000
95% CI: [7.311, 9.377]

True DiD effect: 8.000


<a id='sem'></a>
# 5. Structural Equation Modeling (SEM)

Test path analysis and mediation models.

In [26]:
import importlib
from sociopathit.analyses import sem as sem_module
importlib.reload(sem_module)
from sociopathit.analyses.sem import PathModel, mediation, path_analysis

print("SEM module loaded successfully")

SEM module loaded successfully


In [27]:
# Test 1: Simple Mediation Analysis
print("=" * 60)
print("TEST 1: Simple Mediation Model")
print("=" * 60)

# Simulate mediation data
np.random.seed(42)
n = 300
df_mediation = pd.DataFrame({
    'treatment': np.random.choice([0, 1], n),
    'mediator': np.random.normal(50, 10, n),
    'outcome': np.random.normal(100, 15, n),
    'confounder': np.random.normal(0, 1, n)
})

# Create mediation structure
df_mediation['mediator'] = (
    40 + 10 * df_mediation['treatment'] + 
    5 * df_mediation['confounder'] +
    np.random.normal(0, 5, n)
)
df_mediation['outcome'] = (
    80 + 8 * df_mediation['mediator'] + 
    5 * df_mediation['treatment'] +
    3 * df_mediation['confounder'] +
    np.random.normal(0, 8, n)
)

print(f"\nData shape: {df_mediation.shape}")
print(f"Treatment groups: {df_mediation['treatment'].value_counts().to_dict()}")

# Fit mediation model
model_med = mediation(
    df=df_mediation,
    x='treatment',
    m='mediator',
    y='outcome',
    standardize=True
)

print("\n" + model_med.summary())

# Calculate indirect effect
indirect, se = model_med.indirect_effect(['treatment', 'mediator', 'outcome'])
print(f"\nIndirect Effect: {indirect:.4f} (SE = {se:.4f})")
print(f"Z-score: {indirect/se:.2f}")
print(f"Significant: {'Yes' if abs(indirect/se) > 1.96 else 'No'}")

TEST 1: Simple Mediation Model

Data shape: (300, 4)
Treatment groups: {0: 151, 1: 149}

Path Model Summary

Model Fit:
  N: 300
  N_parameters: 5
  Log-Likelihood: -130.586
  AIC: 271.172
  BIC: 289.691
  R_squared_mediator: 0.406
  R_squared_outcome: 0.986


Path Coefficients:
     from       to  estimate  std.error       p.value  standardized
treatment mediator  0.637211   0.044645  1.406375e-35      0.637211
 mediator  outcome  0.984856   0.008869 6.742835e-244      0.984856
treatment  outcome  0.012764   0.008869  1.511755e-01      0.012764

Indirect Effect: 0.6276 (SE = 0.0443)
Z-score: 14.16
Significant: Yes


In [28]:
# Test 2: Complex Path Model
print("=" * 60)
print("TEST 2: Multiple Mediator Path Analysis")
print("=" * 60)

# Simulate complex path data
np.random.seed(42)
n = 400
df_path = pd.DataFrame({
    'X': np.random.normal(0, 1, n),
    'M1': np.random.normal(0, 1, n),
    'M2': np.random.normal(0, 1, n),
    'Y': np.random.normal(0, 1, n)
})

# Create path structure: X -> M1 -> M2 -> Y
df_path['M1'] = 0.6 * df_path['X'] + np.random.normal(0, 0.8, n)
df_path['M2'] = 0.5 * df_path['M1'] + 0.3 * df_path['X'] + np.random.normal(0, 0.8, n)
df_path['Y'] = 0.4 * df_path['M2'] + 0.2 * df_path['M1'] + 0.1 * df_path['X'] + np.random.normal(0, 0.8, n)

# Define path equations
equations = {
    'M1': ['X'],
    'M2': ['M1', 'X'],
    'Y': ['M2', 'M1', 'X']
}

# Fit path model
model_path = PathModel(df_path, equations, standardize=True)
model_path.fit()

print("\n" + model_path.summary())

# Get path coefficients
paths = model_path.get_paths()
print("\nPath Coefficients Table:")
print(paths.to_string(index=False))

# Calculate indirect effects
indirect_1 = model_path.indirect_effect(['X', 'M1', 'Y'])
indirect_2 = model_path.indirect_effect(['X', 'M2', 'Y'])
indirect_3 = model_path.indirect_effect(['X', 'M1', 'M2'])

print(f"\nIndirect Effects:")
print(f"  X -> M1 -> Y: {indirect_1[0]:.4f} (SE = {indirect_1[1]:.4f})")
print(f"  X -> M2 -> Y: {indirect_2[0]:.4f} (SE = {indirect_2[1]:.4f})")
print(f"  X -> M1 -> M2: {indirect_3[0]:.4f} (SE = {indirect_3[1]:.4f})")

TEST 2: Multiple Mediator Path Analysis

Path Model Summary

Model Fit:
  N: 400
  N_parameters: 9
  Log-Likelihood: -1398.709
  AIC: 2815.419
  BIC: 2851.342
  R_squared_M1: 0.320
  R_squared_M2: 0.447
  R_squared_Y: 0.414


Path Coefficients:
from to  estimate  std.error      p.value  standardized
   X M1  0.566083   0.041321 2.874724e-35      0.566083
  M1 M2  0.467597   0.045292 2.716231e-22      0.467597
   X M2  0.281190   0.045292 1.350078e-09      0.281190
  M2  Y  0.456968   0.051705 3.225384e-17      0.456968
  M1  Y  0.133183   0.052552 1.165154e-02      0.133183
   X  Y  0.140754   0.048873 4.193093e-03      0.140754

Path Coefficients Table:
from to  estimate  std.error      p.value  standardized
   X M1  0.566083   0.041321 2.874724e-35      0.566083
  M1 M2  0.467597   0.045292 2.716231e-22      0.467597
   X M2  0.281190   0.045292 1.350078e-09      0.281190
  M2  Y  0.456968   0.051705 3.225384e-17      0.456968
  M1  Y  0.133183   0.052552 1.165154e-02      0.133183
 

<a id='panel'></a>
# 6. Panel Data Models

Fixed effects, random effects, first differences with the `panel` module.

In [29]:
import importlib
from sociopathit.analyses import panel as panel_module

importlib.reload(panel_module)
from sociopathit.analyses.panel import (
    fixed_effects, random_effects, first_differences,
    hausman_test, panel_summary
)

# Generate panel data
np.random.seed(42)
n_entities = 50
n_time = 5

df_panel = pd.DataFrame({
    'entity': np.repeat(range(n_entities), n_time),
    'time': np.tile(range(n_time), n_entities),
})

# Add entity fixed effects
entity_effects = np.random.normal(0, 5, n_entities)
df_panel['entity_effect'] = entity_effects[df_panel['entity']]

# Add time-varying regressors
df_panel['x1'] = np.random.normal(10, 2, len(df_panel))
df_panel['x2'] = np.random.normal(5, 1, len(df_panel))

# Outcome with fixed effects
df_panel['y'] = (
    df_panel['entity_effect'] +
    2*df_panel['x1'] +
    -1*df_panel['x2'] +
    np.random.normal(0, 1, len(df_panel))
)

print("Panel Data:")
print(df_panel.head(10))
print(f"\nData shape: {df_panel.shape}")
print(f"Entities: {df_panel['entity'].nunique()}, Time periods: {df_panel['time'].nunique()}")

Panel Data:
   entity  time  entity_effect         x1        x2          y
0       0     0       2.483571  10.648168  4.171005  19.695491
1       0     1       2.483571   9.229835  4.439819  16.347745
2       0     2       2.483571   8.646156  5.747294  15.196371
3       0     3       2.483571  11.223353  5.610370  19.574326
4       0     4       2.483571  12.061999  4.979098  21.966073
5       1     0      -0.691322  11.862560  5.117327  17.504595
6       1     1      -0.691322   8.321565  6.277665   9.186537
7       1     2      -0.691322   9.381575  4.408429  13.230842
8       1     3      -0.691322  10.662527  5.547097  15.481087
9       1     4      -0.691322  11.951090  4.797807  17.992067

Data shape: (250, 6)
Entities: 50, Time periods: 5


In [30]:
# Test 1: Fixed Effects Model
print("=" * 60)
print("TEST 1: Fixed Effects (Within) Estimator")
print("=" * 60)

fe_result = fixed_effects(
    df_panel,
    outcome='y',
    inputs=['x1', 'x2'],
    entity='entity',
    time='time'
)

print("\nFixed Effects Coefficients:")
print(fe_result['coefficients'])
print("\nFit Statistics:")
for key, val in fe_result['fit_stats'].items():
    if isinstance(val, float):
        print(f"  {key}: {val:.4f}")
    else:
        print(f"  {key}: {val}")
        
print("\nTrue coefficients: x1=2.0, x2=-1.0")

TEST 1: Fixed Effects (Within) Estimator

Fixed Effects Coefficients:
  variable  estimate  std.error     t_stat       p.value
0       x1  2.003788   0.031433  63.747573  0.000000e+00
1       x2 -0.942080   0.065169 -14.455985  2.298171e-47

Fit Statistics:
  N: 250
  N_entities: 50
  R_squared: 0.9500

True coefficients: x1=2.0, x2=-1.0


<a id='ml'></a>
# 7. Machine Learning

Random Forest, feature importance, predictive modeling with the `ml` module.

In [31]:
import importlib
from sociopathit.analyses import ml as ml_module

importlib.reload(ml_module)
from sociopathit.analyses.ml import train_model, MLModel, feature_importance

# Generate ML test data
np.random.seed(42)
n = 500

df_ml = pd.DataFrame({
    'age': np.random.normal(40, 12, n),
    'income': np.random.normal(55000, 15000, n),
    'education': np.random.normal(14, 2, n),
    'experience': np.random.normal(15, 8, n),
    'region': np.random.choice(['North', 'South', 'East', 'West'], n),
})

# Classification target (employed)
employment_prob = 1 / (1 + np.exp(-(
    -5 +
    0.05*df_ml['age'] +
    0.00003*df_ml['income'] +
    0.3*df_ml['education'] +
    0.1*df_ml['experience']
)))
df_ml['employed'] = (np.random.random(n) < employment_prob).astype(int)

# Regression target (salary)
df_ml['salary'] = (
    20000 +
    500*df_ml['age'] +
    0.5*df_ml['income'] +
    3000*df_ml['education'] +
    1000*df_ml['experience'] +
    np.random.normal(0, 5000, n)
)

print("ML Test Data:")
print(df_ml.head())
print(f"\nEmployment rate: {df_ml['employed'].mean():.2%}")

ML Test Data:
         age        income  education  experience region  employed  \
0  45.960570  68892.663213  16.798711   21.226889  South         1   
1  38.340828  83641.249607  15.849267   10.590514   East         1   
2  47.772262  34021.486393  14.119261    8.454409  South         1   
3  58.276358  63444.538550  12.706126   14.973004   East         1   
4  37.190160  45240.361463  15.396447   13.638523   East         1   

          salary  
0  155050.032536  
1  136669.843126  
2  102326.302047  
3  137050.389318  
4  117866.311441  

Employment rate: 99.20%


In [32]:
# Test 1: Classification Model
print("=" * 60)
print("TEST 1: Random Forest Classification")
print("=" * 60)

ml_clf = train_model(
    df_ml,
    outcome='employed',
    features=['age', 'income', 'education', 'experience', 'region'],
    model_type='random_forest',
    task='classification',
    n_estimators=100,
    random_state=42
)

print(ml_clf.summary())

# Feature importance
feat_imp = ml_clf.get_feature_importance()
print("\nTop 5 Most Important Features:")
print(feat_imp.head())

TEST 1: Random Forest Classification
ML Model Summary

Task: classification
Outcome: employed
Features: 5
Train size: 400, Test size: 100

Training Performance:
  accuracy: 1.0000
  precision: 1.0000
  recall: 1.0000
  f1: 1.0000
  auc: 1.0000

Test Performance:
  accuracy: 1.0000
  precision: 1.0000
  recall: 1.0000
  f1: 1.0000
  auc: nan

Cross-validation: 0.9900 (+/- 0.0050)


Top 5 Most Important Features:
       feature  importance
0          age    0.319676
3   experience    0.229493
1       income    0.207631
2    education    0.163464
7  region_West    0.030101


In [33]:
# Test 2: Regression Model
print("=" * 60)
print("TEST 2: Random Forest Regression")
print("=" * 60)

ml_reg = train_model(
    df_ml,
    outcome='salary',
    features=['age', 'income', 'education', 'experience'],
    model_type='random_forest',
    task='regression',
    n_estimators=100,
    random_state=42
)

print(ml_reg.summary())

# Feature importance
feat_imp_reg = ml_reg.get_feature_importance()
print("\nFeature Importance:")
print(feat_imp_reg)

TEST 2: Random Forest Regression
ML Model Summary

Task: regression
Outcome: salary
Features: 4
Train size: 400, Test size: 100

Training Performance:
  r2: 0.9667
  rmse: 2628.1120
  mae: 2087.3531

Test Performance:
  r2: 0.7726
  rmse: 7003.3384
  mae: 5709.2503

Cross-validation: 0.7453 (+/- 0.0135)


Feature Importance:
      feature  importance
3  experience    0.339882
1      income    0.320233
0         age    0.188097
2   education    0.151788


<a id='stata'></a>
# 8. Stata .dta Files with Categorical Variables

Loading and working with .dta files that have ordered and unordered categorical variables.

### 8.1 Create and Load .dta File

Create test data with ordered and unordered categorical variables, save as .dta, and reload.

In [34]:
import tempfile
from pathlib import Path
from sociopathit.data.loading import load_stata

# Create test data with categorical variables
print("=" * 60)
print("Creating test .dta file with categorical variables")
print("=" * 60)

np.random.seed(42)
n = 300

# Create dataframe with various categorical types
df_stata = pd.DataFrame({
    'id': range(1, n+1),
    'age': np.random.normal(45, 12, n),
    'income': np.random.normal(60000, 20000, n),
    # Ordered categorical: education level
    'education': np.random.choice(['Less than HS', 'High School', 'Some College', 'Bachelor', 'Graduate'], n),
    # Unordered categorical: region
    'region': np.random.choice(['Northeast', 'Southeast', 'Midwest', 'Southwest', 'West'], n),
    # Binary categorical: gender
    'gender': np.random.choice(['Male', 'Female'], n),
    # Ordered categorical: satisfaction
    'satisfaction': np.random.choice(['Very Dissatisfied', 'Dissatisfied', 'Neutral', 'Satisfied', 'Very Satisfied'], n),
    # Numeric outcome (will be overwritten)
    'outcome': np.random.normal(100, 15, n),
})

# Convert to categorical with proper ordering
education_order = ['Less than HS', 'High School', 'Some College', 'Bachelor', 'Graduate']
satisfaction_order = ['Very Dissatisfied', 'Dissatisfied', 'Neutral', 'Satisfied', 'Very Satisfied']

df_stata['education'] = pd.Categorical(df_stata['education'], categories=education_order, ordered=True)
df_stata['satisfaction'] = pd.Categorical(df_stata['satisfaction'], categories=satisfaction_order, ordered=True)
df_stata['region'] = pd.Categorical(df_stata['region'], ordered=False)
df_stata['gender'] = pd.Categorical(df_stata['gender'], ordered=False)

# Add some relationship to outcome based on education (use .cat.codes for numeric conversion)
df_stata['outcome'] = (
    80 + 
    10 * df_stata['education'].cat.codes +  # Uses 0-4 for education levels
    0.0003 * df_stata['income'] +
    0.2 * df_stata['age'] +
    np.random.normal(0, 10, n)
)

# Create temporary directory for test file
temp_dir = Path(tempfile.gettempdir()) / 'sociopathit_tests'
temp_dir.mkdir(exist_ok=True)
dta_path = temp_dir / 'test_survey.dta'

# Save as Stata file
try:
    df_stata.to_stata(dta_path, write_index=False, version=118)
    print(f"\nSaved test .dta file to: {dta_path}")
except Exception as e:
    print(f"\nError saving .dta file: {e}")
    print("This is expected if you don't have write permissions to temp directory.")
    # Create in current directory as fallback
    dta_path = Path('.') / 'test_survey.dta'
    df_stata.to_stata(dta_path, write_index=False, version=118)
    print(f"Saved to current directory instead: {dta_path}")

# Load the Stata file with categorical conversion
df_loaded = load_stata(dta_path, convert_categoricals=True)

print("\n" + "=" * 60)
print("Loaded .dta file - checking categorical variables")
print("=" * 60)
print(f"\nDataFrame shape: {df_loaded.shape}")
print(f"\nCategorical columns:")
for col in df_loaded.select_dtypes(include='category').columns:
    cat_info = df_loaded[col].dtype
    print(f"  {col}: ordered={cat_info.ordered}")
    print(f"    Categories ({len(cat_info.categories)}): {list(cat_info.categories)[:3]}...")

print("\nFirst 5 rows:")
print(df_loaded.head())

Creating test .dta file with categorical variables

Saved test .dta file to: C:\Users\alecw\AppData\Local\Temp\sociopathit_tests\test_survey.dta

Loaded .dta file - checking categorical variables

DataFrame shape: (300, 8)

Categorical columns:
  education: ordered=True
    Categories (5): ['Less than HS', 'High School', 'Some College']...
  region: ordered=True
    Categories (5): ['Midwest', 'Northeast', 'Southeast']...
  gender: ordered=True
    Categories (2): ['Female', 'Male']...
  satisfaction: ordered=True
    Categories (5): ['Very Dissatisfied', 'Dissatisfied', 'Neutral']...

First 5 rows:
   id        age        income     education     region  gender  \
0   1  50.960570  43420.099782      Bachelor    Midwest    Male   
1   2  43.340828  48796.379196      Bachelor  Southeast  Female   
2   3  52.772262  74945.872102      Graduate  Northeast    Male   
3   4  63.276358  72207.405309   High School       West    Male   
4   5  42.190160  59581.968121  Some College  Southeast  F

In [35]:
# Test 1: Crosstab with Categorical Variables from .dta
print("=" * 60)
print("TEST 1: Crosstab with .dta Categorical Variables")
print("=" * 60)

# Import descriptive module
from sociopathit.analyses.descriptive import crosstab, group_summary

# Test crosstab with unordered categoricals
result_crosstab = crosstab(
    df_loaded,
    row_var='gender',
    col_var='region',
    show_chi2=True,
    show_effect_size=True
)

print("\nGender x Region Crosstab:")
print(result_crosstab['table'])
print(f"\nChi-square: {result_crosstab['chi2']:.3f}, p-value: {result_crosstab['p_value']:.3f}")
print(f"Cramér's V: {result_crosstab['cramers_v']:.3f}")

# Test with ordered categorical
result_crosstab2 = crosstab(
    df_loaded,
    row_var='education',
    col_var='satisfaction',
    normalize='index',
    show_chi2=True
)

print("\n\nEducation x Satisfaction Crosstab (row percentages):")
print(result_crosstab2['proportions'].round(3))
print(f"\nChi-square: {result_crosstab2['chi2']:.3f}, p-value: {result_crosstab2['p_value']:.3f}")

TEST 1: Crosstab with .dta Categorical Variables

Gender x Region Crosstab:
region  Midwest  Northeast  Southeast  Southwest  West
gender                                                
Female       28         36         39         26    34
Male         36         20         23         25    33

Chi-square: 7.538, p-value: 0.110
Cramér's V: 0.159


Education x Satisfaction Crosstab (row percentages):
satisfaction  Very Dissatisfied  Dissatisfied  Neutral  Satisfied  \
education                                                           
Less than HS              0.200         0.231    0.138      0.215   
High School               0.180         0.164    0.230      0.230   
Some College              0.215         0.185    0.154      0.215   
Bachelor                  0.208         0.226    0.132      0.245   
Graduate                  0.232         0.304    0.161      0.107   

satisfaction  Very Satisfied  
education                     
Less than HS           0.215  
High School        

### 8.2 Crosstabs with Categorical Variables

In [36]:
# Test 2: Grouped Summary by Categorical Variables
print("=" * 60)
print("TEST 2: Grouped Summary by Categorical (.dta)")
print("=" * 60)

# Group by education level (ordered categorical)
grouped_by_edu = group_summary(
    df_loaded,
    variables=['income', 'age', 'outcome'],
    group_var='education',
    stats=['count', 'mean', 'std']
)

print("\nSummary Statistics by Education Level:")
print(grouped_by_edu.round(2))

# Group by region (unordered categorical)
grouped_by_region = group_summary(
    df_loaded,
    variables='outcome',
    group_var='region',
    stats=['count', 'mean', 'std', 'min', 'max']
)

print("\n\nOutcome by Region:")
print(grouped_by_region.round(2))

TEST 2: Grouped Summary by Categorical (.dta)

Summary Statistics by Education Level:
      education  income_count  income_mean  income_std  age_count  age_mean  \
0  Less than HS            65     58725.30    19635.03         65     44.07   
1   High School            61     57881.61    17954.95         61     48.34   
2  Some College            65     64358.65    20994.47         65     44.01   
3      Bachelor            53     59739.84    17938.75         53     45.07   
4      Graduate            56     56671.67    18806.55         56     43.17   

   age_std  outcome_count  outcome_mean  outcome_std  
0    12.24             65        105.81         9.99  
1    12.08             61        118.11        12.49  
2    12.46             65        129.15        10.93  
3    10.46             53        138.33        11.49  
4    11.08             56        143.57        10.03  


Outcome by Region:
      region outcome                              
               count    mean    std  

### 8.3 Grouped Summary Statistics

In [37]:
# Test 3: Regression with Categorical Variables from .dta
print("=" * 60)
print("TEST 3: OLS Regression with Categorical Predictors (.dta)")
print("=" * 60)

# Prepare data: convert categorical to dummy variables for regression
df_reg = df_loaded.copy()

# Get numeric columns and categorical columns
numeric_cols = df_reg.select_dtypes(include=[np.number]).columns.tolist()
if 'id' in numeric_cols:
    numeric_cols.remove('id')

# Convert categoricals to dummies
cat_cols = df_reg.select_dtypes(include='category').columns.tolist()
df_reg_dummies = pd.get_dummies(df_reg, columns=cat_cols, drop_first=True, dtype=int)

print(f"\nOriginal columns: {len(df_loaded.columns)}")
print(f"After dummy encoding: {len(df_reg_dummies.columns)}")

# Fit regression with categorical dummies
predictor_cols = [col for col in df_reg_dummies.columns 
                  if col not in ['outcome', 'id']]

from sociopathit.analyses.regress import ols

model_cat = ols(
    df_reg_dummies,
    outcome='outcome',
    inputs=predictor_cols,
    robust=True
)

print("\nRegression Results (with categorical dummies):")
results_tidy = model_cat.get_tidy()
print(results_tidy[results_tidy['p.value'] < 0.05].round(4))

stats = model_cat.get_stats()
print(f"\nR-squared: {stats['R_squared']:.4f}")
print(f"Adj R-squared: {stats['Adj_R_squared']:.4f}")
print(f"N: {stats['N']}")

TEST 3: OLS Regression with Categorical Predictors (.dta)

Original columns: 8
After dummy encoding: 17

Regression Results (with categorical dummies):
                         term  estimate  std.error  statistic  p.value  \
0                       const   82.2721     3.5161    23.3990   0.0000   
1                         age    0.2059     0.0553     3.7250   0.0002   
2                      income    0.0002     0.0000     8.5545   0.0000   
3       education_High School   11.7438     1.6665     7.0470   0.0000   
4      education_Some College   21.9451     1.7440    12.5832   0.0000   
5          education_Bachelor   32.1642     1.7392    18.4932   0.0000   
6          education_Graduate   38.3324     1.6589    23.1069   0.0000   
12  satisfaction_Dissatisfied    3.3896     1.6961     1.9984   0.0457   

    conf.low  conf.high  
0    75.3807    89.1634  
1     0.0976     0.3143  
2     0.0002     0.0003  
3     8.4775    15.0101  
4    18.5269    25.3632  
5    28.7553    35.5730  

### 8.4 Regression with Categorical Dummies

In [38]:
# Test 4: Machine Learning with Categorical Variables from .dta
print("=" * 60)
print("TEST 4: ML with Categorical Features (.dta)")
print("=" * 60)

# ML automatically handles categoricals through pipeline
from sociopathit.analyses.ml import train_model

# Convert outcome to binary for classification
df_ml_cat = df_loaded.copy()
median_outcome = df_ml_cat['outcome'].median()
df_ml_cat['high_outcome'] = (df_ml_cat['outcome'] > median_outcome).astype(int)

# Convert categorical columns to string for sklearn compatibility
cat_features = []
for col in df_ml_cat.select_dtypes(include='category').columns:
    df_ml_cat[col] = df_ml_cat[col].astype(str)
    cat_features.append(col)

numeric_features = ['age', 'income']

print(f"\nCategorical features: {cat_features}")
print(f"Numeric features: {numeric_features}")

# Train classification model
ml_model_cat = train_model(
    df_ml_cat,
    outcome='high_outcome',
    features=numeric_features + cat_features,
    model_type='random_forest',
    task='classification',
    n_estimators=50,
    random_state=42
)

print("\n" + ml_model_cat.summary())

# Get feature importance
feat_imp_cat = ml_model_cat.get_feature_importance()
print("\nTop 10 Most Important Features:")
print(feat_imp_cat.head(10))

TEST 4: ML with Categorical Features (.dta)

Categorical features: ['education', 'region', 'gender', 'satisfaction']
Numeric features: ['age', 'income']

ML Model Summary

Task: classification
Outcome: high_outcome
Features: 6
Train size: 240, Test size: 60

Training Performance:
  accuracy: 1.0000
  precision: 1.0000
  recall: 1.0000
  f1: 1.0000
  auc: 1.0000

Test Performance:
  accuracy: 0.7833
  precision: 0.7914
  recall: 0.7833
  f1: 0.7818
  auc: 0.8800

Cross-validation: 0.8250 (+/- 0.0583)


Top 10 Most Important Features:
                   feature  importance
1                   income    0.191933
5   education_Less than HS    0.170321
0                      age    0.139423
3       education_Graduate    0.111710
4    education_High School    0.076450
2       education_Bachelor    0.061647
6   education_Some College    0.031224
15    satisfaction_Neutral    0.026503
8         region_Northeast    0.024746
12           gender_Female    0.022285


### 8.5 Machine Learning with Categorical Features

In [39]:
# Test 5: Publication Tables with Categorical Variables from .dta
print("=" * 60)
print("TEST 5: Publication Tables with Categorical Data (.dta)")
print("=" * 60)

from sociopathit.analyses.pubtable import proportion_table, descriptive_table
from IPython.display import HTML, display

# Proportion table for ordered categorical
html_prop = proportion_table(
    df=df_loaded,
    row_var='education',
    col_var='satisfaction',
    title='Education Level by Satisfaction (from .dta)',
    decimals=1,
    show_n=True
)

print("\nEducation x Satisfaction Table:")
display(HTML(html_prop))

# Descriptive table grouped by categorical
html_desc = descriptive_table(
    df=df_loaded,
    variables=['age', 'income', 'outcome'],
    group_var='region',
    stats=['mean', 'sd', 'n'],
    decimals=2,
    title='Demographics by Region (from .dta)'
)

print("\nDescriptive Statistics by Region:")
display(HTML(html_desc))

TEST 5: Publication Tables with Categorical Data (.dta)

Education x Satisfaction Table:


education,Very Dissatisfied,Dissatisfied,Neutral,Satisfied,Very Satisfied
Less than HS,21.0,22.7,18.4,23.0,22.6
High School,17.7,15.2,28.6,23.0,19.4
Some College,22.6,18.2,20.4,23.0,24.2
Bachelor,17.7,18.2,14.3,21.3,16.1
Graduate,21.0,25.8,18.4,9.8,17.7



Descriptive Statistics by Region:


Variable,Group,Mean,Sd,N
age,Midwest,45.3,12.12,64
,Northeast,46.58,13.91,56
,Southeast,45.12,11.01,62
,Southwest,44.13,9.63,51
,West,43.65,11.54,67
income,Midwest,64999.19,22070.49,64
,Northeast,60136.71,19727.56,56
,Southeast,60134.96,15395.76,62
,Southwest,54361.41,21306.41,51
,West,57353.08,15521.47,67


### 8.6 Publication Tables with Categorical Data

<a id='textanalysis'></a>
# 9. Text Analysis

Text preprocessing, complexity metrics, similarity, topic modeling with the `text_analysis` module.

In [40]:
import importlib
from sociopathit.analyses import text_analysis as text_module

importlib.reload(text_module)
from sociopathit.analyses.text_analysis import (
    clean_text, tokenize, complexity_scores, 
    jaccard_similarity, create_tfidf_matrix, TopicModel,
    extract_ngrams, ngram_frequency, analyze_corpus,
    SentimentAnalyzer, BERTModel, OllamaClassifier,
    HAS_TRANSFORMERS, HAS_REQUESTS, HAS_SKLEARN
)

# Create sample texts for testing
sample_texts = [
    "The quick brown fox jumps over the lazy dog. This is a simple sentence for testing text analysis.",
    "Natural language processing is an exciting field of study. It combines linguistics and computer science.",
    "Machine learning algorithms can analyze large amounts of text data. They identify patterns and extract insights.",
    "Text mining techniques help researchers discover hidden patterns. Statistical methods are commonly used.",
    "Data science involves collecting, processing, and analyzing data. Python is a popular programming language for data analysis."
]

print("Sample Texts Generated:")
print(f"Number of texts: {len(sample_texts)}")
print(f"\nFirst text: {sample_texts[0][:100]}...")
print(f"\nDependencies available:")
print(f"  - Transformers: {HAS_TRANSFORMERS}")
print(f"  - Requests (for Ollama): {HAS_REQUESTS}")
print(f"  - Scikit-learn: {HAS_SKLEARN}")

Sample Texts Generated:
Number of texts: 5

First text: The quick brown fox jumps over the lazy dog. This is a simple sentence for testing text analysis....

Dependencies available:
  - Transformers: True
  - Requests (for Ollama): True
  - Scikit-learn: True


In [41]:
# Test 3: Text Similarity
print("=" * 60)
print("TEST 3: Text Similarity")
print("=" * 60)

text1 = "The cat sat on the mat"
text2 = "The dog sat on the rug"
text3 = "Machine learning is fascinating"

jac_sim_12 = jaccard_similarity(text1, text2)
jac_sim_13 = jaccard_similarity(text1, text3)

print(f"\nText 1: {text1}")
print(f"Text 2: {text2}")
print(f"Text 3: {text3}")
print(f"\nJaccard Similarity (Text 1 vs Text 2): {jac_sim_12:.3f}")
print(f"Jaccard Similarity (Text 1 vs Text 3): {jac_sim_13:.3f}")

TEST 3: Text Similarity

Text 1: The cat sat on the mat
Text 2: The dog sat on the rug
Text 3: Machine learning is fascinating

Jaccard Similarity (Text 1 vs Text 2): 0.429
Jaccard Similarity (Text 1 vs Text 3): 0.000


In [42]:
# Test 6: N-gram Analysis
print("=" * 60)
print("TEST 6: N-gram Analysis")
print("=" * 60)

text_for_ngrams = "machine learning and data science are exciting fields of study"

# Extract bigrams
bigrams = extract_ngrams(text_for_ngrams, n=2, top_k=5)
print(f"\nText: {text_for_ngrams}")
print(f"\nTop 5 Bigrams:")
for ngram, count in bigrams:
    print(f"  {' '.join(ngram)}: {count}")

# N-gram frequency across multiple texts
ngram_freq = ngram_frequency(sample_texts, n=2, top_k=10)
print(f"\nTop 10 Bigrams Across All Texts:")
print(ngram_freq)

TEST 6: N-gram Analysis

Text: machine learning and data science are exciting fields of study

Top 5 Bigrams:
  machine learning: 1
  learning and: 1
  and data: 1
  data science: 1
  science are: 1

Top 10 Bigrams Across All Texts:
            ngram  frequency
0         (is, a)          2
1    (the, quick)          1
2  (quick, brown)          1
3    (brown, fox)          1
4    (fox, jumps)          1
5   (jumps, over)          1
6     (over, the)          1
7     (the, lazy)          1
8     (lazy, dog)          1
9     (dog, this)          1


In [43]:
# Test 6: Correlation Network
print("=" * 60)
print("TEST 6: Correlation Network")
print("=" * 60)

from sociopathit.analyses.network import correlation_network, network_summary

# Use existing data
corr_network = correlation_network(
    df_desc,
    variables=['age', 'income', 'education', 'satisfaction'],
    method='pearson',
    threshold=0.3,
    absolute=False
)

print("\nCorrelation Network (threshold=0.3):")
print(corr_network)

# Summary
if len(corr_network) > 0:
    corr_stats = network_summary(corr_network, directed=False)
    print("\nNetwork Statistics:")
    for key, value in corr_stats.items():
        print(f"  {key}: {value}")

TEST 6: Correlation Network

Correlation Network (threshold=0.3):
   source        target    weight  correlation
0  income     education  0.408189     0.408189
1  income  satisfaction  0.364734     0.364734

Network Statistics:
  n_nodes: 3
  n_edges: 2
  density: 0.3333333333333333
  avg_degree: 1.3333333333333333
  max_degree: 2
  min_degree: 1
  total_weight: 0.7729233321616
  avg_weight: 0.3864616660808


In [44]:
# Test 5: Similarity-based Network
print("=" * 60)
print("TEST 5: Similarity-based Network")
print("=" * 60)

# Create feature data
feature_data = pd.DataFrame({
    'id': ['Item1', 'Item2', 'Item3', 'Item4'],
    'feature1': [1.0, 2.0, 1.5, 3.0],
    'feature2': [2.0, 3.0, 2.5, 4.0],
    'feature3': [1.5, 1.0, 1.2, 2.0]
})

print("\nFeature Data:")
print(feature_data)

try:
    from sociopathit.analyses.network import similarity_network
    
    sim_network = similarity_network(
        feature_data,
        features=['feature1', 'feature2', 'feature3'],
        node_id_col='id',
        similarity_metric='cosine',
        threshold=0.9
    )
    
    print("\nSimilarity Network (cosine, threshold=0.9):")
    print(sim_network)
except ImportError as e:
    print(f"\nSkipping similarity network test: {e}")
    print("scikit-learn is required for this feature")

TEST 5: Similarity-based Network

Feature Data:
      id  feature1  feature2  feature3
0  Item1       1.0       2.0       1.5
1  Item2       2.0       3.0       1.0
2  Item3       1.5       2.5       1.2
3  Item4       3.0       4.0       2.0

Similarity Network (cosine, threshold=0.9):
  source target    weight
0  Item1  Item2  0.942954
1  Item1  Item3  0.977723
2  Item1  Item4  0.965517
3  Item2  Item3  0.991810
4  Item2  Item4  0.992583
5  Item3  Item4  0.995393


In [45]:
# Test 4: Bipartite Network
print("=" * 60)
print("TEST 4: Bipartite Network and Projection")
print("=" * 60)

from sociopathit.analyses.network import create_bipartite_edgelist, project_bipartite

# Create bipartite edge list (users and products)
user_product = pd.DataFrame({
    'user': ['User1', 'User1', 'User2', 'User2', 'User3', 'User3', 'User4'],
    'product': ['ProductA', 'ProductB', 'ProductA', 'ProductC', 'ProductB', 'ProductC', 'ProductA'],
    'rating': [5, 4, 3, 5, 4, 2, 5]
})

print("\nUser-Product Data:")
print(user_product)

bipartite_edges = create_bipartite_edgelist(
    user_product,
    node_type1_col='user',
    node_type2_col='product',
    weight_col='rating'
)

print("\nBipartite Edge List:")
print(bipartite_edges)

# Project onto users
user_network = project_bipartite(bipartite_edges, project_on='type1', weight_method='simple')

print("\nUser-User Network (shared products):")
print(user_network)

TEST 4: Bipartite Network and Projection

User-Product Data:
    user   product  rating
0  User1  ProductA       5
1  User1  ProductB       4
2  User2  ProductA       3
3  User2  ProductC       5
4  User3  ProductB       4
5  User3  ProductC       2
6  User4  ProductA       5

Bipartite Edge List:
  type1_node type2_node  weight
0      User1   ProductA       5
1      User1   ProductB       4
2      User2   ProductA       3
3      User2   ProductC       5
4      User3   ProductB       4
5      User3   ProductC       2
6      User4   ProductA       5

User-User Network (shared products):
  source target  weight  shared_neighbors
0  User1  User2       1                 1
1  User1  User3       1                 1
2  User1  User4       1                 1
3  User2  User3       1                 1
4  User2  User4       1                 1


In [46]:
# Test 3: Co-occurrence Network
print("=" * 60)
print("TEST 3: Co-occurrence Network")
print("=" * 60)

from sociopathit.analyses.network import cooccurrence_network, network_summary

# Create co-occurrence data (e.g., actors in movies)
movies_data = pd.DataFrame({
    'movie': ['Movie1', 'Movie1', 'Movie1', 'Movie2', 'Movie2', 'Movie3', 'Movie3', 'Movie3'],
    'actor': ['Alice', 'Bob', 'Charlie', 'Alice', 'Bob', 'Bob', 'Charlie', 'David']
})

print("\nMovies Data:")
print(movies_data)

cooccur_net = cooccurrence_network(
    movies_data,
    item_col='actor',
    group_col='movie',
    min_cooccurrence=1
)

print("\nCo-occurrence Network:")
print(cooccur_net)

# Network summary
net_stats = network_summary(cooccur_net, directed=False)
print("\nNetwork Statistics:")
for key, value in net_stats.items():
    print(f"  {key}: {value}")

TEST 3: Co-occurrence Network

Movies Data:
    movie    actor
0  Movie1    Alice
1  Movie1      Bob
2  Movie1  Charlie
3  Movie2    Alice
4  Movie2      Bob
5  Movie3      Bob
6  Movie3  Charlie
7  Movie3    David

Co-occurrence Network:
    source   target  weight  cooccurrence_count
0      Bob  Charlie       2                   2
1    Alice      Bob       2                   2
2    Alice  Charlie       1                   1
3      Bob    David       1                   1
4  Charlie    David       1                   1

Network Statistics:
  n_nodes: 4
  n_edges: 5
  density: 0.4166666666666667
  avg_degree: 2.5
  max_degree: 3
  min_degree: 2
  total_weight: 7
  avg_weight: 1.4


In [47]:
# Test 1 & 2: Create Edge List and Adjacency Matrix
print("=" * 60)
print("TEST 1: Create Edge List from DataFrame")
print("=" * 60)

# Create sample interaction data
np.random.seed(42)
interactions = pd.DataFrame({
    'person_a': ['Alice', 'Bob', 'Alice', 'Charlie', 'Bob', 'Alice'],
    'person_b': ['Bob', 'Charlie', 'Charlie', 'Alice', 'Alice', 'Bob'],
    'strength': [5, 3, 2, 4, 6, 1]
})

print("\nInteraction Data:")
print(interactions)

# Ensure import
from sociopathit.analyses.network import create_edgelist, create_adjacency_matrix, adjacency_to_edgelist

# Create directed edge list
edgelist_directed = create_edgelist(
    interactions, 
    source_col='person_a',
    target_col='person_b',
    weight_col='strength',
    directed=True
)

print("\nDirected Edge List:")
print(edgelist_directed)

# Create undirected edge list
edgelist_undirected = create_edgelist(
    interactions,
    source_col='person_a',
    target_col='person_b',
    weight_col='strength',
    directed=False
)

print("\nUndirected Edge List:")
print(edgelist_undirected)

# TEST 2: Adjacency Matrix
print("\n" + "=" * 60)
print("TEST 2: Create Adjacency Matrix")
print("=" * 60)

adj_matrix = create_adjacency_matrix(edgelist_directed, weighted=True)

print("\nAdjacency Matrix (weighted):")
print(adj_matrix)

# Convert back to edge list
edgelist_from_adj = adjacency_to_edgelist(adj_matrix, directed=True)

print("\nEdge List from Adjacency Matrix:")
print(edgelist_from_adj)

TEST 1: Create Edge List from DataFrame

Interaction Data:
  person_a person_b  strength
0    Alice      Bob         5
1      Bob  Charlie         3
2    Alice  Charlie         2
3  Charlie    Alice         4
4      Bob    Alice         6
5    Alice      Bob         1

Directed Edge List:
    source   target  weight
0    Alice      Bob       6
1    Alice  Charlie       2
2      Bob    Alice       6
3      Bob  Charlie       3
4  Charlie    Alice       4

Undirected Edge List:
  source   target  weight
0  Alice      Bob      12
1  Alice  Charlie       6
2    Bob  Charlie       3

TEST 2: Create Adjacency Matrix

Adjacency Matrix (weighted):
         Alice  Bob  Charlie
Alice      0.0  6.0      2.0
Bob        6.0  0.0      3.0
Charlie    4.0  0.0      0.0

Edge List from Adjacency Matrix:
    source   target  weight
0    Alice      Bob     6.0
1    Alice  Charlie     2.0
2      Bob    Alice     6.0
3      Bob  Charlie     3.0
4  Charlie    Alice     4.0


In [48]:
import importlib
from sociopathit.analyses import network as network_module

importlib.reload(network_module)
from sociopathit.analyses.network import (
    create_edgelist, create_adjacency_matrix, cooccurrence_network,
    create_bipartite_edgelist, project_bipartite, similarity_network,
    correlation_network, network_summary
)

print("Network Module Loaded Successfully")

Network Module Loaded Successfully


In [49]:
# Test 7: Sentiment Analysis (Lexicon-based)
print("=" * 60)
print("TEST 7: Sentiment Analysis - Lexicon Method")
print("=" * 60)

test_texts = [
    "This is an amazing and wonderful product! I love it!",
    "This is terrible, awful, and horrible. I hate it.",
    "This is okay. Nothing special.",
    "The weather today is nice."
]

try:
    analyzer = SentimentAnalyzer(method='lexicon')
    
    for text in test_texts:
        result = analyzer.analyze(text)
        print(f"\nText: {text}")
        print(f"Sentiment: {result['label']} (score: {result['score']:.3f})")
    
    print("\n✓ Lexicon-based sentiment analysis tests passed")
    
except Exception as e:
    print(f"✗ Lexicon-based sentiment analysis test failed: {e}")

TEST 7: Sentiment Analysis - Lexicon Method

Text: This is an amazing and wonderful product! I love it!
Sentiment: POSITIVE (score: 0.300)

Text: This is terrible, awful, and horrible. I hate it.
Sentiment: NEGATIVE (score: 0.444)

Text: This is okay. Nothing special.
Sentiment: NEUTRAL (score: 0.000)

Text: The weather today is nice.
Sentiment: NEUTRAL (score: 0.000)

✓ Lexicon-based sentiment analysis tests passed


In [50]:
# Test 8: Sentiment Analysis (Transformer-based)
print("=" * 60)
print("TEST 8: Sentiment Analysis - Transformer Method")
print("=" * 60)

if not HAS_TRANSFORMERS:
    print("⚠ Transformers library not available. Skipping transformer tests.")
    print("  Install with: pip install transformers torch")
else:
    test_texts = [
        "This is an amazing and wonderful product! I love it!",
        "This is terrible and disappointing. Very bad experience.",
        "The item is okay, nothing particularly special."
    ]
    
    try:
        print("Loading sentiment analysis model (this may take a moment)...")
        analyzer = SentimentAnalyzer(method='transformer')
        
        print("\nAnalyzing texts...")
        for text in test_texts:
            result = analyzer.analyze(text)
            print(f"\nText: {text}")
            print(f"Sentiment: {result['label']} (confidence: {result['score']:.3f})")
        
        # Test batch processing
        print("\n\nTesting batch processing...")
        results = analyzer.analyze(test_texts)
        print(f"Processed {len(results)} texts in batch")
        
        print("\n✓ Transformer-based sentiment analysis tests passed")
        
    except Exception as e:
        print(f"✗ Transformer-based sentiment analysis test failed: {e}")

TEST 8: Sentiment Analysis - Transformer Method
Loading sentiment analysis model (this may take a moment)...


Device set to use cpu



Analyzing texts...

Text: This is an amazing and wonderful product! I love it!
Sentiment: POSITIVE (confidence: 1.000)

Text: This is terrible and disappointing. Very bad experience.
Sentiment: NEGATIVE (confidence: 1.000)

Text: The item is okay, nothing particularly special.
Sentiment: NEGATIVE (confidence: 0.970)


Testing batch processing...
Processed 3 texts in batch

✓ Transformer-based sentiment analysis tests passed


In [51]:
# Test 9: BERT Embeddings
print("=" * 60)
print("TEST 9: BERT Embeddings")
print("=" * 60)

if not HAS_TRANSFORMERS:
    print("⚠ Transformers library not available. Skipping BERT tests.")
    print("  Install with: pip install transformers torch")
else:
    test_texts = [
        "The cat is sleeping on the couch.",
        "A feline is resting on the sofa.",
        "Dogs are playing in the park."
    ]
    
    try:
        print("Loading BERT model (this may take a moment)...")
        bert_model = BERTModel(model_name='bert-base-uncased', task='embedding')
        
        print("\nGenerating embeddings...")
        embeddings = bert_model.get_embeddings(test_texts, pooling='mean')
        print(f"Embeddings shape: {embeddings.shape}")
        print(f"First embedding (first 10 dims): {embeddings[0][:10]}")
        
        # Test similarity
        print("\n\nTesting semantic similarity...")
        sim_12 = bert_model.similarity(test_texts[0], test_texts[1])
        sim_13 = bert_model.similarity(test_texts[0], test_texts[2])
        
        print(f"Similarity between texts 1 and 2 (similar meaning): {sim_12:.3f}")
        print(f"Similarity between texts 1 and 3 (different meaning): {sim_13:.3f}")
        
        if sim_12 > sim_13:
            print("✓ BERT correctly identified similar texts as more similar")
        else:
            print("⚠ Unexpected similarity scores")
        
        print("\n✓ BERT embeddings tests passed")
        
    except Exception as e:
        print(f"✗ BERT embeddings test failed: {e}")

TEST 9: BERT Embeddings
Loading BERT model (this may take a moment)...

Generating embeddings...
Embeddings shape: (3, 768)
First embedding (first 10 dims): [ 0.0578401  -0.27792972 -0.01247833  0.07429158  0.5953165  -0.7032491
 -0.2844429   0.5824173  -0.10914177 -0.353179  ]


Testing semantic similarity...
Similarity between texts 1 and 2 (similar meaning): 0.839
Similarity between texts 1 and 3 (different meaning): 0.801
✓ BERT correctly identified similar texts as more similar

✓ BERT embeddings tests passed


In [52]:
# Test 10: Ollama Integration
print("=" * 60)
print("TEST 10: Ollama Integration")
print("=" * 60)

if not HAS_REQUESTS:
    print("⚠ Requests library not available. Skipping Ollama tests.")
    print("  Install with: pip install requests")
else:
    print("⚠ Note: Ollama tests require a running Ollama server at localhost:11434")
    print("   If Ollama is not running, these tests will be skipped.")
    
    try:
        classifier = OllamaClassifier(model='llama2')
        
        # Quick test to see if Ollama is available
        print("\nTesting Ollama connection...")
        test_result = classifier.classify(
            text="This is a test.",
            categories=["test", "production"]
        )
        
        print("✓ Ollama is available!")
        
        # Test classification
        print("\n\nTesting text classification...")
        test_texts = [
            "I love this product! It's amazing!",
            "This is terrible and doesn't work at all.",
            "It's okay, nothing special."
        ]
        
        categories = ["positive", "negative", "neutral"]
        
        for text in test_texts:
            result = classifier.classify(text, categories)
            print(f"\nText: {text}")
            print(f"Category: {result['category']}")
        
        # Test entity extraction
        print("\n\nTesting entity extraction...")
        text = "Apple Inc. was founded by Steve Jobs in Cupertino, California."
        entities = classifier.extract_entities(text)
        print(f"\nText: {text}")
        print("Entities:")
        for entity in entities:
            print(f"  - {entity['text']} ({entity['type']})")
        
        # Test summarization
        print("\n\nTesting summarization...")
        long_text = """
        Machine learning is a subset of artificial intelligence that focuses on the
        development of algorithms and statistical models that enable computers to
        improve their performance on tasks through experience. Instead of being
        explicitly programmed, machine learning systems learn from data.
        """
        summary = classifier.summarize(long_text, max_length=30)
        print(f"\nOriginal length: {len(long_text.split())} words")
        print(f"Summary: {summary}")
        print(f"Summary length: {len(summary.split())} words")
        
        print("\n✓ Ollama integration tests passed")
        
    except RuntimeError as e:
        if "Ollama API call failed" in str(e):
            print(f"\n⚠ Ollama server not available. Skipping Ollama tests.")
            print("   To test Ollama functionality, start the Ollama server and run again.")
        else:
            print(f"\n✗ Ollama integration test failed: {e}")
    except Exception as e:
        print(f"\n✗ Ollama integration test failed: {e}")

TEST 10: Ollama Integration
⚠ Note: Ollama tests require a running Ollama server at localhost:11434
   If Ollama is not running, these tests will be skipped.

Testing Ollama connection...

⚠ Ollama server not available. Skipping Ollama tests.
   To test Ollama functionality, start the Ollama server and run again.


<a id='network'></a>
# 10. Network Data Preparation

Edge lists, adjacency matrices, co-occurrence networks, bipartite networks with the `network` module.

In [53]:
# Test 5: Topic Modeling
print("=" * 60)
print("TEST 5: Topic Modeling (LDA)")
print("=" * 60)

try:
    # Create more texts for better topic modeling
    extended_texts = sample_texts + [
        "Python programming language is widely used for data analysis and machine learning tasks.",
        "Statistical analysis helps in understanding data patterns and relationships.",
        "Deep learning models require large amounts of training data.",
        "Text classification is an important task in natural language processing.",
        "Data visualization makes it easier to communicate insights from analysis."
    ]
    
    topic_model = TopicModel(n_topics=3, method='lda', random_state=42)
    topic_model.fit(extended_texts)
    
    topics = topic_model.get_topics(n_words=5)
    doc_topics = topic_model.get_document_topics()
    
    print(f"\nTopics (top 5 words each):")
    print(topics)
    
    print(f"\nDocument-Topic Distribution (first 5 documents):")
    print(doc_topics.head())
    
    print(f"\nDominant topics for first 5 documents:")
    print(topic_model.get_dominant_topic().head())
    
except ImportError as e:
    print(f"\nSkipping topic modeling test: {e}")
    print("scikit-learn is required for this feature")

TEST 5: Topic Modeling (LDA)

Topics (top 5 words each):
    Topic 1      Topic 2     Topic 3
0      data         data  processing
1  analysis           of    language
2      text      amounts          is
3       and        large     natural
4       for  statistical          an

Document-Topic Distribution (first 5 documents):
    Topic 1   Topic 2   Topic 3
0  0.962141  0.018852  0.019007
1  0.021977  0.021540  0.956482
2  0.956776  0.022576  0.020647
3  0.950695  0.025135  0.024170
4  0.957404  0.020551  0.022045

Dominant topics for first 5 documents:
0    Topic 1
1    Topic 3
2    Topic 1
3    Topic 1
4    Topic 1
dtype: object


In [54]:
# Test 4: TF-IDF Matrix
print("=" * 60)
print("TEST 4: TF-IDF Vectorization")
print("=" * 60)

try:
    tfidf_df, vectorizer = create_tfidf_matrix(sample_texts, max_features=20)
    
    print(f"\nTF-IDF Matrix Shape: {tfidf_df.shape}")
    print(f"Number of features: {len(vectorizer.get_feature_names_out())}")
    
    # Get top terms overall
    from sociopathit.analyses.text_analysis import get_top_tfidf_terms
    top_terms = get_top_tfidf_terms(tfidf_df, n_terms=10)
    
    print(f"\nTop 10 TF-IDF Terms:")
    for term, score in top_terms.items():
        print(f"  {term}: {score:.3f}")
except ImportError as e:
    print(f"\nSkipping TF-IDF test: {e}")
    print("scikit-learn is required for this feature")

TEST 4: TF-IDF Vectorization

TF-IDF Matrix Shape: (5, 20)
Number of features: 20

Top 10 TF-IDF Terms:
  data: 0.235
  text: 0.181
  and: 0.168
  patterns: 0.162
  of: 0.151
  is: 0.146
  the: 0.140
  language: 0.120
  processing: 0.120
  science: 0.120


In [55]:
# Test 2: Complexity and Readability Scores
print("=" * 60)
print("TEST 2: Complexity and Readability Scores")
print("=" * 60)

test_text = sample_texts[0]
scores = complexity_scores(test_text)

print(f"\nText: {test_text}")
print(f"\nComplexity Metrics:")
for metric, value in scores.items():
    print(f"  {metric}: {value:.2f}")

TEST 2: Complexity and Readability Scores

Text: The quick brown fox jumps over the lazy dog. This is a simple sentence for testing text analysis.

Complexity Metrics:
  flesch_reading_ease: 80.20
  flesch_kincaid_grade: 4.31
  lexical_diversity: 0.94
  avg_word_length: 4.44
  word_count: 18.00
  sentence_count: 3.00


In [56]:
# Test 1: Text Cleaning and Preprocessing
print("=" * 60)
print("TEST 1: Text Cleaning and Preprocessing")
print("=" * 60)

text = "The Quick BROWN Fox!!! Jumps over the lazy dog... 123"
cleaned = clean_text(text, lowercase=True, remove_punctuation=True, remove_numbers=True)
tokens = tokenize(cleaned)

print(f"\nOriginal text: {text}")
print(f"Cleaned text: {cleaned}")
print(f"Tokens: {tokens}")
print(f"Number of tokens: {len(tokens)}")

TEST 1: Text Cleaning and Preprocessing

Original text: The Quick BROWN Fox!!! Jumps over the lazy dog... 123
Cleaned text: the quick brown fox jumps over the lazy dog
Tokens: ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
Number of tokens: 9
