# In [1]:
import pytest
import pandas as pd
import numpy as np

# Loading the simulated data
@pytest.fixture
def simulation_data():
    """Provide the simulated dataset read from ``SimulationData.csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file. The path is
        relative, so it is resolved against the current working directory.
    """
    return pd.read_csv("SimulationData.csv")

#Checking if the dataset has the correct number of rows and columns
def test_data_shape(simulation_data):
    data = simulation_data
    expected_columns = [
        'frontline_prev_best', 'noncombat_prev_best', 'lead_prev_best',
        'frontline', 'noncombat', 'lead', 
        'linear_predictor', 'participation_prob', 'participation'
    ]
    assert data.shape[1] == len(expected_columns), "Unexpected number of columns"
    assert data.shape[0] > 0, "Dataset should have rows"

# Validating column data types
def test_column_data_types(simulation_data):
    data = simulation_data
    expected_types = {
        'frontline_prev_best': 'int64',
        'noncombat_prev_best': 'int64',
        'lead_prev_best': 'int64',
        'frontline': 'int64',
        'noncombat': 'int64',
        'lead': 'int64',
        'linear_predictor': 'float64',
        'participation_prob': 'float64',
        'participation': 'int64',
    }
    for col, dtype in expected_types.items():
        assert data[col].dtype == dtype, f"Column {col} has incorrect type"

# Check ranges for ordinal variables
def test_ordinal_variable_ranges(simulation_data):
    data = simulation_data
    for col in ['frontline_prev_best', 'noncombat_prev_best', 'lead_prev_best']:
        assert data[col].between(0, 4).all(), f"{col} values out of range"

# Test 4: Check binary variables are 0 or 1
def test_binary_variables(simulation_data):
    data = simulation_data
    binary_columns = ['frontline', 'noncombat', 'lead', 'participation']
    for col in binary_columns:
        assert data[col].isin([0, 1]).all(), f"{col} has non-binary values"

# Test 5: Validate probabilities are between 0 and 1
def test_probabilities_range(simulation_data):
    data = simulation_data
    assert data['participation_prob'].between(0, 1).all(), "Participation probabilities out of range"

# Checking correlation between predictor variables
def test_correlation_between_variables(simulation_data):
    data = simulation_data
    correlation = data[['frontline_prev_best', 'lead_prev_best']].corr().iloc[0, 1]
    assert correlation > 0.5, "Correlation between frontline_prev_best and lead_prev_best is weak"

# Validating participation logic
def test_participation_logic(simulation_data):
    data = simulation_data
    # Checking if higher predictors generally lead to higher participation
    high_prob = data[data['participation_prob'] > 0.8]
    assert high_prob['participation'].mean() > 0.8, "Participation logic inconsistency for high probabilities"

# Checking for missing values
def test_missing_values(simulation_data):
    data = simulation_data
    assert not data.isnull().values.any(), "Dataset contains missing values"


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
