In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def extract_conditions(pmh_string):
    if pd.isna(pmh_string) or pmh_string == '0':
        return []
    # Split by comma and handle special cases
    conditions = [c.strip() for c in str(pmh_string).split(',')]
    return conditions

def preprocess_medical_data(excel_path):
    try:
        df = pd.read_excel(excel_path, engine='openpyxl')
    except Exception as e:
        print(f"Error reading Excel file: {e}")
        return None
    
    df = df.copy()
    
    # 1. Define column types
    categorical_columns = ['gender', 'CC', 'Dx']
    binary_columns = ['DM', 'HTN', 'Smoking', 'addiction', 'leukocytosis', 
                     'COVID-19 PCR', 'Sepsis']
    
    # 2. Convert binary columns to int (0 and 1)
    for col in binary_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        df[col] = df[col].astype(int)
    
    # 3. Handle Other_PMH separately
    # Get all unique medical conditions
    all_conditions = set()
    for conditions in df['Other_PMH'].apply(extract_conditions):
        all_conditions.update(conditions)
    
    # Remove '0' if it's in the set
    if '0' in all_conditions:
        all_conditions.remove('0')
    
    # Create binary columns for each condition
    for condition in all_conditions:
        col_name = f'PMH_{condition}'
        df[col_name] = df['Other_PMH'].apply(
            lambda x: 1 if condition in extract_conditions(x) else 0
        )
    
    # Drop original Other_PMH column
    df.drop('Other_PMH', axis=1, inplace=True)
    
    # 4. Process other categorical columns
    for col in categorical_columns:
        # Handle missing values
        df[col] = df[col].fillna('None')
        df[col] = df[col].replace('0', 'None')
        
        # Create dummies with actual value names
        dummies = pd.get_dummies(df[col], prefix=col, prefix_sep='_')
        dummies = dummies.astype(int)
        
        # Add the dummies to the dataframe
        df = pd.concat([df, dummies], axis=1)
        
        # Drop original column
        df.drop(col, axis=1, inplace=True)
    
    # 5. Handle the Outcome column
    if 'Outcome' in df.columns:
        df['Outcome'] = pd.to_numeric(df['Outcome'], errors='coerce').fillna(0).astype(int)
    
    return df

def main():
    excel_path = '/home/aricept094/mydata/mmH (4).xlsx'
    processed_df = preprocess_medical_data(excel_path)
    
    if processed_df is not None:
        # Save processed data
        processed_df.to_excel('/home/aricept094/mydata/processed_medical_data.xlsx', index=True, engine='openpyxl')
        
        # Print information
        print("\nProcessed dataset shape:", processed_df.shape)
        print("\nMedical condition columns:")
        pmh_cols = [col for col in processed_df.columns if col.startswith('PMH_')]
        for col in pmh_cols:
            print(col)
        
        print("\nSample of processed data:\n", processed_df.head())

if __name__ == "__main__":
    main()


Processed dataset shape: (302, 101)

Medical condition columns:
PMH_HF
PMH_آسم
PMH_آرتریت گرانولوماتوز
PMH_دمانس
PMH_HCV
PMH_BPH
PMH_اسپلنکتومی
PMH_TB
PMH_COPD
PMH_آبسه کبدی
PMH_RA
PMH_پارکینسون
PMH_ESRD
PMH_CVA
PMH_CANCER
PMH_HTLV1
PMH_IHD
PMH_CKD
PMH_HYPOT
PMH_DLP

Sample of processed data:
    index  age  DM  HTN  Smoking  addiction  leukocytosis  COVID-19 PCR  \
0      1   84   0    0        0          0             0             1   
1      2   72   0    1        0          0             1             1   
2      3   84   0    0        0          0             1             0   
3      4   88   0    1        1          0             0             0   
4      5   73   0    0        0          0             0             0   

   Outcome  Admission Duration  ...  Dx_پاروتیدیت  Dx_پنومونی  Dx_پیلونفریت  \
0        0                  10  ...             0           0             0   
1        1                   7  ...             0           0             0   
2        1            

In [7]:
import pandas as pd

# Data
data = {
    "Medical Condition": [
        "IHD (Ischemic Heart Disease)", "HF (Heart Failure)", "COPD (Chronic Obstructive Pulmonary Disease)", 
        "CVA (Cerebrovascular Accident)", "Dementia", "Cancer", "BPH (Benign Prostatic Hyperplasia)", 
        "ESRD (End-Stage Renal Disease)", "DLP (Dyslipidemia)", "CKD (Chronic Kidney Disease)", "TB (Tuberculosis)", 
        "Parkinson's Disease", "Hypothyroidism", "RA (Rheumatoid Arthritis)", "Asthma", 
        "Granulomatous Arthritis", "HCV (Hepatitis C Virus)", "Splenectomy", "Liver Abscess", 
        "HTLV-1 (Human T-lymphotropic virus type 1)"
    ],
    "Frequency (n)": [49, 20, 18, 16, 14, 13, 12, 10, 10, 9, 6, 4, 4, 3, 1, 1, 1, 1, 1, 1],
    "Prevalence (%)": [16.2, 6.6, 6.0, 5.3, 4.6, 4.3, 4.0, 3.3, 3.3, 3.0, 2.0, 1.3, 1.3, 1.0, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]
}

# Create DataFrame
df = pd.DataFrame(data)

# Save to Excel
file_path = "/home/aricept094/mydata/Medical_Conditions.xlsx"
df.to_excel(file_path, index=False)

In [8]:
# Data for Chief Complaints
chief_complaints_data = {
    "Chief Complaint": [
        "Fever (تب)", "Dyspnea (تنگی نفس)", "Loss of Consciousness (LOC)", "Skin Lesions (ضایعات پوستی)", 
        "Weakness (ضعف)", "Limb Edema (تورم اندام)", "Cough (سرفه)", "Diarrhea (اسهال)", 
        "Urinary Symptoms (علائم ادراری)", "Abdominal Pain (درد شکم)", "Headache (سردرد)", 
        "Nausea/Vomiting (N/V)", "Chest Pain (درد قفسه سینه)", "Loss of Balance (عدم تعادل)", 
        "Knee Pain (درد زانو)", "Bloody Stool (مدفوع خونی)", "Sore Throat (گلودرد)", 
        "Otorrhea (اتوره)", "Limb Numbness (بی حسی اندام)", "Seizure (تشنج)", "Scrotum Pain (درد اسکورتوم)", 
        "Eye Pain (درد چشم)", "Back Pain (درد کمر)", "Neck Stiffness (سفتی گردن)", 
        "Wound Infection (عفونت زخم)", "Neuromuscular Symptoms (علائم نوروموسکولار)", 
        "Hemoptysis (هموپتزی)"
    ],
    "Frequency (n)": [
        83, 61, 32, 27, 18, 17, 12, 12, 7, 4, 3, 3, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
    ],
    "Prevalence (%)": [
        27.5, 20.2, 10.6, 8.9, 6.0, 5.6, 4.0, 4.0, 2.3, 1.3, 1.0, 1.0, 1.0, 1.0, 0.7, 0.7, 0.7, 0.3, 0.3, 
        0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3
    ]
}

# Create DataFrame
chief_complaints_df = pd.DataFrame(chief_complaints_data)

# Save to Excel
chief_complaints_file_path = "/home/aricept094/mydata/Chief_Complaints.xlsx"
chief_complaints_df.to_excel(chief_complaints_file_path, index=False)

chief_complaints_file_path


'/home/aricept094/mydata/Chief_Complaints.xlsx'

In [6]:
import pandas as pd
from tabulate import tabulate

# Create the data
data = {
    'Chief Complaint': [
        'Neuromuscular symptoms', 'Seizure', 'Limb numbness', 'Otorrhea',
        'Back pain', 'Wound infection', 'Abdominal pain', 'Eye pain',
        'Limb edema', 'Knee pain', 'Back pain (duplicate)', 'Diarrhea',
        'Headache', 'Hemoptysis', 'LOC', 'Skin lesions', 'Weakness',
        'Urinary symptoms', 'Chest pain', 'Cough', 'N/V', 'Dyspnea',
        'Sore throat', 'Neck stiffness', 'Scrotal pain', 'Imbalance',
        'Bloody stool', 'Fever'
    ],
    'β': [
        49.58, 29.57, 19.25, 18.13, 16.43, 13.61, 11.97, 8.32,
        8.22, 7.77, 5.04, 0.79, 0.70, -0.70, -1.17, -1.09, -1.49,
        -1.65, -2.25, -2.32, -2.56, -2.77, -5.56, -5.72, -5.76,
        -6.96, -7.41, -0.58
    ],
    'SE': [
        8.54, 8.85, 8.96, 9.16, 8.97, 9.18, 4.52, 9.00,
        2.20, 6.36, 9.02, 2.66, 5.26, 9.02, 1.68, 1.82, 2.19,
        3.44, 5.22, 2.66, 5.26, 1.29, 6.40, 9.01, 9.01,
        5.21, 6.37, 1.16
    ],
    'P_value': [
        1.677200e-08, 9.436152e-04, 3.237962e-02, 4.864550e-02,
        0.068, 0.139, 8.499408e-03, 0.356, 2.198791e-04, 0.223,
        0.577, 0.767, 0.893, 0.938, 0.488, 0.549, 0.499,
        0.633, 0.667, 0.385, 0.627, 3.282147e-02, 0.386, 0.526,
        0.523, 0.182, 0.245, 0.619
    ]
}

# Create DataFrame
df = pd.DataFrame(data)

# Sort by absolute value of β coefficient
df['abs_beta'] = df['β'].abs()
df = df.sort_values('abs_beta', ascending=False)
df = df.drop('abs_beta', axis=1)

# Format p-values with significance markers
def format_pvalue(p):
    if p < 0.001:
        return f"{p:.3e}***"
    elif p < 0.01:
        return f"{p:.3f}**"
    elif p < 0.05:
        return f"{p:.3f}*"
    else:
        return f"{p:.3f}"

# Apply formatting
df['P_value'] = df['P_value'].apply(format_pvalue)
df['β'] = df['β'].round(2)
df['SE'] = df['SE'].round(2)

# Create table using tabulate
table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)

# Add table title and notes
title = "Table 1: Association between Chief Complaints and Admission Duration in Days (N=302)\n"
notes = "\nNote: Results from multiple linear regression models adjusted for age, gender, and outcome.\n"
notes += "β = regression coefficient (estimated change in admission duration in days)\n"
notes += "SE = standard error\n"
notes += "*p < 0.05, **p < 0.01, ***p < 0.001\n"
notes += "The table is ordered by coefficient magnitude (descending order)."

# Print the complete table
print(title + table + notes)

ModuleNotFoundError: No module named 'tabulate'

In [14]:
import pandas as pd
import numpy as np
from scipy.stats import norm  # Fixed import
from typing import Dict, List, Union, Tuple

class DemographicAnalysis:
    """
    A class for managing and analyzing demographic data in clinical research.
    
    Attributes:
        data (pd.DataFrame): The main demographic data storage
        total_participants (int): Total number of study participants
    """
    
    def __init__(self):
        """Initialize the demographic analysis with structured data."""
        # Define the demographic data
        self.demographic_data = {
            'Age': {
                'Mean': 76.61,
                'SD': 8.25,
                'n': 302  # Total participants based on gender distribution
            },
            'Gender': {
                'Female': 131,
                'Male': 171
            },
            'Risk_Factors': {
                'Smoking': 13,
                'Substance_Addiction': 4
            }
        }
        
        # Calculate total participants
        self.total_participants = sum(self.demographic_data['Gender'].values())
        
        # Create structured DataFrame
        self._create_structured_dataframe()
    
    def _create_structured_dataframe(self) -> None:
        """
        Create a structured pandas DataFrame from the demographic data.
        """
        # Initialize lists for DataFrame construction
        characteristics = []
        details = []
        values = []
        percentages = []
        
        # Process Age data
        characteristics.append('Age (years)')
        details.append('Mean ± SD')
        values.append(f"{self.demographic_data['Age']['Mean']:.2f} ± {self.demographic_data['Age']['SD']:.2f}")
        percentages.append(None)
        
        # Process Gender data
        for gender, count in self.demographic_data['Gender'].items():
            characteristics.append('Gender')
            details.append(gender)
            values.append(count)
            percentages.append(self._calculate_percentage(count))
        
        # Process Risk Factors data
        for factor, count in self.demographic_data['Risk_Factors'].items():
            characteristics.append('Risk Factors')
            details.append(factor.replace('_', ' '))
            values.append(count)
            percentages.append(self._calculate_percentage(count))
        
        # Create DataFrame
        self.df = pd.DataFrame({
            'Characteristic': characteristics,
            'Detail': details,
            'Value': values,
            'Percentage': percentages
        })
    
    def _calculate_percentage(self, value: int) -> float:
        """
        Calculate percentage from given value.
        
        Args:
            value (int): Count to calculate percentage from
            
        Returns:
            float: Calculated percentage
        """
        return round((value / self.total_participants) * 100, 1)
    
    def display_table(self) -> pd.DataFrame:
        """
        Display the demographic table in a formatted manner.
        
        Returns:
            pd.DataFrame: Formatted demographic table
        """
        # Create display DataFrame with formatted values
        display_df = self.df.copy()
        
        # Format the display of values and percentages
        for idx, row in display_df.iterrows():
            if row['Percentage'] is not None:
                display_df.at[idx, 'Value'] = f"{row['Value']} ({row['Percentage']}%)"
        
        # Drop the percentage column for display
        display_df = display_df.drop('Percentage', axis=1)
        
        return display_df
    
    def get_summary_statistics(self) -> Dict[str, Dict[str, float]]:
        """
        Calculate summary statistics for numerical variables.
        
        Returns:
            Dict: Dictionary containing summary statistics
        """
        stats_dict = {
            'Age': {
                'Mean': self.demographic_data['Age']['Mean'],
                'SD': self.demographic_data['Age']['SD'],
                'SE': self.demographic_data['Age']['SD'] / np.sqrt(self.total_participants),
            }
        }
        
        # Calculate confidence interval
        se = stats_dict['Age']['SE']
        mean = stats_dict['Age']['Mean']
        ci_lower = mean - (1.96 * se)
        ci_upper = mean + (1.96 * se)
        stats_dict['Age']['95%_CI_Lower'] = ci_lower
        stats_dict['Age']['95%_CI_Upper'] = ci_upper
        
        return stats_dict

    def save_to_csv(self, filename: str = "demographic_data.csv") -> None:
        """
        Save the demographic data to a CSV file.
        
        Args:
            filename (str): Name of the CSV file to save (default: 'demographic_data.csv')
        """
        # Save main demographic table
        self.display_table().to_csv(filename, index=False)
        
        # Create and save summary statistics
        summary_stats = self.get_summary_statistics()
        summary_rows = []
        
        for variable, stats in summary_stats.items():
            for stat_name, value in stats.items():
                summary_rows.append({
                    'Variable': variable,
                    'Statistic': stat_name,
                    'Value': f"{value:.2f}"
                })
        
        summary_df = pd.DataFrame(summary_rows)
        
        # Save summary statistics to a separate CSV
        summary_filename = f"summary_statistics_{filename}"
        summary_df.to_csv(summary_filename, index=False)
        
        print(f"Demographic data saved to {filename}")
        print(f"Summary statistics saved to {summary_filename}")

if __name__ == "__main__":
    # Initialize analysis
    demographics = DemographicAnalysis()
    
    # Display formatted table
    print("\nDemographic Characteristics Table:")
    print(demographics.display_table().to_string(index=False))
    
    # Display summary statistics
    print("\nSummary Statistics:")
    summary_stats = demographics.get_summary_statistics()
    for variable, stats in summary_stats.items():
        print(f"\n{variable}:")
        for stat_name, value in stats.items():
            print(f"  {stat_name}: {value:.2f}")
    
    # Save data to CSV files
    demographics.save_to_csv()


Demographic Characteristics Table:
Characteristic              Detail               Value
   Age (years)           Mean ± SD 76.61 ± 8.25 (nan%)
        Gender              Female         131 (43.4%)
        Gender                Male         171 (56.6%)
  Risk Factors             Smoking           13 (4.3%)
  Risk Factors Substance Addiction            4 (1.3%)

Summary Statistics:

Age:
  Mean: 76.61
  SD: 8.25
  SE: 0.47
  95%_CI_Lower: 75.68
  95%_CI_Upper: 77.54
Demographic data saved to demographic_data.csv
Summary statistics saved to summary_statistics_demographic_data.csv


In [16]:
import pandas as pd
import statsmodels.api as sm

# Load your data from the Excel file
file_path = '/home/aricept094/mydata/processed_medical_data.xlsx'
data = pd.read_excel(file_path)

# List of chief complaints (CC)
cc_list = [col for col in data.columns if col.startswith('CC_')]

# Prepare results DataFrame
results = pd.DataFrame(columns=['Chief Complaint', 'Coefficient', 'P-value', 'Significant'])

# Iterate through each chief complaint
for cc in cc_list:
    # Define the independent variables (including age, gender, and the specific CC)
    X = data[['age', 'gender_F', 'gender_M', cc]]
    # Add a constant to the independent variables matrix
    X = sm.add_constant(X)
    # Define the dependent variable (AdmissionDuration)
    y = data['AdmissionDuration']

    # Create and fit the linear regression model
    model = sm.OLS(y, X).fit()

    # Extract the coefficient, p-value for the CC
    coefficient = model.params[cc]
    p_value = model.pvalues[cc]

    # Determine if the relationship is statistically significant (e.g., p-value < 0.05)
    significant = 'Yes' if p_value < 0.05 else 'No'

    # Add the results to the DataFrame
    results = pd.concat([results, pd.DataFrame({'Chief Complaint': [cc],
                                               'Coefficient': [coefficient],
                                               'P-value': [p_value],
                                               'Significant': [significant]})], ignore_index=True)

# Print the results
print(results)

# Print only significant results:
print("\nSignificant relationships:")
print(results[results['Significant'] == 'Yes'])

# Optional: Save results to a CSV
results.to_csv('cc_admission_duration_regression_results.csv', index=False)

          Chief Complaint  Coefficient       P-value Significant
0                  CC_LOC    -1.103438  5.128266e-01          No
1                  CC_N/V    -2.755805  5.980953e-01          No
2                CC_اتوره    20.226746  2.444284e-02         Yes
3                CC_اسهال     1.079437  6.840380e-01          No
4         CC_بی حسی اندام    18.980684  3.499072e-02         Yes
5                   CC_تب    -0.677610  5.597510e-01          No
6                 CC_تشنج    29.356517  1.033701e-03         Yes
7             CC_تنگی نفس    -2.724236  3.561209e-02         Yes
8           CC_تورم اندام     8.359645  1.709544e-04         Yes
9         CC_درد اسکورتوم    -5.957959  5.091513e-01          No
10            CC_درد زانو     7.531583  2.377100e-01          No
11             CC_درد شکم    11.669140  1.025525e-02         Yes
12       CC_درد قفسه سینه    -2.485799  6.344543e-01          No
13             CC_درد چشم     8.122935  3.679547e-01          No
14             CC_درد کمر

  results = pd.concat([results, pd.DataFrame({'Chief Complaint': [cc],


In [3]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm

# Read the Excel file
df = pd.read_excel('/home/aricept094/mydata/processed_medical_data.xlsx')

# Convert numeric columns to appropriate data types
numeric_columns = ['age', 'AdmissionDuration']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Convert binary columns to numeric (0/1)
binary_columns = [col for col in df.columns if col.startswith(('gender_', 'CC_', 'PMH_'))]
for col in binary_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Convert Outcome to numeric
df['Outcome'] = pd.to_numeric(df['Outcome'], errors='coerce')

# Function to run adjusted regression for each CC
def analyze_cc_duration(df, cc_column):
    try:
        # Remove rows with NaN values
        analysis_df = df[[cc_column, 'age', 'gender_F', 'Outcome', 'AdmissionDuration']].dropna()
        
        # Create dummy variables for outcome (0 as reference)
        outcome_dummies = pd.get_dummies(analysis_df['Outcome'], prefix='Outcome', drop_first=True)
        
        # Prepare the data for regression
        X = pd.DataFrame({
            'Age': analysis_df['age'],
            'Gender_F': analysis_df['gender_F'],
            'Outcome_1': (analysis_df['Outcome'] == 1).astype(int),
            'Outcome_2': (analysis_df['Outcome'] == 2).astype(int),
            'CC': analysis_df[cc_column]
        })
        
        # Add constant for regression
        X = sm.add_constant(X)
        
        # Dependent variable
        y = analysis_df['AdmissionDuration']
        
        # Fit the model
        model = sm.OLS(y, X).fit()
        
        # Extract CC coefficient and p-value
        cc_coef = model.params['CC']
        cc_pval = model.pvalues['CC']
        cc_std_err = model.bse['CC']
        
        return {
            'CC': cc_column,
            'Coefficient': cc_coef,
            'Std_Error': cc_std_err,
            'P_value': cc_pval,
            'Significant': cc_pval < 0.05,
            'N': len(y),
            'R_squared': model.rsquared
        }
    except Exception as e:
        print(f"Error analyzing {cc_column}: {str(e)}")
        return None

# Get list of CC columns
cc_columns = [col for col in df.columns if col.startswith('CC_')]

# Analyze each CC
results = []
for cc in cc_columns:
    result = analyze_cc_duration(df, cc)
    if result is not None:
        results.append(result)

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Sort by p-value
results_df = results_df.sort_values('P_value')

# Print significant results
significant_results = results_df[results_df['Significant']]
print("\nStatistically Significant Chief Complaints (adjusted for age, gender, and outcome):")
print("================================================================")
print(significant_results.to_string(index=False))

# Optional: Add effect size interpretation
def interpret_effect_size(coef, std):
    if abs(coef) < std:
        return "Small"
    elif abs(coef) < 2 * std:
        return "Medium"
    else:
        return "Large"

significant_results['Effect_Size'] = significant_results.apply(
    lambda x: interpret_effect_size(x['Coefficient'], x['Std_Error']), axis=1
)

# Print detailed results
print("\nDetailed Results for Significant Associations:")
print("================================================================")
for _, row in significant_results.iterrows():
    print(f"\nChief Complaint: {row['CC']}")
    print(f"Coefficient: {row['Coefficient']:.3f} days")
    print(f"Standard Error: {row['Std_Error']:.3f}")
    print(f"P-value: {row['P_value']:.4f}")
    print(f"Effect Size: {row['Effect_Size']}")
    print(f"Sample Size: {row['N']}")
    print(f"R-squared: {row['R_squared']:.3f}")
    print("-" * 50)

# Save results
results_df.to_excel('cc_regression_results.xlsx', index=False)

# Print summary statistics
print("\nAnalysis Summary:")
print("================================================================")
print(f"Total CC analyzed: {len(cc_columns)}")
print(f"Number of significant CC: {len(significant_results)}")
print(f"Average R-squared for significant models: {significant_results['R_squared'].mean():.3f}")

non_significant_results = results_df[~results_df['Significant']]
print("\nNon-Significant Chief Complaints (adjusted for age, gender, and outcome):")
print("================================================================")
print(non_significant_results.to_string(index=False))

# Add detailed results for non-significant findings
print("\nDetailed Results for Non-Significant Associations:")
print("================================================================")
for _, row in non_significant_results.iterrows():
    print(f"\nChief Complaint: {row['CC']}")
    print(f"Coefficient: {row['Coefficient']:.3f} days")
    print(f"Standard Error: {row['Std_Error']:.3f}")
    print(f"P-value: {row['P_value']:.4f}")
    print(f"Sample Size: {row['N']}")
    print(f"R-squared: {row['R_squared']:.3f}")
    print("-" * 50)


Statistically Significant Chief Complaints (adjusted for age, gender, and outcome):
                   CC  Coefficient  Std_Error      P_value  Significant   N  R_squared
CC_علائم نوروموسکولار    49.582585   8.544791 1.677200e-08         True 302   0.111005
        CC_تورم اندام     8.224502   2.198330 2.198791e-04         True 302   0.054585
              CC_تشنج    29.567400   8.851294 9.436152e-04         True 302   0.045849
           CC_درد شکم    11.966741   4.516971 8.499408e-03         True 302   0.032813
      CC_بی حسی اندام    19.253611   8.955919 3.237962e-02         True 302   0.025101
          CC_تنگی نفس    -2.766752   1.290270 3.282147e-02         True 302   0.025024
             CC_اتوره    18.126739   9.155576 4.864550e-02         True 302   0.022819

Detailed Results for Significant Associations:

Chief Complaint: CC_علائم نوروموسکولار
Coefficient: 49.583 days
Standard Error: 8.545
P-value: 0.0000
Effect Size: Large
Sample Size: 302
R-squared: 0.111
---------------

In [25]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

# Read the Excel file
df = pd.read_excel('/home/aricept094/mydata/processed_medical_data.xlsx')

# Convert numeric columns to appropriate data types
numeric_columns = ['age']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# First, let's check what columns actually exist in the dataframe
existing_columns = df.columns.tolist()
print("Available columns in the dataset:")
print([col for col in existing_columns if col.startswith('Dx_')])

# Convert binary columns to numeric (0/1)
binary_columns = [col for col in df.columns if col.startswith(('gender_', 'Dx_'))]
for col in binary_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Convert Outcome to numeric
df['Outcome'] = pd.to_numeric(df['Outcome'], errors='coerce')

def safe_exp(x):
    """Safely compute exponential with clipping to prevent overflow"""
    try:
        return np.exp(np.clip(x, -20, 20))
    except:
        return np.nan

def analyze_dx_outcome(df, dx_column):
    try:
        # Remove rows with NaN values
        analysis_df = df[[dx_column, 'age', 'gender_F', 'Outcome']].dropna()
        
        # Check if we have enough data for each outcome level
        min_samples = 5
        for outcome in [0, 1, 2]:
            if (analysis_df['Outcome'] == outcome).sum() < min_samples:
                print(f"Warning: Insufficient data for outcome {outcome} in {dx_column}")
                return None
        
        results = {}
        
        # Run separate logistic regression for each non-reference outcome level
        for outcome in [1, 2]:
            y = (analysis_df['Outcome'] == outcome).astype(int)
            ref_outcome = (analysis_df['Outcome'] == 0).astype(int)
            
            # Only include cases with outcome 0 (reference) or current outcome
            mask = (analysis_df['Outcome'].isin([0, outcome]))
            subset_df = analysis_df[mask]
            
            X = pd.DataFrame({
                'Age': subset_df['age'],
                'Gender_F': subset_df['gender_F'],
                'DX': subset_df[dx_column]
            })
            X = sm.add_constant(X)
            y_subset = y[mask]
            
            try:
                # Try fitting with different optimization methods
                for method in ['newton', 'bfgs', 'lbfgs']:
                    try:
                        model = sm.Logit(y_subset, X).fit(method=method, maxiter=200)
                        break
                    except:
                        continue
                else:
                    print(f"Warning: All optimization methods failed for {dx_column}, outcome {outcome}")
                    continue
                
                dx_coef = model.params['DX']
                dx_pval = model.pvalues['DX']
                dx_std_err = model.bse['DX']
                
                # Safely calculate odds ratios and CIs
                odds_ratio = safe_exp(dx_coef)
                ci_lower = safe_exp(dx_coef - 1.96 * dx_std_err)
                ci_upper = safe_exp(dx_coef + 1.96 * dx_std_err)
                
                # Check for extreme values
                if odds_ratio > 1000:
                    print(f"Warning: Very large odds ratio for {dx_column}, outcome {outcome}")
                
                results[f'outcome_{outcome}'] = {
                    'DX': dx_column,
                    'Outcome': outcome,
                    'Coefficient': dx_coef,
                    'Odds_Ratio': odds_ratio,
                    'CI_Lower': ci_lower,
                    'CI_Upper': ci_upper,
                    'Std_Error': dx_std_err,
                    'P_value': dx_pval,
                    'Significant': dx_pval < 0.05,
                    'N': len(y_subset),
                    'N_case': y_subset.sum(),
                    'N_control': (1 - y_subset).sum(),
                    'Pseudo_R2': model.prsquared
                }
                
            except Exception as e:
                print(f"Error in model fitting for {dx_column}, outcome {outcome}: {str(e)}")
                continue
        
        return results if results else None
        
    except Exception as e:
        print(f"Error analyzing {dx_column}: {str(e)}")
        return None

# Get list of diagnosis columns
dx_columns = [col for col in df.columns if col.startswith('Dx_')]
print(f"\nFound {len(dx_columns)} diagnosis columns in the dataset")

# Analyze each diagnosis
all_results = []
for dx in dx_columns:
    results = analyze_dx_outcome(df, dx)
    if results is not None:
        for outcome_level in results:
            all_results.append(results[outcome_level])

# Check if we have any results
if not all_results:
    print("\nNo valid results were obtained. Please check the data structure and column names.")
else:
    # Convert results to DataFrame
    results_df = pd.DataFrame(all_results)
    
    # Sort by diagnosis and p-value
    results_df = results_df.sort_values(['DX', 'P_value'])
    
    # Print significant results
    significant_results = results_df[results_df['Significant']].copy()
    print("\nStatistically Significant Associations (adjusted for age and gender):")
    print("================================================================")
    if len(significant_results) > 0:
        display_cols = ['DX', 'Outcome', 'Odds_Ratio', 'CI_Lower', 'CI_Upper', 'P_value', 'N_case', 'N_control']
        print(significant_results[display_cols].to_string(index=False))
    else:
        print("No significant associations found")

    # Add effect size interpretation
    def interpret_effect_size(odds_ratio):
        try:
            or_val = float(odds_ratio)
            if 0.5 <= or_val <= 2:
                return "Small"
            elif 0.2 <= or_val < 0.5 or 2 < or_val <= 5:
                return "Medium"
            else:
                return "Large"
        except:
            return "Unable to determine"

    if len(significant_results) > 0:
        significant_results.loc[:, 'Effect_Size'] = significant_results['Odds_Ratio'].apply(interpret_effect_size)

        # Print detailed results
        print("\nDetailed Results for Significant Associations:")
        print("================================================================")
        for _, row in significant_results.iterrows():
            print(f"\nDiagnosis: {row['DX']}")
            print(f"Outcome Level: {row['Outcome']} vs 0 (reference)")
            print(f"Sample Size: {row['N']} (Cases: {row['N_case']}, Controls: {row['N_control']})")
            print(f"Odds Ratio: {row['Odds_Ratio']:.3f} (95% CI: {row['CI_Lower']:.3f}-{row['CI_Upper']:.3f})")
            print(f"P-value: {row['P_value']:.4f}")
            print(f"Effect Size: {row['Effect_Size']}")
            print(f"Pseudo R-squared: {row['Pseudo_R2']:.3f}")
            print("-" * 50)

    # Save detailed results
    results_df.to_excel('dx_regression_results.xlsx', index=False)

    # Print summary statistics
    print("\nAnalysis Summary:")
    print("================================================================")
    print(f"Total diagnoses analyzed: {len(dx_columns)}")
    print(f"Number of significant associations: {len(significant_results)}")
    if len(significant_results) > 0:
        print(f"Average Pseudo R-squared for significant models: {significant_results['Pseudo_R2'].mean():.3f}")

    # Summary of non-significant results
    non_significant_results = results_df[~results_df['Significant']]
    if len(non_significant_results) > 0:
        print("\nNon-Significant Associations Summary:")
        print("================================================================")
        display_cols = ['DX', 'Outcome', 'Odds_Ratio', 'CI_Lower', 'CI_Upper', 'P_value', 'N_case', 'N_control']
        print(non_significant_results[display_cols].to_string(index=False))

Available columns in the dataset:
['Dx_CCHF', 'Dx_FUO', 'Dx_HAV', 'Dx_Inf BS', 'Dx_Inf DF', 'Dx_ME OTITIS', 'Dx_N', 'Dx_SSI', 'Dx_TB', 'Dx_UTI', 'Dx_آبسه شکمی', 'Dx_آبسه پلویک', 'Dx_آبسه کبدی', 'Dx_آرتریت سپتیک', 'Dx_آنسفالیت', 'Dx_آنفلونزا', 'Dx_اسپوندیلیت', 'Dx_اندوکاردیت', 'Dx_اپیدیدمواورکایتیس', 'Dx_باکتریمی', 'Dx_بروسلوز', 'Dx_دیسانتری آمیبی', 'Dx_زونا', 'Dx_سلولیت', 'Dx_فارنژیت استرپتوکوکی', 'Dx_فاشئیت', 'Dx_فلبیت', 'Dx_مننژیت', 'Dx_مننگوانسفالیت', 'Dx_موکور', 'Dx_پاروتیدیت', 'Dx_پنومونی', 'Dx_پیلونفریت', 'Dx_کزاز', 'Dx_کله سیستیت', 'Dx_کوله سیستیت', 'Dx_کولیت عفونی', 'Dx_کووید', 'Dx_کیست هیداتید', 'Dx_گاستروانتریت']

Found 40 diagnosis columns in the dataset
Optimization terminated successfully.
         Current function value: 0.218223
         Iterations 53
         Current function value: 0.270103
         Iterations: 200
         Current function value: 0.217852
         Iterations: 200
Optimization terminated successfully.
         Current function value: 0.269609
         

In [20]:

import pandas as pd
import numpy as np

def analyze_medical_outcomes(file_path):
    try:
        # Read the Excel file with correct encoding
        df = pd.read_excel(file_path)
        
        # Create a dictionary to store the results
        results = []
        total_all_cases = 0
        total_complete_treatment = 0
        total_dama = 0
        total_deaths = 0
        
        # List of all conditions
        conditions = [col for col in df.columns if col.startswith('Dx_')]
        
        # Calculate prevalence for each condition
        for condition in conditions:
            condition_cases = df[df[condition] == 1]
            
            if len(condition_cases) > 0:
                outcome_counts = condition_cases['Outcome'].value_counts()
                
                total_cases = len(condition_cases)
                complete_treatment = outcome_counts.get(0, 0)
                dama = outcome_counts.get(1, 0)
                death = outcome_counts.get(2, 0)
                
                # Add to totals
                total_all_cases += total_cases
                total_complete_treatment += complete_treatment
                total_dama += dama
                total_deaths += death
                
                # Calculate percentages
                complete_treatment_pct = round((complete_treatment / total_cases * 100), 1)
                dama_pct = round((dama / total_cases * 100), 1)
                mortality_pct = round((death / total_cases * 100), 1)
                
                results.append({
                    'Disease': condition,
                    'Total Cases': total_cases,
                    'Complete Treatment': f"{complete_treatment} ({complete_treatment_pct}%)",
                    'DAMA': f"{dama} ({dama_pct}%)",
                    'Mortality': f"{death} ({mortality_pct}%)"
                })
        
        # Add total row
        total_complete_treatment_pct = round((total_complete_treatment / total_all_cases * 100), 1)
        total_dama_pct = round((total_dama / total_all_cases * 100), 1)
        total_mortality_pct = round((total_deaths / total_all_cases * 100), 1)
        
        results.append({
            'Disease': 'Total',
            'Total Cases': total_all_cases,
            'Complete Treatment': f"{total_complete_treatment} ({total_complete_treatment_pct}%)",
            'DAMA': f"{total_dama} ({total_dama_pct}%)",
            'Mortality': f"{total_deaths} ({total_mortality_pct}%)"
        })
        
        # Create a DataFrame from results
        results_df = pd.DataFrame(results)
        
        # Sort by total cases in descending order, keeping Total row at the bottom
        results_df = pd.concat([
            results_df[results_df['Disease'] != 'Total'].sort_values('Total Cases', ascending=False),
            results_df[results_df['Disease'] == 'Total']
        ]).reset_index(drop=True)
        
        # Save to CSV with correct encoding
        output_file = 'clinical_outcomes_analysis.csv'
        
        # Add metadata at the top of the file
        with open(output_file, 'w', encoding='utf-8-sig') as f:
            f.write("Clinical Outcomes Analysis of Infectious Diseases\n")
            f.write("Data Collection Period: [Insert Period]\n")
            f.write("Institution: [Insert Institution Name]\n")
            f.write("\nNote: DAMA = Discharge Against Medical Advice\n")
            f.write("All percentages are rounded to one decimal place\n\n")
            
            # Write the DataFrame
            results_df.to_csv(f, index=False, encoding='utf-8-sig')
        
        print(f"Analysis completed and saved to {output_file}")
        return results_df
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

# Run the analysis
file_path = '/home/aricept094/mydata/processed_medical_data.xlsx'
results = analyze_medical_outcomes(file_path)

# Display the results if successful
if results is not None:
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    print("\nClinical Outcomes Analysis:")
    print(results)


Analysis completed and saved to clinical_outcomes_analysis.csv

Clinical Outcomes Analysis:
                   Disease  Total Cases Complete Treatment       DAMA  \
0               Dx_پنومونی           97         80 (82.5%)   5 (5.2%)   
1                   Dx_UTI           28         26 (92.9%)   1 (3.6%)   
2                 Dx_کووید           27         25 (92.6%)   2 (7.4%)   
3                    Dx_TB           22         17 (77.3%)  4 (18.2%)   
4                Dx_سلولیت           15         13 (86.7%)   1 (6.7%)   
5             Dx_پیلونفریت           14         13 (92.9%)   0 (0.0%)   
6                Dx_Inf DF           10          7 (70.0%)  1 (10.0%)   
7          Dx_آرتریت سپتیک            9         9 (100.0%)   0 (0.0%)   
8          Dx_گاستروانتریت            8         8 (100.0%)   0 (0.0%)   
9                  Dx_زونا            8         8 (100.0%)   0 (0.0%)   
10               Dx_مننژیت            5         5 (100.0%)   0 (0.0%)   
11               Dx_Inf BS      

In [22]:
import pandas as pd
import numpy as np

def analyze_sepsis_impact(file_path):
    try:
        df = pd.read_excel(file_path)
        
        # Find all conditions that ever overlap with sepsis
        sepsis_cases = df[df['Sepsis'] == 1]
        condition_columns = [col for col in df.columns if col.startswith('Dx_')]
        overlapping_conditions = []
        
        for col in condition_columns:
            if any(sepsis_cases[col] == 1):
                overlapping_conditions.append(col)
        
        # Create results structure
        results = []
        
        # Analyze each overlapping condition
        for condition in overlapping_conditions:
            # Cases with this condition but WITHOUT sepsis
            without_sepsis = df[(df[condition] == 1) & (df['Sepsis'] == 0)]
            # Cases with both this condition AND sepsis
            with_sepsis = df[(df[condition] == 1) & (df['Sepsis'] == 1)]
            
            # Calculate outcomes for cases without sepsis
            total_without = len(without_sepsis)
            if total_without > 0:
                complete_without = len(without_sepsis[without_sepsis['Outcome'] == 0])
                dama_without = len(without_sepsis[without_sepsis['Outcome'] == 1])
                death_without = len(without_sepsis[without_sepsis['Outcome'] == 2])
                
                complete_without_pct = round((complete_without/total_without * 100), 1)
                dama_without_pct = round((dama_without/total_without * 100), 1)
                death_without_pct = round((death_without/total_without * 100), 1)
            
            # Calculate outcomes for cases with sepsis
            total_with = len(with_sepsis)
            if total_with > 0:
                complete_with = len(with_sepsis[with_sepsis['Outcome'] == 0])
                dama_with = len(with_sepsis[with_sepsis['Outcome'] == 1])
                death_with = len(with_sepsis[with_sepsis['Outcome'] == 2])
                
                complete_with_pct = round((complete_with/total_with * 100), 1)
                dama_with_pct = round((dama_with/total_with * 100), 1)
                death_with_pct = round((death_with/total_with * 100), 1)
            
            results.append({
                'Condition': condition,
                'Without Sepsis Total': total_without,
                'Without Sepsis Complete': f"{complete_without} ({complete_without_pct}%)" if total_without > 0 else "0 (0%)",
                'Without Sepsis DAMA': f"{dama_without} ({dama_without_pct}%)" if total_without > 0 else "0 (0%)",
                'Without Sepsis Death': f"{death_without} ({death_without_pct}%)" if total_without > 0 else "0 (0%)",
                'With Sepsis Total': total_with,
                'With Sepsis Complete': f"{complete_with} ({complete_with_pct}%)" if total_with > 0 else "0 (0%)",
                'With Sepsis DAMA': f"{dama_with} ({dama_with_pct}%)" if total_with > 0 else "0 (0%)",
                'With Sepsis Death': f"{death_with} ({death_with_pct}%)" if total_with > 0 else "0 (0%)"
            })
        
        # Create DataFrame
        results_df = pd.DataFrame(results)
        
        # Save to CSV
        output_file = 'sepsis_impact_analysis.csv'
        
        with open(output_file, 'w', encoding='utf-8-sig') as f:
            f.write("Analysis of Sepsis Impact on Disease Outcomes\n")
            f.write("Data Collection Period: [Insert Period]\n")
            f.write("Institution: [Insert Institution Name]\n\n")
            f.write("Note: DAMA = Discharge Against Medical Advice\n")
            f.write("All percentages are rounded to one decimal place\n\n")
            
            # Write the DataFrame
            results_df.to_csv(f, index=False, encoding='utf-8-sig')
        
        print(f"Analysis completed and saved to {output_file}")
        
        # Print summary
        print("\nSummary of findings:")
        for _, row in results_df.iterrows():
            print(f"\n{row['Condition']}:")
            print(f"Without Sepsis (n={row['Without Sepsis Total']}): {row['Without Sepsis Complete']} complete, "
                  f"{row['Without Sepsis DAMA']} DAMA, {row['Without Sepsis Death']} death")
            print(f"With Sepsis (n={row['With Sepsis Total']}): {row['With Sepsis Complete']} complete, "
                  f"{row['With Sepsis DAMA']} DAMA, {row['With Sepsis Death']} death")
        
        return results_df
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

# Run the analysis
file_path = '/home/aricept094/mydata/processed_medical_data.xlsx'
results = analyze_sepsis_impact(file_path)

Analysis completed and saved to sepsis_impact_analysis.csv

Summary of findings:

Dx_Inf BS:
Without Sepsis (n=2): 1 (50.0%) complete, 0 (0.0%) DAMA, 1 (50.0%) death
With Sepsis (n=2): 2 (100.0%) complete, 0 (0.0%) DAMA, 0 (0.0%) death

Dx_UTI:
Without Sepsis (n=27): 25 (92.6%) complete, 1 (3.7%) DAMA, 1 (3.7%) death
With Sepsis (n=1): 1 (100.0%) complete, 0 (0.0%) DAMA, 0 (0.0%) death

Dx_آبسه کبدی:
Without Sepsis (n=1): 1 (100.0%) complete, 0 (0.0%) DAMA, 0 (0.0%) death
With Sepsis (n=1): 1 (100.0%) complete, 0 (0.0%) DAMA, 0 (0.0%) death

Dx_باکتریمی:
Without Sepsis (n=3): 2 (66.7%) complete, 1 (33.3%) DAMA, 0 (0.0%) death
With Sepsis (n=1): 1 (100.0%) complete, 0 (0.0%) DAMA, 0 (0.0%) death

Dx_فلبیت:
Without Sepsis (n=0): 0 (0%) complete, 0 (0%) DAMA, 0 (0%) death
With Sepsis (n=1): 1 (100.0%) complete, 0 (0.0%) DAMA, 0 (0.0%) death

Dx_پنومونی:
Without Sepsis (n=82): 70 (85.4%) complete, 3 (3.7%) DAMA, 9 (11.0%) death
With Sepsis (n=15): 10 (66.7%) complete, 2 (13.3%) DAMA, 3 (20