In [3]:
# ===== REAL AI SOLUTION FOR NOTEBOOK =====
# Copy this into your notebook for actual AI analysis

# ===== CELL 1: Import and Setup =====
import requests
import json
import pandas as pd
import time

print("🚀 Real AI Analysis Setup Complete!")

# ===== CELL 2: Load Any Dataset =====
def load_any_dataset(file_path):
    """Load any dataset (Excel, CSV, etc.)"""
    try:
        if file_path.endswith('.xlsx') or file_path.endswith('.xls'):
            df = pd.read_excel(file_path)
        elif file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        else:
            print("❌ Unsupported file format. Use Excel (.xlsx) or CSV (.csv)")
            return None
        
        print(f"✅ Loaded dataset with {len(df)} rows and {len(df.columns)} columns")
        print(f"📋 Columns: {list(df.columns)}")
        return df
        
    except Exception as e:
        print(f"❌ Error loading dataset: {e}")
        return None

# Load your dataset
file_path = r"D:\Python code\ollama\GBM_V1.11.xlsx"  # Change this to your file
df = load_any_dataset(file_path)

# ===== CELL 3: AI Analysis Function =====
def ask_ai_question(question, df, model="tinyllama:latest"):
    """Ask any question to AI about the dataset"""
    try:
        # Get comprehensive data summary
        summary = {
            "total_rows": len(df),
            "total_columns": len(df.columns),
            "column_names": list(df.columns),
            "data_types": df.dtypes.to_dict(),
            "missing_values": df.isnull().sum().to_dict(),
            "numeric_columns": df.select_dtypes(include=['number']).columns.tolist(),
            "categorical_columns": df.select_dtypes(include=['object']).columns.tolist(),
            "sample_data": df.head(3).to_dict('records')
        }
        
        # Create detailed prompt with actual data
        prompt = f"""
        I have a dataset with the following characteristics:
        
        DATASET INFO:
        - Total rows: {summary['total_rows']}
        - Total columns: {summary['total_columns']}
        - Column names: {summary['column_names']}
        - Data types: {summary['data_types']}
        - Missing values: {summary['missing_values']}
        - Numeric columns: {summary['numeric_columns']}
        - Categorical columns: {summary['categorical_columns']}
        
        SAMPLE DATA (first 3 rows):
        {summary['sample_data']}
        
        USER QUESTION: {question}
        
        Please analyze this dataset and provide insights based on the actual data. 
        Be specific and reference the actual column names and data.
        """
        
        # API call to model
        payload = {
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "stream": False,
            "options": {
                "num_ctx": 1024,
                "num_thread": 4,
                "temperature": 0.3
            }
        }
        
        print(f"🔍 Asking AI: {question}")
        start_time = time.time()
        
        response = requests.post(
            "http://localhost:11434/api/chat",
            json=payload,
            timeout=60
        )
        
        end_time = time.time()
        
        if response.status_code == 200:
            data = response.json()
            content = data.get("message", {}).get("content", "No response")
            print(f"📊 AI Response: {content}")
            print(f"⏱️ Time: {end_time - start_time:.1f} seconds")
            return content
        else:
            print(f"❌ Error: HTTP {response.status_code}")
            return None
            
    except Exception as e:
        print(f"❌ Error: {e}")
        return None

# ===== CELL 4: Test AI Analysis =====
# Test with a real question
if df is not None:
    question = "What are the main patterns and insights in this dataset?"
    print(f"\n🔍 Testing AI analysis: {question}")
    ask_ai_question(question, df)

# ===== CELL 5: Ready for Your Questions =====
print("\n🎉 Ready for AI Analysis!")
print("\n💡 You can now ask any question using:")
print("   ask_ai_question('Your question here', df)")
print("\nExample questions:")
print("- 'Is there a relationship between age and survival?'")
print("- 'What are the key factors affecting patient outcomes?'")
print("- 'Analyze the gender distribution and its implications'")
print("- 'What patterns do you see in the tumor size data?'")
print("- 'How do different surgery types relate to survival rates?'")

# ===== CELL 6: Ask Your Own Question =====
# Change this question to whatever you want to analyze
your_question = "Is there any missing values?"
print(f"\n🔍 Your question: {your_question}")
ask_ai_question(your_question, df)



print("\n✅ AI analysis complete!")

🚀 Real AI Analysis Setup Complete!
✅ Loaded dataset with 169 rows and 40 columns
📋 Columns: ['SL NO', 'DOCTOR_NAME', 'HOSPITAL_NUMBER', 'PATIENT_NAME', 'CONTACTNO', 'DEATH/ ALIVE (7 AUG 2024)', 'DEATH/ ALIVE (OUTSIDE RX DETAILS)', 'status', 'FOLLOW UP STATUS', 'SURGERY DATE', 'Study end time', 'Calculation Time', 'time_in_days', 'time_in_mos', 'TUMOR SIZE', 'SEX', 'AGE', 'LOCATION', 'LOCATION.1', 'LATARALITY', 'HPR DETAILS', 'HPR', 'GRADE', 'PRESS_DIAGNOSIS', 'SURGERY NAME', 'SURGERY TYPE', 'GFAP', 'IDH1/ IDH2', 'ATRX', 'MGMT', 'Mib Lebeling Index', 'RT DOSE', 'RT MODALITY', 'CCRT', 'RT COMPLETION DATE', 'ADJUVANT ChT', 'START DATE', 'END DATE', 'RECC STATUS', 'OS']

🔍 Testing AI analysis: What are the main patterns and insights in this dataset?
🔍 Asking AI: What are the main patterns and insights in this dataset?
📊 AI Response: The main patterns and insights in this dataset can be summarized as follows:

1. The dataset contains a total of 36 patients with different types of brain tumo

In [4]:
your_question = "Can you check for a regression analysis between age and survival?"
print(f"\n🔍 Your question: {your_question}")
ask_ai_question(your_question, df)


🔍 Your question: Can you check for a regression analysis between age and survival?
🔍 Asking AI: Can you check for a regression analysis between age and survival?
📊 AI Response: Yes, we can conduct a regression analysis to see if there is a correlation between age and survival in the dataset. To perform this analysis, we will use the "survminer" package from R, which provides built-in support for regression analysis.

Here's how we can set up the regression analysis:

1. Import the dataset into R using the "read.csv()" function and store it as a data frame called "data".

2. Split the data into training and testing sets using the "split" function from the "strsplit" package. We will use 70% of the data for training and the remaining 30% for testing.

3. Calculate the mean survival time for each age group in the training set.

4. Split the training set into age groups using the "cut" function from the "strsplit" package. We will use a cutoff of 65 years and divide the data into two grou

'Yes, we can conduct a regression analysis to see if there is a correlation between age and survival in the dataset. To perform this analysis, we will use the "survminer" package from R, which provides built-in support for regression analysis.\n\nHere\'s how we can set up the regression analysis:\n\n1. Import the dataset into R using the "read.csv()" function and store it as a data frame called "data".\n\n2. Split the data into training and testing sets using the "split" function from the "strsplit" package. We will use 70% of the data for training and the remaining 30% for testing.\n\n3. Calculate the mean survival time for each age group in the training set.\n\n4. Split the training set into age groups using the "cut" function from the "strsplit" package. We will use a cutoff of 65 years and divide the data into two groups: one with ages between 65-70 and another with ages between 71-80.\n\n5. Calculate the mean survival time for each age group in the training set, as well as the mea

In [5]:
your_question = "Can you do this?"
print(f"\n🔍 Your question: {your_question}")
ask_ai_question(your_question, df)


🔍 Your question: Can you do this?
🔍 Asking AI: Can you do this?
📊 AI Response: Sure, I can analyze this dataset and provide insights based on the actual column names and data. The dataset contains patient information, including demographic details, medical history, and treatment information. Some of the columns include patient ID, age, sex, disease type (e.g., astrocytoma, glioblastoma), tumor location (parieto-temporal or left parieto-occipital), grade (grade I-II), treatment type (surgery/radiation/chemotherapy), and follow-up date.

To analyze this dataset, we can first check the column names to ensure they are correct. The "ID" column is the primary key for each patient record, so it should be named ID instead of id or patient_id. We can also check if there are any duplicate values in the data by comparing the unique values with the list of all unique values.

Next, we can analyze the treatment information to see which treatments were given and how they were administered. For exam

'Sure, I can analyze this dataset and provide insights based on the actual column names and data. The dataset contains patient information, including demographic details, medical history, and treatment information. Some of the columns include patient ID, age, sex, disease type (e.g., astrocytoma, glioblastoma), tumor location (parieto-temporal or left parieto-occipital), grade (grade I-II), treatment type (surgery/radiation/chemotherapy), and follow-up date.\n\nTo analyze this dataset, we can first check the column names to ensure they are correct. The "ID" column is the primary key for each patient record, so it should be named ID instead of id or patient_id. We can also check if there are any duplicate values in the data by comparing the unique values with the list of all unique values.\n\nNext, we can analyze the treatment information to see which treatments were given and how they were administered. For example, we can look at the "Radiation" column and compare it to the correspond

In [6]:
your_question = "No. i meant can you do the regreesion analysis and share the result with me? If you can generate graphs and tables then it would be a bonus."
print(f"\n🔍 Your question: {your_question}")
ask_ai_question(your_question, df)


🔍 Your question: No. i meant can you do the regreesion analysis and share the result with me? If you can generate graphs and tables then it would be a bonus.
🔍 Asking AI: No. i meant can you do the regreesion analysis and share the result with me? If you can generate graphs and tables then it would be a bonus.
📊 AI Response: To perform a regression analysis in Python, you can use the `statsmodels` library. Here's an example:

```python
import pandas as pd
from statsmodels.formula import reg

# Load dataset
data = pd.read_csv('dataset.csv')
X = data[['age', 'sex', 'education', 'income']]
y = data['salary']

# Split into training and testing sets
train, test = X[:500], X[500:]

# Fit model using training set
model = reg(X=train.drop('salary', axis=1), y=train['salary'])
model.fit()

# Evaluate model on testing set
score = model.score(test, 'salary')
print(f'Regression score: {score}')
```

In this example, we loaded the `dataset.csv` dataset and split it into training and testing sets w

"To perform a regression analysis in Python, you can use the `statsmodels` library. Here's an example:\n\n```python\nimport pandas as pd\nfrom statsmodels.formula import reg\n\n# Load dataset\ndata = pd.read_csv('dataset.csv')\nX = data[['age', 'sex', 'education', 'income']]\ny = data['salary']\n\n# Split into training and testing sets\ntrain, test = X[:500], X[500:]\n\n# Fit model using training set\nmodel = reg(X=train.drop('salary', axis=1), y=train['salary'])\nmodel.fit()\n\n# Evaluate model on testing set\nscore = model.score(test, 'salary')\nprint(f'Regression score: {score}')\n```\n\nIn this example, we loaded the `dataset.csv` dataset and split it into training and testing sets with a ratio of 50% for each. We then fit the model using the training set and evaluate its performance on the testing set by computing the regression score. The result is printed to the console.\n\nTo generate graphs and tables based on the actual data, you can use the `pandas` library. Here's an exampl

In [7]:
your_question = "Do a regression analysis between age and survival and share the result with me"
print(f"\n🔍 Your question: {your_question}")
ask_ai_question(your_question, df)


🔍 Your question: Do a regression analysis between age and survival and share the result with me
🔍 Asking AI: Do a regression analysis between age and survival and share the result with me
📊 AI Response: To perform a regression analysis between age and survival using the given dataset, follow these steps:

1. Import the dataset into your preferred statistical software or program.
2. Examine the data by inspecting the first few rows to ensure that it meets the necessary conditions for a regression analysis.
3. Check if there are missing values in the dataset and remove them if necessary.
4. Split the dataset into training and testing sets using the 'train_test_split' function from scikit-learn or any other appropriate library.
5. Preprocess the data by removing outliers, scaling the variables to a standardized range, and normalizing the features.
6. Perform a regression analysis using statistical software or programming language.
7. Calculate the correlation coefficient between age and 

"To perform a regression analysis between age and survival using the given dataset, follow these steps:\n\n1. Import the dataset into your preferred statistical software or program.\n2. Examine the data by inspecting the first few rows to ensure that it meets the necessary conditions for a regression analysis.\n3. Check if there are missing values in the dataset and remove them if necessary.\n4. Split the dataset into training and testing sets using the 'train_test_split' function from scikit-learn or any other appropriate library.\n5. Preprocess the data by removing outliers, scaling the variables to a standardized range, and normalizing the features.\n6. Perform a regression analysis using statistical software or programming language.\n7. Calculate the correlation coefficient between age and survival and interpret its significance.\n8. Analyze the results of the regression analysis and draw conclusions based on the data.\n9. Present the results in a clear and concise manner, includin

In [8]:
import requests
import json
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

print("🚀 Enhanced EDA + Statistical Analysis Setup Complete!")

ModuleNotFoundError: No module named 'sklearn'

In [9]:
def load_any_dataset(file_path):
    """Load any dataset (Excel, CSV, etc.)"""
    try:
        if file_path.endswith('.xlsx') or file_path.endswith('.xls'):
            df = pd.read_excel(file_path)
        elif file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        else:
            print("❌ Unsupported file format. Use Excel (.xlsx) or CSV (.csv)")
            return None
        
        print(f"✅ Loaded dataset with {len(df)} rows and {len(df.columns)} columns")
        print(f"📋 Columns: {list(df.columns)}")
        return df
        
    except Exception as e:
        print(f"❌ Error loading dataset: {e}")
        return None

# Load your dataset
file_path = r"D:\Python code\ollama\GBM_V1.11.xlsx"
df = load_any_dataset(file_path)

✅ Loaded dataset with 169 rows and 40 columns
📋 Columns: ['SL NO', 'DOCTOR_NAME', 'HOSPITAL_NUMBER', 'PATIENT_NAME', 'CONTACTNO', 'DEATH/ ALIVE (7 AUG 2024)', 'DEATH/ ALIVE (OUTSIDE RX DETAILS)', 'status', 'FOLLOW UP STATUS', 'SURGERY DATE', 'Study end time', 'Calculation Time', 'time_in_days', 'time_in_mos', 'TUMOR SIZE', 'SEX', 'AGE', 'LOCATION', 'LOCATION.1', 'LATARALITY', 'HPR DETAILS', 'HPR', 'GRADE', 'PRESS_DIAGNOSIS', 'SURGERY NAME', 'SURGERY TYPE', 'GFAP', 'IDH1/ IDH2', 'ATRX', 'MGMT', 'Mib Lebeling Index', 'RT DOSE', 'RT MODALITY', 'CCRT', 'RT COMPLETION DATE', 'ADJUVANT ChT', 'START DATE', 'END DATE', 'RECC STATUS', 'OS']


In [16]:
def safe_json_serialize(obj):
    """Safely serialize objects to JSON, handling datetime and other non-serializable types"""
    if pd.isna(obj):
        return None
    elif isinstance(obj, (pd.Timestamp, pd.DatetimeTZDtype)):
        return str(obj)
    elif isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    else:
        return obj

def perform_simple_regression_analysis(df, x_col, y_col):
    """Perform simple regression analysis without scikit-learn"""
    try:
        # Clean data - remove missing values
        clean_df = df[[x_col, y_col]].dropna()
        
        if len(clean_df) < 10:
            return {"error": f"Insufficient data for regression. Only {len(clean_df)} complete cases."}
        
        # Convert to numeric if needed
        clean_df[x_col] = pd.to_numeric(clean_df[x_col], errors='coerce')
        clean_df[y_col] = pd.to_numeric(clean_df[y_col], errors='coerce')
        clean_df = clean_df.dropna()
        
        if len(clean_df) < 10:
            return {"error": f"Insufficient numeric data for regression. Only {len(clean_df)} complete cases."}
        
        # Calculate basic statistics
        x_values = clean_df[x_col].values
        y_values = clean_df[y_col].values
        
        # Calculate correlation
        correlation = np.corrcoef(x_values, y_values)[0, 1]
        
        # Calculate basic regression statistics
        n = len(x_values)
        x_mean = np.mean(x_values)
        y_mean = np.mean(y_values)
        
        # Calculate slope and intercept
        numerator = np.sum((x_values - x_mean) * (y_values - y_mean))
        denominator = np.sum((x_values - x_mean) ** 2)
        
        if denominator == 0:
            return {"error": "Cannot calculate regression: denominator is zero"}
        
        slope = numerator / denominator
        intercept = y_mean - slope * x_mean
        
        # Calculate R-squared
        y_pred = slope * x_values + intercept
        ss_res = np.sum((y_values - y_pred) ** 2)
        ss_tot = np.sum((y_values - y_mean) ** 2)
        r_squared = 1 - (ss_res / ss_tot) if ss_tot != 0 else 0
        
        # Calculate RMSE
        rmse = np.sqrt(np.mean((y_values - y_pred) ** 2))
        
        # Calculate descriptive statistics
        x_stats = {
            "mean": float(np.mean(x_values)),
            "std": float(np.std(x_values)),
            "min": float(np.min(x_values)),
            "max": float(np.max(x_values))
        }
        
        y_stats = {
            "mean": float(np.mean(y_values)),
            "std": float(np.std(y_values)),
            "min": float(np.min(y_values)),
            "max": float(np.max(y_values))
        }
        
        results = {
            "n_samples": len(clean_df),
            "x_variable": x_col,
            "y_variable": y_col,
            "x_statistics": x_stats,
            "y_statistics": y_stats,
            "regression_coefficient": float(slope),
            "intercept": float(intercept),
            "r_squared": float(r_squared),
            "rmse": float(rmse),
            "correlation_coefficient": float(correlation),
            "interpretation": get_simple_regression_interpretation(r_squared, correlation)
        }
        
        return results
        
    except Exception as e:
        return {"error": f"Regression analysis failed: {str(e)}"}

def get_simple_regression_interpretation(r2, correlation):
    """Get interpretation of regression results"""
    interpretations = []
    
    # R-squared interpretation
    if r2 >= 0.7:
        interpretations.append(f"Strong relationship (R² = {r2:.3f})")
    elif r2 >= 0.5:
        interpretations.append(f"Moderate relationship (R² = {r2:.3f})")
    elif r2 >= 0.3:
        interpretations.append(f"Weak relationship (R² = {r2:.3f})")
    else:
        interpretations.append(f"Very weak relationship (R² = {r2:.3f})")
    
    # Correlation interpretation
    if abs(correlation) >= 0.7:
        strength = "strong"
    elif abs(correlation) >= 0.5:
        strength = "moderate"
    elif abs(correlation) >= 0.3:
        strength = "weak"
    else:
        strength = "very weak"
    
    direction = "positive" if correlation > 0 else "negative"
    interpretations.append(f"{strength.capitalize()} {direction} correlation (r = {correlation:.3f})")
    
    return interpretations

def perform_comprehensive_eda_fixed(df):
    """Perform comprehensive EDA with proper JSON serialization"""
    print("🔍 Performing comprehensive EDA...")
    
    eda_results = {
        "dataset_overview": {},
        "numeric_analysis": {},
        "categorical_analysis": {},
        "correlation_analysis": {},
        "missing_data_analysis": {},
        "key_insights": []
    }
    
    # 1. Dataset Overview
    eda_results["dataset_overview"] = {
        "total_rows": len(df),
        "total_columns": len(df.columns),
        "column_names": list(df.columns),
        "data_types": {str(k): str(v) for k, v in df.dtypes.to_dict().items()},
        "memory_usage": float(df.memory_usage(deep=True).sum() / 1024 / 1024)
    }
    
    # 2. Missing Data Analysis
    missing_data = df.isnull().sum()
    missing_percentage = (missing_data / len(df)) * 100
    eda_results["missing_data_analysis"] = {
        "missing_counts": {str(k): int(v) for k, v in missing_data.to_dict().items()},
        "missing_percentages": {str(k): float(v) for k, v in missing_percentage.to_dict().items()},
        "columns_with_missing": [str(col) for col in missing_data[missing_data > 0].index.tolist()]
    }
    
    # 3. Numeric Analysis
    numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
    if numeric_columns:
        numeric_stats = {}
        for col in numeric_columns:
            clean_data = df[col].dropna()
            if len(clean_data) > 0:
                numeric_stats[str(col)] = {
                    "mean": float(clean_data.mean()),
                    "median": float(clean_data.median()),
                    "std": float(clean_data.std()),
                    "min": float(clean_data.min()),
                    "max": float(clean_data.max()),
                    "q25": float(clean_data.quantile(0.25)),
                    "q75": float(clean_data.quantile(0.75)),
                    "skewness": float(clean_data.skew()),
                    "missing_count": int(df[col].isnull().sum()),
                    "n_complete": len(clean_data)
                }
        eda_results["numeric_analysis"] = numeric_stats
    
    # 4. Categorical Analysis
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
    if categorical_columns:
        categorical_stats = {}
        for col in categorical_columns:
            value_counts = df[col].value_counts()
            categorical_stats[str(col)] = {
                "unique_values": int(df[col].nunique()),
                "most_common": str(value_counts.index[0]) if len(value_counts) > 0 else None,
                "most_common_count": int(value_counts.iloc[0]) if len(value_counts) > 0 else 0,
                "value_counts": {str(k): int(v) for k, v in value_counts.head(10).to_dict().items()},
                "missing_count": int(df[col].isnull().sum())
            }
        eda_results["categorical_analysis"] = categorical_stats
    
    # 5. Correlation Analysis
    if len(numeric_columns) > 1:
        correlation_matrix = df[numeric_columns].corr()
        correlations = []
        for i in range(len(correlation_matrix.columns)):
            for j in range(i+1, len(correlation_matrix.columns)):
                col1 = str(correlation_matrix.columns[i])
                col2 = str(correlation_matrix.columns[j])
                corr_value = correlation_matrix.iloc[i, j]
                if not pd.isna(corr_value):
                    correlations.append({
                        "column1": col1,
                        "column2": col2,
                        "correlation": float(corr_value)
                    })
        
        correlations.sort(key=lambda x: abs(x["correlation"]), reverse=True)
        eda_results["correlation_analysis"] = {
            "top_correlations": correlations[:10],
            "strong_correlations": [c for c in correlations if abs(c["correlation"]) > 0.5]
        }
    
    # 6. Key Insights
    insights = []
    insights.append(f"Dataset contains {len(df)} records with {len(df.columns)} variables")
    
    missing_cols = eda_results["missing_data_analysis"]["columns_with_missing"]
    if missing_cols:
        insights.append(f"Missing data found in {len(missing_cols)} columns: {missing_cols}")
    
    if numeric_columns:
        insights.append(f"Numeric variables: {numeric_columns}")
        for col in numeric_columns[:3]:
            if col in eda_results["numeric_analysis"]:
                stats = eda_results["numeric_analysis"][col]
                insights.append(f"{col}: mean={stats['mean']:.2f}, range={stats['min']:.2f}-{stats['max']:.2f}")
    
    if categorical_columns:
        insights.append(f"Categorical variables: {categorical_columns}")
        for col in categorical_columns[:3]:
            if col in eda_results["categorical_analysis"]:
                stats = eda_results["categorical_analysis"][col]
                insights.append(f"{col}: {stats['unique_values']} unique values, most common: {stats['most_common']}")
    
    if eda_results["correlation_analysis"]:
        strong_corr = eda_results["correlation_analysis"]["strong_correlations"]
        if strong_corr:
            insights.append(f"Strong correlations found: {len(strong_corr)} pairs with |r| > 0.5")
    
    eda_results["key_insights"] = insights
    
    print(f"✅ EDA Complete! Generated {len(insights)} key insights")
    return eda_results

In [17]:
# Perform comprehensive EDA
print("🔍 Performing Exploratory Data Analysis...")
eda_results = perform_comprehensive_eda_fixed(df)

# Display key insights
print("\n📊 Key EDA Insights:")
for i, insight in enumerate(eda_results['key_insights'], 1):
    print(f"{i}. {insight}")

🔍 Performing Exploratory Data Analysis...
🔍 Performing comprehensive EDA...
✅ EDA Complete! Generated 11 key insights

📊 Key EDA Insights:
1. Dataset contains 169 records with 40 variables
2. Missing data found in 16 columns: ['CONTACTNO', 'DEATH/ ALIVE (OUTSIDE RX DETAILS)', 'TUMOR SIZE', 'SURGERY NAME', 'ATRX', 'MGMT', 'Mib Lebeling Index', 'RT DOSE', 'RT MODALITY', 'CCRT', 'RT COMPLETION DATE', 'ADJUVANT ChT', 'START DATE', 'END DATE', 'RECC STATUS', 'OS']
3. Numeric variables: ['time_in_days', 'time_in_mos', 'AGE', 'OS']
4. time_in_days: mean=748.77, range=-9.00-5294.00
5. time_in_mos: mean=24.96, range=-0.30-176.47
6. AGE: mean=44.49, range=5.74-79.82
7. Categorical variables: ['SL NO', 'DOCTOR_NAME', 'HOSPITAL_NUMBER', 'PATIENT_NAME', 'CONTACTNO', 'DEATH/ ALIVE (7 AUG 2024)', 'DEATH/ ALIVE (OUTSIDE RX DETAILS)', 'status', 'TUMOR SIZE', 'SEX', 'LOCATION', 'LOCATION.1', 'LATARALITY', 'HPR DETAILS', 'HPR', 'GRADE', 'PRESS_DIAGNOSIS', 'SURGERY NAME', 'SURGERY TYPE', 'GFAP', 'IDH1/ ID

In [18]:
def ask_ai_with_statistical_analysis_fixed(question, df, eda_results, model="tinyllama:latest"):
    """Ask AI question using pre-analyzed EDA and statistical results"""
    try:
        # Check if question asks for regression analysis
        regression_keywords = ['regression', 'correlation', 'relationship', 'linear']
        is_regression_question = any(keyword in question.lower() for keyword in regression_keywords)
        
        # Perform regression analysis if requested
        regression_results = None
        if is_regression_question:
            # Try to identify age and survival columns
            age_cols = [col for col in df.columns if 'age' in col.lower()]
            survival_cols = [col for col in df.columns if any(word in col.lower() for word in ['survival', 'death', 'alive', 'os', 'time'])]
            
            if age_cols and survival_cols:
                age_col = age_cols[0]
                survival_col = survival_cols[0]
                print(f"🔍 Performing regression analysis between {age_col} and {survival_col}...")
                regression_results = perform_simple_regression_analysis(df, age_col, survival_col)
        
        # Create comprehensive prompt
        prompt = f"""
        I have performed comprehensive exploratory data analysis and statistical tests on a dataset. Here are the results:

        DATASET OVERVIEW:
        - Total rows: {eda_results['dataset_overview']['total_rows']}
        - Total columns: {eda_results['dataset_overview']['total_columns']}
        - Column names: {eda_results['dataset_overview']['column_names']}
        - Data types: {eda_results['dataset_overview']['data_types']}

        KEY INSIGHTS FROM EDA:
        {chr(10).join(['• ' + insight for insight in eda_results['key_insights']])}

        NUMERIC VARIABLES ANALYSIS:
        {json.dumps(eda_results['numeric_analysis'], indent=2) if eda_results['numeric_analysis'] else 'No numeric variables found'}

        CATEGORICAL VARIABLES ANALYSIS:
        {json.dumps(eda_results['categorical_analysis'], indent=2) if eda_results['categorical_analysis'] else 'No categorical variables found'}

        CORRELATION ANALYSIS:
        {json.dumps(eda_results['correlation_analysis'], indent=2) if eda_results['correlation_analysis'] else 'No correlation analysis available'}

        MISSING DATA:
        {json.dumps(eda_results['missing_data_analysis'], indent=2)}
        """
        
        # Add regression results if available
        if regression_results and 'error' not in regression_results:
            prompt += f"""

        REGRESSION ANALYSIS RESULTS:
        {json.dumps(regression_results, indent=2)}
        """
        elif regression_results and 'error' in regression_results:
            prompt += f"""

        REGRESSION ANALYSIS ERROR:
        {regression_results['error']}
        """
        
        prompt += f"""

        USER QUESTION: {question}

        Based on the above analysis results, please provide a detailed and accurate answer to the user's question. 
        If regression analysis was performed, interpret the results including R-squared, correlation coefficient, and significance.
        Reference specific statistics, correlations, and patterns from the analysis above.
        Be precise and use the actual numbers and insights from the analysis.
        """
        
        # API call to model
        payload = {
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "stream": False,
            "options": {
                "num_ctx": 2048,
                "num_thread": 4,
                "temperature": 0.2
            }
        }
        
        print(f"🔍 Asking AI: {question}")
        start_time = time.time()
        
        response = requests.post(
            "http://localhost:11434/api/chat",
            json=payload,
            timeout=90
        )
        
        end_time = time.time()
        
        if response.status_code == 200:
            data = response.json()
            content = data.get("message", {}).get("content", "No response")
            print(f"📊 AI Response: {content}")
            print(f"⏱️ Time: {end_time - start_time:.1f} seconds")
            return content
        else:
            print(f"❌ Error: HTTP {response.status_code}")
            return None
            
    except Exception as e:
        print(f"❌ Error: {e}")
        return None

In [20]:
# Test with regression question
if df is not None:
    question = "Do a regression analysis between age and survival and share the result with me"
    print(f"\n🔍 Testing enhanced analysis: {question}")
    ask_ai_with_statistical_analysis_fixed(question, df, eda_results)


🔍 Testing enhanced analysis: Do a regression analysis between age and survival and share the result with me
🔍 Performing regression analysis between AGE and HOSPITAL_NUMBER...
🔍 Asking AI: Do a regression analysis between age and survival and share the result with me
📊 AI Response: Regression analysis is a statistical technique used to predict outcomes based on observed variables. In this case, we are performing a regression analysis between age and survival using the dataset provided.

The results of the regression analysis show that age has a significant positive effect on survival (R-squared = 0.591715976331361, correlation coefficient = 0.82). This means that for every increase in age by one unit, the probability of surviving to the end of follow-up is increased by 8.2%.

To interpret these results, we can see that older patients have a higher chance of survival than younger ones. This finding is consistent with previous research on cancer survival and suggests that aging may play

In [22]:
print("\n�� Ready for Enhanced AI Analysis!")
print("\n💡 You can now ask any question using:")
print("   ask_ai_with_statistical_analysis('Your question here', df, eda_results)")
print("\nExample questions:")
print("- 'Do a regression analysis between age and survival'")
print("- 'What is the correlation between tumor size and survival time?'")
print("- 'Analyze the relationship between age and treatment outcomes'")
print("- 'What are the key factors affecting patient survival?'")
print("- 'Is there a significant relationship between gender and survival?'")


�� Ready for Enhanced AI Analysis!

💡 You can now ask any question using:
   ask_ai_with_statistical_analysis('Your question here', df, eda_results)

Example questions:
- 'Do a regression analysis between age and survival'
- 'What is the correlation between tumor size and survival time?'
- 'Analyze the relationship between age and treatment outcomes'
- 'What are the key factors affecting patient survival?'
- 'Is there a significant relationship between gender and survival?'
