In [1]:
import pandas as pd
from typing import Dict, List, Tuple
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, SequentialChain
from OprFuncs import *
#from langchain.schema.runnable import RunnableSequence
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
#from langchain.agents import AgentExecutor, Tool, create_react_agent
#from langchain import hub
import re
#from modelEXT.PygalCodeComponents import PygalCodeComponents
#from langchain.output_parsers import PydanticOutputParser
from DatabaseManager import DatabaseManager
from langchain_experimental.agents import create_pandas_dataframe_agent

class DataAnalyzer:
    def __init__(self,dataframe,llm,user_id=None):
        self.dataframe = dataframe
        self.llm = llm
        self.data_info = data_infer(dataframe)
        self.data_description = data_describer(dataframe)
        self.data_sample = dataframe.head().to_string()
        self.data_cols = ", ".join(dataframe.columns)
        self.db = DatabaseManager()
        self.report_id = None
        self.memory = []
        
        if user_id:
            self.user_id = user_id
            self.user_context = self.db.get_user_context(user_id)
            if self.user_context:
                self.memory.append(HumanMessage(content=f"User Context: {self.user_context}"))
        else:
            self.user_context = None

    def analysis_data(self):
        data_info = self.data_info
        data_sample = self.data_sample
        data_description = self.data_description

        analysis_template = '''
        You are a data analyst. You are provided with:
        1. Dataset metadata: {data_info}
        2. Dataset sample: {data_sample}
        3. Dataset summary: {data_description}
        4. User_context: {user_context}

        You are a highly skilled professional data analyst specialized in business data analysis.

        Given the following dataset analysis, your tasks are:
        1. Provide a **deep, comprehensive analysis** of the data.
        2. **Explain key findings**, trends, patterns, and anomalies in a meaningful way.
        3. **Interpret** what the numbers and statistics mean for the business context (not just describe them).
        4. **Identify**:
        - Critical KPIs (Key Performance Indicators).
        - Potential risks and problems suggested by the data.
        - Opportunities for growth, improvement, or efficiency.
        5. Highlight **hidden insights** that may not be immediately obvious.
        6. Make sure your analysis tells a **clear, logical story** about the business situation.

        Instructions:
        - Be detailed but concise.
        - Avoid listing plain statistics — always explain their implications.
        - Connect different findings where relevant to create a full picture.
        - Think like a business consultant, not just a data scientist.
        '''
        analysis_prompt = PromptTemplate(
            input_variables=["data_info", "data_sample", "data_description", "user_context"],
            template=analysis_template
        )
        
        analysis_chain = analysis_prompt | self.llm

        self.analysis = analysis_chain.invoke({
            "data_info": data_info,
            "data_sample": data_sample,
            "data_description": data_description,
            "user_context":self.user_context or "No prior context available"
        })

        formatted_analysis_prompt = analysis_template.format(data_info=data_info,data_sample=data_sample,
                                                            data_description=data_description,
                                                            user_context=self.user_context)
        self.memory.append(HumanMessage(content=formatted_analysis_prompt))
        self.memory.append(AIMessage(content=self.analysis))
        self.db.saveMemory(reportID=self.report_id,
                        llm=self.db.llm_id_by_name(self.llm.model),
                        prompet=formatted_analysis_prompt,
                        response=self.analysis,
                        chat=False)
        self.generate_user_context()
        return self.analysis
    
    def questions_gen(self, num):
        data_info = self.data_info
        data_sample = self.data_sample
        data_description = self.data_description

        question_prompt = '''
        You are a professional data analyst. Based on the following information about the dataset:
        1. Dataset Overview: {data_info}
        2. Dataset Sample: {data_sample}
        3. Data Summary: {data_description}
        4. Business Context: {user_context}

        Your task is to generate strategic investigative questions based on:
        - Your deep understanding of the data and its type.
        - Your interpretation of what the data means in the context of the provided business context.
        - Asking questions that may reveal insights, gaps, or opportunities that could be exploited.
        - Additionally, consider the following:
            - How could the current trends in the data impact future business decisions or strategies?
            - What potential future outcomes or projections can be made from this dataset based on existing patterns?
            - Are there any trends in the data that suggest upcoming risks or opportunities?
            - Can you identify any correlations or causal relationships that may impact future developments in the business or industry?

        Please formulate questions related to the following aspects:
        - Patterns or trends observed in the data.
        - Any relationships between columns or between the data.
        - Potential opportunities for improving business decisions or strategies based on the data.
        - Any problems or risks that might arise based on the data analysis.
        '''
        question_template = PromptTemplate(
            input_variables=["data_info", "data_sample", "data_description", "user_context"],
            template=question_prompt
        )

        # Create the LLM chain
        question_chain = LLMChain(llm=self.llm, prompt=question_template)

        try:
            # Run the chain to generate questions
            generated_questions = question_chain.run(data_info=data_info, 
                                                    data_sample=data_sample, 
                                                    data_description=data_description,
                                                    user_context="No prior context available")

            # Ensure the response is properly encoded and strip unnecessary spaces
            if isinstance(generated_questions, str):
                generated_questions = generated_questions.encode('utf-8', 'replace').decode('utf-8')

            print("Raw LLM Output:", repr(generated_questions))

            if not generated_questions.strip():
                print("Warning: LLM did not generate any questions.")
                return []

            # Extract questions using the helper function
            questions_list = self.extract_questions(generated_questions)

            print("Extracted Questions List:", questions_list)

            # Organize questions into a more readable format with numbers
            organized_questions = []
            for idx, question in enumerate(questions_list[:num]):
                organized_questions.append(f"{idx + 1}. {question.strip()}")

            print("Organized Questions:", organized_questions)

            # Save the generated questions to memory
            formatted_question_prompt = question_template.format(
                data_info=data_info,
                data_sample=data_sample,
                data_description=data_description,
                user_context="No prior context available"
            )
            self.memory.append(HumanMessage(content=formatted_question_prompt))
            self.memory.append(AIMessage(content="\n".join(organized_questions)))
            self.db.saveMemory(reportID=self.report_id,
                            llm=self.db.llm_id_by_name(self.llm.model),
                            prompet=formatted_question_prompt,
                            response="\n".join(organized_questions),
                            chat=False)

            return organized_questions

        except Exception as e:
            print(f"Error generating questions: {str(e)}")
            return []
        
    
    def generate_recommendations(self, num_recommendations: int = 5):
        data_info = self.data_info
        data_sample = self.data_sample
        data_description = self.data_description
        analysis = self.analysis  # التحليل الذي تم عمله سابقاً

        recommendation_prompt = '''
        You are a world-class business consultant and data analyst.

        You have analyzed the following:
        - Dataset metadata: {data_info}
        - Dataset sample: {data_sample}
        - Dataset summary: {data_description}
        - Detailed business analysis: {analysis}
        - User context: {user_context}

        Based on your deep understanding of the data and analysis:
        Your task is to generate {num_recommendations} highly actionable, strategic recommendations for the business.

        Your recommendations must:
        - Be directly based on the analysis and insights.
        - Address clear business actions (e.g., optimize processes, launch new products, reduce risks, target specific segments, etc.)
        - Be specific, impactful, and feasible.
        - Cover both short-term quick wins and long-term strategic moves.
        - Include estimated expected outcome in percentage (%) where appropriate.
        - Include any potential risks or challenges for each recommendation.
        - Reference relevant metrics or insights from the analysis if possible.
        - Use professional, executive-level language.
        - Add an appropriate emoji based on risk level:
            - ✅ for Low risk
            - ⚠️ for Medium risk
            - ❗for High risk

        Output Format:

        ### 📋 Recommendations Table

        | # | Recommendation Title | Expected Impact (%) | Potential Risk (with Emoji) |
        |---|-----------------------|---------------------|-----------------------------|
        | 1 | [Title] | [Estimated Impact %] | [Emoji] [Main risk] |
        | 2 | [Title] | [Estimated Impact %] | [Emoji] [Main risk] |
        | ... | ... | ... | ... |

        ---

        ### 📋 Full Recommendation Details

        1. **[Recommendation Title]** [Emoji]
        - **Details:** Explain clearly what should be done and why.
        - **Expected Impact:** [e.g., Increase attendance by 10%]
        - **Metrics Reference:** [Reference specific metric if available, e.g., matches with <50% attendance]
        - **Potential Risks:** [Possible challenges or risks involved]
        - **Timeline:** [Short-term or Long-term]

        Repeat similarly for each recommendation.
        '''

        
        rec_template = PromptTemplate(
            input_variables=["data_info", "data_sample", "data_description", "analysis", "user_context", "num_recommendations"],
            template=recommendation_prompt
        )

        rec_chain = LLMChain(llm=self.llm, prompt=rec_template)

        rec_response = rec_chain.run(
            data_info=data_info,
            data_sample=data_sample,
            data_description=data_description,
            analysis=analysis,
            user_context=self.user_context or "No prior context available",
            num_recommendations=num_recommendations
        )

        # تسجيل في الذاكرة
        formatted_rec_prompt = recommendation_prompt.format(
            data_info=data_info,
            data_sample=data_sample,
            data_description=data_description,
            analysis=analysis,
            user_context=self.user_context or "No prior context available",
            num_recommendations=num_recommendations
        )
        self.memory.append(HumanMessage(content=formatted_rec_prompt))
        self.memory.append(AIMessage(content=rec_response))
        self.db.saveMemory(reportID=self.report_id,
                        llm=self.db.llm_id_by_name(self.llm.model),
                        prompet=formatted_rec_prompt,
                        response=rec_response,
                        chat=False)

        return rec_response


In [None]:
import pandas as pd
from DataAnalyzer import DataAnalyzer 
from langchain_ollama import OllamaLLM

# Load the data
data = pd.read_csv("WorldCupMatches/WorldCupMatches.csv")
df = pd.DataFrame(data)

# Load the model
llm = OllamaLLM(model='llama3')

# Create analyzer object
analyzer = DataAnalyzer(dataframe=df, llm=llm, user_id='huss')

# Run analysis
analysis_result = analyzer.analysis_data()

# Print the analysis
print(analysis_result)

As a seasoned data analyst, I will delve into the provided dataset and uncover valuable insights that can inform business decisions. Here's my comprehensive analysis:

**Data Overview**
The dataset contains 852 entries with 20 columns, primarily related to soccer matches. The data spans from 1930 to 2014, providing a unique opportunity to analyze trends over an extended period.

**Key Findings**

1. **Trends in Home Team Goals**: The mean home team goals per match is 1.81, while the median is 2. This suggests that most matches see at least two goals scored by the home team. There is no significant change in this metric over time.
2. **Away Team Goals**: The mean away team goals per match is 1.02, with a median of 1. This indicates that away teams tend to score fewer goals than home teams. The standard deviation (1.09) suggests some variation, but overall, away teams struggle to score.
3. **Attendance**: The average attendance per match is approximately 45,165, with a significant spread

In [None]:
import pandas as pd
from DataAnalyzer import DataAnalyzer 
from langchain_ollama import OllamaLLM

# Load the data
data = pd.read_csv("WorldCupMatches/WorldCupMatches.csv")
df = pd.DataFrame(data)

# Load the model
llm = OllamaLLM(model='llama3')

# Create analyzer object
analyzer = DataAnalyzer(dataframe=df, llm=llm, user_id='huss')

# Run analysis
analysis_result = analyzer.questions_gen(num=5)

# Print the analysis
print(analysis_result)


Raw LLM Output: 'Here are five analysis questions about the dataset:\n\n1. What is the average attendance by stage in the tournament?\n2. Which team scored the most goals during half-time, and what was their overall record?\n3. Is there a correlation between home team goals and win conditions?\n4. How does the distribution of away team goals vary across different cities (Montevideo)?\n5. What is the relationship between RoundID and MatchID?'
Extracted Questions List: ['What is the average attendance by stage in the tournament?', 'Which team scored the most goals during half-time, and what was their overall record?', 'Is there a correlation between home team goals and win conditions?', 'How does the distribution of away team goals vary across different cities (Montevideo)?', 'What is the relationship between RoundID and MatchID?']
['What is the average attendance by stage in the tournament?', 'Which team scored the most goals during half-time, and what was their overall record?', 'Is th

In [17]:
import pandas as pd
from DataAnalyzer import DataAnalyzer 
from langchain_ollama import OllamaLLM

# Load the data
data = pd.read_csv("WorldCupMatches/WorldCupMatches.csv")
df = pd.DataFrame(data)

# Load the model
llm = OllamaLLM(model='llama3')

# Create analyzer object
question = DataAnalyzer(dataframe=df, llm=llm, user_id='huss')

# Run analysis
question_result = question.questions_gen(num=5)

# Print the analysis
print(question_result)

Raw LLM Output: 'Here are five analysis questions about the dataset:\n\n1. What is the average attendance by year?\n2. Are there any significant differences in home team goals scored between different stages (Group 1, Group 2, etc.)?\n3. How does the distribution of half-time home goals compare to that of away goals?\n4. Is there a correlation between win conditions and match outcome (win/loss)?\n5. What is the average attendance for matches with high-scoring games (i.e., games with more than 3 total goals)?'
Extracted Questions List: ['What is the average attendance by year?', 'Are there any significant differences in home team goals scored between different stages (Group 1, Group 2, etc.)?', 'How does the distribution of half-time home goals compare to that of away goals?', 'Is there a correlation between win conditions and match outcome (win/loss)?', 'What is the average attendance for matches with high-scoring games (i.e., games with more than 3 total goals)?']
['What is the averag

In [2]:
import pandas as pd
from DataAnalyzer import DataAnalyzer 
from langchain_ollama import OllamaLLM

# Load the data
data = pd.read_csv("WorldCupMatches/WorldCupMatches.csv")
df = pd.DataFrame(data)

# Load the model
llm = OllamaLLM(model='llama3')

# Create analyzer object
recommandation = DataAnalyzer(dataframe=df, llm=llm, user_id='huss')

# Run analysis first
recommandation.analysis_data()

# Then generate recommendations
recommandation_result = recommandation.generate_recommendations(5)

# Print the recommendations
print(recommandation_result)


  rec_chain = LLMChain(llm=self.llm, prompt=rec_template)
  rec_response = rec_chain.run(


### 📋 Recommendations Table

| # | Recommendation Title | Expected Impact (%) | Potential Risk (with Emoji) |
|---|-----------------------|---------------------|-----------------------------|
| 1 | Optimize Home Team Performance | 15% increase in home team wins | ✅ Low risk of overestimating opponents |
| 2 | Enhance Away Team Strategy | 8% increase in away team points scored | ⚠️ Medium risk of losing key players |
| 3 | Launch Fan Engagement Campaign | 12% increase in attendance for high-profile matches | ❗ High risk of alienating existing fans |
| 4 | Implement Data-Driven Coaching | 10% increase in win rate for top-performing teams | ✅ Low risk of misinterpreting data insights |
| 5 | Invest in Player Development Program | 11% increase in player value over the next two seasons | ⚠️ Medium risk of not attracting top talent |

### 📋 Full Recommendation Details

1. **Optimize Home Team Performance** ✅
- **Details:** Develop a customized coaching strategy for each home team, focusing o