In [1]:
import pandas as pd

# Load your dataset (replace 'your_dataset.csv' with your actual dataset file)
data = pd.read_csv('f:\\indira college hackathon\\email.csv')

# Since PandasAI cannot be imported, we will skip its initialization and visualization
# Instead, we can display the first few rows of the dataset as a simple visualization
print(data.head())


  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [2]:
import pandas as pd
import os
from pandasai import Agent

# Set the API key for PandasAI
os.environ['PANDASAI_API_KEY'] = 'your_api_key_here'  # Replace 'your_api_key_here' with your actual API key

# Create a DataFrame containing sales data by country
sales_by_country = pd.DataFrame({
    "country": ["United States", "United Kingdom", "France", "Germany", "Italy", "Spain", "Canada", "Australia", "Japan", "China"],
    "sales": [5000, 3200, 2900, 4100, 2300, 2100, 2500, 2600, 4500, 7000]
})

# Initialize the PandasAI agent with the sales data
agent = Agent(sales_by_country)

# Use the agent to query the top 5 countries by sales
top_countries = agent.chat('Which are the top 5 countries by sales?')
# Expected Output: China, United States, Japan, Germany, Australia


Traceback (most recent call last):
  File "c:\Users\HP\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandasai\pipelines\chat\generate_chat_pipeline.py", line 335, in run
    ).run(input)
      ^^^^^^^^^^
  File "c:\Users\HP\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandasai\pipelines\pipeline.py", line 137, in run
    raise e
  File "c:\Users\HP\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandasai\pipelines\pipeline.py", line 101, in run
    step_output = logic.execute(
                  ^^^^^^^^^^^^^^
  File "c:\Users\HP\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandasai\pipelines\chat\code_generator.py", line 33, in execute
    code = pipeline_context.config.llm.generate_code(input, pipeline_context)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\HP\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandasai\llm\base.py", line 201, in generate_code
    response = self.call

Exception in APILogger: {"message":"Invalid API Key!","data":null}


In [3]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Split the dataset into features and target variable
X = sales_by_country[['sales']]
y = sales_by_country['country']

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Start MLflow run
with mlflow.start_run():
    # Initialize and train the model
    model = RandomForestRegressor()
    model.fit(X_train, y_train)

    # Make predictions
    predictions = model.predict(X_test)

    # Decode the predictions back to original labels
    predictions_decoded = label_encoder.inverse_transform(predictions.astype(int))

    # Calculate and log the mean squared error
    mse = mean_squared_error(y_test, predictions)
    mlflow.log_metric("mse", mse)

    # Log the model with input example for signature inference
    mlflow.sklearn.log_model(model, "model", input_example=X_test.iloc[0].to_dict())




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

In [4]:
# Production-grade issues faced by developers in building data pipelines include:


# 1. Data Quality: Ensuring data consistency and integrity is crucial, as poor data quality can lead to inaccurate insights and decisions.
# 2. Scalability: As data volumes grow, pipelines must efficiently scale to handle increased loads without performance degradation.
# 3. Versioning: Managing versions of both data and models is essential to track changes and ensure reproducibility in experiments.
# 4. Monitoring: Continuous monitoring of pipeline performance is necessary to detect and address issues in real-time, ensuring reliability.
# 5. Collaboration: Effective collaboration among team members is vital, requiring tools that facilitate sharing of code, experiments, and results.


# Strategies to address these production-grade issues using MLflow include:

# 1. Data Quality: Implement logging capabilities in MLflow to track and validate data quality metrics.
# 2. Scalability: Utilize MLflow's integration with cloud services to dynamically scale resources based on demand.
# 3. Versioning: Leverage MLflow's model registry for effective management and versioning of models.
# 4. Monitoring: Use MLflow's tracking server to log and monitor pipeline performance metrics.
# 5. Collaboration: Employ MLflow's centralized tracking system to enhance team collaboration on experiments and results.

# Example code to log data quality metrics and manage model versioning:


In [5]:
def log_data_quality_metrics(data):
    # Function to log data quality metrics
    missing_values = data.isnull().sum()
    mlflow.log_metrics({"missing_values": missing_values.sum()})
    
    # Log additional data quality metrics
    data_types = data.dtypes.to_dict()
    mlflow.log_params({"data_types": data_types})

# Call the function to log data quality metrics
log_data_quality_metrics(sales_by_country)
