In [14]:
import json
import os
import pathlib
import textwrap

import google.generativeai as genai
from google.api_core import retry
from IPython.display import Markdown
from IPython.display import display

In [15]:
# Read the Gemini API key from the environment instead of embedding it in the
# notebook, so the key never ends up in the saved file or its outputs.
# A missing variable fails here with a KeyError rather than later at request time.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [16]:
# Gemini model handle used by the extraction cell further down.
MODEL_NAME = 'models/gemini-1.5-pro-latest'
model = genai.GenerativeModel(model_name=MODEL_NAME)

In [17]:
# Read the pdf
import pymupdf

# Open the report inside a context manager so the document is closed as soon
# as its text has been collected, even if a page fails to extract.
with pymupdf.open("MOM-arjtech meeting.pdf") as doc:
  # Each page's text is followed by one newline; join once instead of
  # growing the string page by page.
  extracted_text = "".join(page.get_text() + "\n" for page in doc)

# Preview the first 1000 characters only.
print(extracted_text[:1000])

 
Arjtech Private Ltd 
Project Progress Report: Customer Churn Prediction Engine 
Date: September 25, 2024
Prepared by: Arjun V 
Project Objective 
The objective of this project is to develop a Customer Churn Prediction Engine that will 
help Arjtech Private Ltd predict which customers are at risk of leaving the service. The 
engine will use machine learning models to identify patterns in customer behavior and 
engagement, helping to retain valuable customers and optimize marketing efforts. 
Current Status 
●
Phase 1: Requirement Analysis - Completed
The project requirements were analyzed, and key churn indicators were identified. 
This phase involved discussions with stakeholders and subject matter experts to 
understand business objectives, churn patterns, and the desired outputs of the 
model. 
●
Phase 2: Data Collection - Completed
Data collection from various sources, including customer activity, usage logs, 
feedback forms, and subscription details, has been completed. The collec

In [18]:
# Current project record, stored as JSON text.
# It is passed as the `current_data` argument of update_latest_info() below,
# which parses it with json.loads, so the literal must stay valid JSON.

current_structure = """{
  "ProjectTitle": "Customer Churn Prediction Engine",
  "Date": "September 25, 2024",
  "ProjectObjective": "Develop a Customer Churn Prediction Engine to predict which customers are at risk of churning and provide insights to optimize retention strategies.",
  "CurrentStatus": {
    "RequirementAnalysis": "Completed",
    "DataCollection": "Completed",
    "DataPreparation": {
      "Status": "In Progress",
      "Details": "Preliminary data cleaning done. Further feature engineering, scaling, and encoding categorical features ongoing."
    },
    "ModelSelectionAndTraining": {
      "Status": "Upcoming",
      "Details": "Logistic Regression as baseline, followed by Random Forest and Gradient Boosting models."
    }
  },
  "TeamMembers": {
    "ProjectLead": "Lead Name",
    "DataScientist": "Your Name",
    "MLEngineer": "Engineer Name",
    "DataEngineer": "Engineer Name",
    "BusinessAnalyst": "Analyst Name",
    "SoftwareEngineer": "Engineer Name"
  },
  "CompletedTasks": [
    "Requirement analysis meetings with stakeholders.",
    "Data collection from internal and external data sources.",
    "Initial data cleaning and preprocessing."
  ],
  "NewlyAssignedTasks": {
    "DataScientist": "Continue data preprocessing and feature engineering.",
    "MLEngineer": "Explore various machine learning models and finalize shortlist for model training.",
    "DataEngineer": "Set up infrastructure for data storage and handling during model training.",
    "SoftwareEngineer": "Begin planning deployment pipelines for model integration."
  },
  "MinutesOfMeeting": {
    "Date": "September 24, 2024",
    "Attendees": ["List of Attendees"],
    "Agenda": [
      "Review current progress.",
      "Discuss upcoming tasks related to model selection and training.",
      "Address roadblocks in data preparation."
    ],
    "DiscussionPoints": [
      {
        "Topic": "Data Collection",
        "Details": "Data collection completed. Missing values handled."
      },
      {
        "Topic": "Data Preparation",
        "Details": "Feature engineering, handling categorical variables, and scaling are in progress. Challenges in class imbalance discussed."
      },
      {
        "Topic": "Model Selection",
        "Details": "Logistic Regression will be the baseline, followed by Random Forest and Gradient Boosting."
      },
      {
        "Topic": "Timeline",
        "Details": "Next review scheduled after model training completion."
      }
    ],
    "ActionItems": {
      "DataScientist": "Finalize data preparation and start exploratory analysis.",
      "MLEngineer": "Start model selection with Logistic Regression.",
      "DataEngineer": "Ensure preprocessed data is ready for training.",
      "ProjectLead": "Schedule model performance review session."
    },
    "NextMeeting": "October 2, 2024"
  }
}"""

In [19]:
# define expected structure
# structuring extracted text


def return_req_structure():
  """Return the JSON template that the model is asked to fill in.

  The template is valid JSON whose values are placeholder descriptions.
  ``DataPreparation`` carries both ``Status`` and ``Details`` so it lines up
  with the ``DataPreparation`` entry of ``current_structure``; previously the
  ``Details`` field was missing and a trailing comma made the template
  unparseable.

  Returns:
    str: the schema template as JSON text.
  """

  struct = """{
    "ProjectTitle": "Project Title",
    "Date": "YYYY-MM-DD",
    "ProjectObjective": "Brief description of the project objective.",
    "CurrentStatus": {
      "RequirementAnalysis": "Status of requirement analysis (e.g., Completed, In Progress, Not Started)",
      "DataCollection": "Status of data collection (e.g., Completed, In Progress, Not Started)",
      "DataPreparation": {
        "Status": "Status of data preparation (e.g., Completed, In Progress, Not Started)",
        "Details": "Details regarding the data preparation progress."
      },
      "ModelSelectionAndTraining": {
        "Status": "Status of model selection and training (e.g., Upcoming, In Progress, Completed)",
        "Details": "Details regarding the model selection process."
      }
    },
    "TeamMembers": ["List team member roles"],
    "CompletedTasks": [
      "Task 1 description",
      "Task 2 description",
      "Task 3 description"
    ],
    "NewlyAssignedTasks": {
      "DataScientist": "New task for the data scientist",
      "MLEngineer": "New task for the machine learning engineer",
      "DataEngineer": "New task for the data engineer",
      "SoftwareEngineer": "New task for the software engineer"
    },
    "MinutesOfMeeting": {
      "Date": "YYYY-MM-DD",
      "Attendees": ["List of roles of attendees"],
      "Agenda": [
        "Agenda item 1",
        "Agenda item 2",
        "Agenda item 3"
      ],
      "DiscussionPoints": [
        {
          "Topic": "Topic 1",
          "Details": "Discussion details for topic 1."
        },
        {
          "Topic": "Topic 2",
          "Details": "Discussion details for topic 2."
        },
        {
          "Topic": "Topic 3",
          "Details": "Discussion details for topic 3."
        }
      ],
      "ActionItems": {
        "DataScientist": "Action item for the data scientist.",
        "MLEngineer": "Action item for the machine learning engineer.",
        "DataEngineer": "Action item for the data engineer.",
        "ProjectLead": "Action item for the project lead."
      },
      "NextMeeting": "Next meeting date"
    }
  }
  """

  return struct

def extract_text_in_structure_latest_description(text, project_context=None):
  """Ask the Gemini model to restate a meeting description as schema-shaped JSON.

  Args:
    text: Free-text description of the latest meeting; appended to the end of
      the prompt.
    project_context: Background project text embedded in the prompt's opening
      sentence. When omitted, the notebook-level ``extracted_text`` is used,
      which is what this function read implicitly before the parameter existed.

  Returns:
    str: the model's JSON reply, re-serialized with 4-space indentation.

  Raises:
    json.JSONDecodeError: if the model's reply is not valid JSON.
  """
  if project_context is None:
    project_context = extracted_text

  # Dedent the template *before* filling it in. Dedenting afterwards is a no-op
  # whenever the inserted text has a line starting at column 0, because
  # textwrap.dedent only strips whitespace common to every line.
  prompt_template = textwrap.dedent("""
  From the text describing the latest meeting regarding the project {project_context} return JSON describing the project details using the following schema:
  {structure}

    All fields are required.

    Do not Hallucinate

    If a field is empty keep it empty/null

    Important: Only return a single piece of valid JSON text.

    Here is the text:

    """)

  # str.format does not re-scan substituted values, so the braces inside the
  # schema text and the context are inserted verbatim.
  prompt = prompt_template.format(
      project_context=project_context,
      structure=return_req_structure(),
  ) + text

  response = model.generate_content(
      prompt,
      generation_config={'response_mime_type': 'application/json'},
  )

  # Round-trip through json so an invalid reply fails here, and so the
  # returned text has consistent formatting.
  return json.dumps(json.loads(response.text), indent=4)

In [20]:
# Free-text summary of the October 2, 2024 follow-up meeting.
# It is passed below to extract_text_in_structure_latest_description(), which
# appends it to the end of the model prompt.
latest_status_description = """In the follow-up meeting held on October 2, 2024, the team reviewed
 the successful completion of the model selection and training phase for the Customer
 Churn Prediction Engine. After evaluating multiple models, including Logistic Regression,
 Random Forest, and XGBoost, the team selected XGBoost as the final model due to its superior performance,
 achieving an accuracy of 85%, a recall of 82%, and an AUC-ROC score of 0.89. SHAP analysis was used to provide
 transparency in the model’s predictions, revealing key churn indicators such as customer engagement frequency
 and subscription tenure. The team discussed challenges such as class imbalance, which was addressed using SMOTE,
 and overfitting risks, mitigated through regularization and cross-validation. Moving forward, the team will fine-tune
 the model, conduct final validation, and begin deployment planning. The software and ML engineers will develop APIs to
 integrate the model into the CRM system, while an automated retraining pipeline will be implemented to keep the
 model updated every 14 days. The business analyst will collaborate with the marketing team to apply the insights for
 targeted retention strategies. The next meeting is scheduled for October 16, 2024
, to finalize deployment plans and align business strategies with the model’s outputs."""

In [21]:
# Send the meeting summary to the model; the result is a JSON-formatted string
# (the function returns json.dumps(...) output, not a dict).
structured_latest_status_description = extract_text_in_structure_latest_description(latest_status_description)

{'ProjectTitle': 'Customer Churn Prediction Engine',
 'Date': '2024-09-25',
 'ProjectObjective': 'The objective of this project is to develop a Customer Churn Prediction Engine that will help Arjtech Private Ltd predict which customers are at risk of leaving the service. The engine will use machine learning models to identify patterns in customer behavior and engagement, helping to retain valuable customers and optimize marketing efforts.',
 'CurrentStatus': {'RequirementAnalysis': 'Completed',
  'DataCollection': 'Completed',
  'DataPreparation': {'Status': 'In Progress'},
  'ModelSelectionAndTraining': {'Status': 'Completed',
   'Details': 'Logistic Regression was used as the baseline model, followed by ensemble methods like Random Forest and Gradient Boosting. XGBoost was selected as the final model due to its superior performance, achieving an accuracy of 85%, a recall of 82%, and an AUC-ROC score of 0.89. SHAP analysis was used for model explainability.'}},
 'TeamMembers': ['Proje

In [22]:
# Parse the model's JSON string into a Python object for inspection.
latest = json.loads(structured_latest_status_description)

In [24]:
# Show which top-level sections the model returned.
latest.keys()

dict_keys(['ProjectTitle', 'Date', 'ProjectObjective', 'CurrentStatus', 'TeamMembers', 'CompletedTasks', 'NewlyAssignedTasks', 'MinutesOfMeeting'])

In [25]:
# update latest info
import copy

def update_latest_info(current_data, latest_data):
  """Return a copy of the current project record with its mutable sections refreshed.

  Only the sections listed in ``mutable_keys`` are taken from ``latest_data``;
  every other top-level key keeps the value from ``current_data``. A mutable
  section is replaced wholesale (not merged), and only when it already exists
  in ``current_data``. A section that is missing from ``latest_data`` or is
  ``None`` there is left as it was, instead of raising ``KeyError`` or wiping
  existing information.

  Args:
    current_data: The current record, as JSON text or an already-parsed dict.
    latest_data: The newly extracted record, as JSON text or an already-parsed dict.

  Returns:
    dict: a new record; neither input is modified or aliased by it.

  Raises:
    json.JSONDecodeError: if a string argument is not valid JSON.
  """
  # Accept parsed dicts as well as JSON text so callers holding either form
  # can use the function directly.
  if isinstance(current_data, str):
    current_data = json.loads(current_data)
  if isinstance(latest_data, str):
    latest_data = json.loads(latest_data)

  # Sections that change from meeting to meeting; the rest (title, date,
  # objective, team) is treated as fixed.
  mutable_keys = ['CurrentStatus', 'CompletedTasks', 'NewlyAssignedTasks', 'MinutesOfMeeting']

  updated_data = copy.deepcopy(current_data)

  for key in mutable_keys:
    if key not in current_data:
      continue
    latest_value = latest_data.get(key)
    if latest_value is None:
      # Nothing new was extracted for this section; keep what we have.
      continue
    # Deep copy so later edits to the result never reach the caller's input.
    updated_data[key] = copy.deepcopy(latest_value)

  return updated_data

In [28]:
# Combine the stored record with the newly extracted one; both arguments are
# JSON strings, and the result is a dict.
updated_data = update_latest_info(current_structure, structured_latest_status_description)
print(updated_data)

{'ProjectTitle': 'Customer Churn Prediction Engine', 'Date': 'September 25, 2024', 'ProjectObjective': 'Develop a Customer Churn Prediction Engine to predict which customers are at risk of churning and provide insights to optimize retention strategies.', 'CurrentStatus': {'RequirementAnalysis': 'Completed', 'DataCollection': 'Completed', 'DataPreparation': {'Status': 'In Progress'}, 'ModelSelectionAndTraining': {'Status': 'Completed', 'Details': 'Logistic Regression was used as the baseline model, followed by ensemble methods like Random Forest and Gradient Boosting. XGBoost was selected as the final model due to its superior performance, achieving an accuracy of 85%, a recall of 82%, and an AUC-ROC score of 0.89. SHAP analysis was used for model explainability.'}}, 'TeamMembers': {'ProjectLead': 'Lead Name', 'DataScientist': 'Your Name', 'MLEngineer': 'Engineer Name', 'DataEngineer': 'Engineer Name', 'BusinessAnalyst': 'Analyst Name', 'SoftwareEngineer': 'Engineer Name'}, 'CompletedTa

In [29]:
# Display the combined record using the notebook's rich output.
updated_data

{'ProjectTitle': 'Customer Churn Prediction Engine',
 'Date': 'September 25, 2024',
 'ProjectObjective': 'Develop a Customer Churn Prediction Engine to predict which customers are at risk of churning and provide insights to optimize retention strategies.',
 'CurrentStatus': {'RequirementAnalysis': 'Completed',
  'DataCollection': 'Completed',
  'DataPreparation': {'Status': 'In Progress'},
  'ModelSelectionAndTraining': {'Status': 'Completed',
   'Details': 'Logistic Regression was used as the baseline model, followed by ensemble methods like Random Forest and Gradient Boosting. XGBoost was selected as the final model due to its superior performance, achieving an accuracy of 85%, a recall of 82%, and an AUC-ROC score of 0.89. SHAP analysis was used for model explainability.'}},
 'TeamMembers': {'ProjectLead': 'Lead Name',
  'DataScientist': 'Your Name',
  'MLEngineer': 'Engineer Name',
  'DataEngineer': 'Engineer Name',
  'BusinessAnalyst': 'Analyst Name',
  'SoftwareEngineer': 'Engine