# Task 1: Data Preparation & Multi-Label Text Classification
### 1. Generate the synthetic data for calls_dataset.csv and domain_knowledge.json.

In [None]:
import pandas as pd
import random

# Define categories and sample data
categories = ["Objection", "Pricing Discussion", "Security", "Competition"]
sample_snippets = [
    "We love the analytics, but CompetitorX has a cheaper subscription.",
    "Our compliance team is worried about data handling. Are you SOC2 certified?",
    "Can you offer any discounts on your current pricing model?",
    "CompetitorY provides similar features at a lower cost.",
    "We need more information about the AI engine before proceeding.",
]

# Fixed seed + dedicated generator: "Restart & Run All" now regenerates the
# exact same 200 rows, so the split and every metric further down are
# reproducible instead of changing on each execution.
SEED = 42
rng = random.Random(SEED)

# Generate synthetic dataset: each row pairs one random snippet with a random,
# non-empty subset of the categories.
# NOTE: the labels are drawn independently of the snippet text, so scores of
# any classifier trained on this data mostly reflect label frequencies.
data = []
for i in range(1, 201):  # Generate 200 rows
    snippet = rng.choice(sample_snippets)
    labels = rng.sample(categories, rng.randint(1, len(categories)))  # Random labels
    data.append({"id": i, "text_snippet": snippet, "labels": ", ".join(labels)})

# Save to CSV
df = pd.DataFrame(data)
df.to_csv("calls_dataset.csv", index=False)
print("Synthetic dataset saved as calls_dataset.csv.")


Synthetic dataset saved as calls_dataset.csv.


In [None]:
# Sanity check: preview the first rows of the freshly generated dataset.
print(df.head())

   id                                       text_snippet  \
0   1  We need more information about the AI engine b...   
1   2  We love the analytics, but CompetitorX has a c...   
2   3  We need more information about the AI engine b...   
3   4  Can you offer any discounts on your current pr...   
4   5  Our compliance team is worried about data hand...   

                                              labels  
0                             Competition, Objection  
1          Pricing Discussion, Competition, Security  
2                                          Objection  
3                                          Objection  
4  Objection, Security, Pricing Discussion, Compe...  


### 2. Clean & preprocess the text: lowercasing, punctuation removal, stop-word removal, and lemmatization.

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:

import nltk
nltk.download('wordnet') # Download the wordnet dataset
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

# Load the dataset (re-read the CSV written by the generation cell)
df = pd.read_csv("calls_dataset.csv")

# Resources shared by preprocess_text; built once here so they are not
# re-created on every call.
stop_words = set(stopwords.words("english"))  # set of English stop words used for per-token membership tests
lemmatizer = WordNetLemmatizer()  # requires the 'wordnet' corpus downloaded above

def preprocess_text(text):
    """Normalise a raw call snippet into a space-separated string of lemmas.

    Steps, per whitespace-separated token: lowercase, strip punctuation,
    drop English stop words, lemmatize with the module-level ``lemmatizer``.

    Args:
        text: Raw snippet. Anything that is not a ``str`` (e.g. the NaN that
            pandas produces for an empty CSV cell) is treated as empty text
            instead of raising ``AttributeError``.

    Returns:
        The cleaned text; ``""`` when no token survives.
    """
    if not isinstance(text, str):
        return ""

    kept = []
    for raw_token in text.lower().split():
        token = re.sub(r"[^\w\s]", "", raw_token)  # Remove punctuation
        # Test the stop-word list against BOTH forms: the stripped token
        # (same check as before) and the raw token, because NLTK's English
        # list stores contractions with their apostrophe ("don't"), which
        # would otherwise slip through as "dont" once punctuation is removed.
        if not token or raw_token in stop_words or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Clean every snippet and keep the result next to the raw text
cleaned_snippets = df["text_snippet"].map(preprocess_text)
df["cleaned_text"] = cleaned_snippets

# Show raw vs. cleaned text side by side for the first rows
preview_columns = ["text_snippet", "cleaned_text"]
print(df[preview_columns].head())

[nltk_data] Downloading package wordnet to /root/nltk_data...


                                        text_snippet  \
0  Can you offer any discounts on your current pr...   
1  We need more information about the AI engine b...   
2  Can you offer any discounts on your current pr...   
3  We love the analytics, but CompetitorX has a c...   
4  Can you offer any discounts on your current pr...   

                                      cleaned_text  
0             offer discount current pricing model  
1            need information ai engine proceeding  
2             offer discount current pricing model  
3  love analytics competitorx cheaper subscription  
4             offer discount current pricing model  


### 3. Split your data into training and validation/test.

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split stable for a given input frame.
train_data, test_data = train_test_split(df, random_state=42, test_size=0.2)

# Persist both partitions so they can be reloaded without re-splitting
for split_frame, csv_path in ((train_data, "train_data.csv"), (test_data, "test_data.csv")):
    split_frame.to_csv(csv_path, index=False)
print("Train and test datasets created.")

Train and test datasets created.


### 4. Choose a suitable multi-label classification approach: one-vs-rest logistic regression has been chosen

### 5. Training the model:

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

# TF-IDF features: the vocabulary is learned on the training split only and
# then applied unchanged to the test split.
vectorizer = TfidfVectorizer(max_features=500)
X_train = vectorizer.fit_transform(train_data["cleaned_text"])
X_test = vectorizer.transform(test_data["cleaned_text"])

# Labels are stored as "A, B, C" strings; split them into lists and turn each
# list into a multi-hot row (one column per label).
train_label_lists = train_data["labels"].str.split(", ")
test_label_lists = test_data["labels"].str.split(", ")
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_label_lists)
y_test = mlb.transform(test_label_lists)

# One independent logistic-regression model per label
classifier = OneVsRestClassifier(LogisticRegression())
classifier.fit(X_train, y_train)

# Per-label precision / recall / F1 on the held-out split
y_pred = classifier.predict(X_test)
baseline_report = classification_report(y_test, y_pred, target_names=mlb.classes_)
print(baseline_report)


                    precision    recall  f1-score   support

       Competition       0.70      1.00      0.82        28
         Objection       0.70      1.00      0.82        28
Pricing Discussion       0.36      0.44      0.40        18
          Security       0.68      1.00      0.81        27

         micro avg       0.64      0.90      0.75       101
         macro avg       0.61      0.86      0.71       101
      weighted avg       0.63      0.90      0.74       101
       samples avg       0.65      0.93      0.72       101



#### With hyperparameter tuning (grid search)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# Vectorize the text with TfidfVectorizer
# (same settings as the baseline cell; repeated so this cell only needs
# train_data / test_data to run)
vectorizer = TfidfVectorizer(max_features=500)
X_train = vectorizer.fit_transform(train_data["cleaned_text"])
X_test = vectorizer.transform(test_data["cleaned_text"])

# Encode labels
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_data["labels"].str.split(", "))
y_test = mlb.transform(test_data["labels"].str.split(", "))

# Define parameter grid for Logistic Regression
# The "estimator__" prefix routes each parameter to the LogisticRegression
# wrapped inside OneVsRestClassifier.
param_grid = {
    "estimator__C": [0.01, 0.1, 1, 10],       # Regularization strength
    "estimator__penalty": ["l2"],             # Regularization type
    "estimator__solver": ["lbfgs", "saga"],   # Solvers
}

# Set up Logistic Regression and OneVsRestClassifier
# Deliberately NOT named `classifier`: GridSearchCV fits clones of the
# estimator it is given, so this object itself is never fitted. Binding it to
# `classifier` left later cells predicting with / pickling an unfitted model.
logistic = LogisticRegression(max_iter=1000)
base_classifier = OneVsRestClassifier(logistic)

# Perform Grid Search
grid_search = GridSearchCV(
    estimator=base_classifier,
    param_grid=param_grid,
    scoring="f1_micro",       # Use f1_micro as the scoring metric
    cv=3,                     # Cross-validation folds
    verbose=1,
    n_jobs=-1                 # Parallel processing
)
grid_search.fit(X_train, y_train)

# Get the best model (refit on the full training split by GridSearchCV)
best_model = grid_search.best_estimator_
# Later cells evaluate and persist `classifier`; point it at the fitted,
# tuned model so they operate on a trained estimator.
classifier = best_model
print("Best parameters:", grid_search.best_params_)

# Evaluate the model
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=mlb.classes_))


Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best parameters: {'estimator__C': 0.01, 'estimator__penalty': 'l2', 'estimator__solver': 'lbfgs'}
                    precision    recall  f1-score   support

       Competition       0.68      1.00      0.81        27
         Objection       0.60      1.00      0.75        24
Pricing Discussion       0.60      1.00      0.75        24
          Security       0.65      1.00      0.79        26

         micro avg       0.63      1.00      0.77       101
         macro avg       0.63      1.00      0.77       101
      weighted avg       0.63      1.00      0.77       101
       samples avg       0.63      1.00      0.74       101



### 5.b Cross-validation: K-fold cross-validation

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

from sklearn.pipeline import make_pipeline

# Encode labels
# CV-specific names are used throughout this cell so that the train/test
# artefacts of the previous cells (`vectorizer`, `mlb`, `X_train`, `X_test`,
# `y_train`, `y_test`) are NOT overwritten; later cells evaluate and pickle
# those objects.
cv_mlb = MultiLabelBinarizer()
cv_labels = cv_mlb.fit_transform(df["labels"].str.split(", "))
cv_texts = df["cleaned_text"].to_numpy()

# Define the model
# Vectorizer and classifier live in one pipeline, so TF-IDF is re-fitted on
# the training part of every fold; fitting it once on all rows would leak
# vocabulary / IDF statistics from the held-out fold into training.
cv_model = make_pipeline(
    TfidfVectorizer(max_features=500, ngram_range=(1, 2)),
    OneVsRestClassifier(LogisticRegression(max_iter=1000, C=1.0)),
)

# K-Fold Cross-Validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = []

for fold, (train_index, test_index) in enumerate(kfold.split(cv_texts), start=1):
    # Train on this fold's training rows
    cv_model.fit(cv_texts[train_index], cv_labels[train_index])

    # Predict on the held-out rows of this fold
    fold_pred = cv_model.predict(cv_texts[test_index])

    # Evaluate performance using micro-averaged F1-score
    fold_f1 = f1_score(cv_labels[test_index], fold_pred, average="micro")
    f1_scores.append(fold_f1)

    print(f"Fold {fold} F1-Score: {fold_f1:.4f}")

# Display overall results
print("\nK-Fold Cross-Validation Results")
print(f"Mean F1-Score: {np.mean(f1_scores):.4f}")
print(f"Standard Deviation: {np.std(f1_scores):.4f}")

Fold 1 F1-Score: 0.7490
Fold 2 F1-Score: 0.7131
Fold 3 F1-Score: 0.7325
Fold 4 F1-Score: 0.6987
Fold 5 F1-Score: 0.7750

K-Fold Cross-Validation Results
Mean F1-Score: 0.7337
Standard Deviation: 0.0268


### 6. Evaluate your model:

### a.  precision, recall, F1-score per label.

In [None]:
# Per-label precision / recall / F1 for the current classifier.
# NOTE(review): `classifier`, `X_test`, `y_test` and `mlb` are whatever the
# most recently executed cell bound them to - verify that they come from the
# same train/test split and the same feature space before trusting the numbers.
y_pred = classifier.predict(X_test)
evaluation_report = classification_report(y_test, y_pred, target_names=mlb.classes_)
print(evaluation_report)



                    precision    recall  f1-score   support

       Competition       0.68      1.00      0.81        27
         Objection       0.61      0.71      0.65        24
Pricing Discussion       0.62      0.88      0.72        24
          Security       0.65      1.00      0.79        26

         micro avg       0.64      0.90      0.75       101
         macro avg       0.64      0.90      0.74       101
      weighted avg       0.64      0.90      0.75       101
       samples avg       0.64      0.89      0.71       101



### b. Confusion matrix per label

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np

# Evaluate the model
y_pred = classifier.predict(X_test)

# Print confusion matrix for each label
# Each label is treated as its own binary problem. Matrix layout
# (rows = true value, columns = predicted value):
#   [[TN FP]
#    [FN TP]]
print("Confusion Matrix Results:\n")
for i, label in enumerate(mlb.classes_):
    # labels=[0, 1] pins the matrix to 2x2; without it the matrix collapses to
    # 1x1 when a label column contains a single value in both y_test and y_pred.
    cm = confusion_matrix(y_test[:, i], y_pred[:, i], labels=[0, 1])
    print(f"Confusion Matrix for Label: {label}")
    print(cm)
    print("\n")


Confusion Matrix Results:

Confusion Matrix for Label: Competition
[[ 0 13]
 [ 0 27]]


Confusion Matrix for Label: Objection
[[ 5 11]
 [ 7 17]]


Confusion Matrix for Label: Pricing Discussion
[[ 3 13]
 [ 3 21]]


Confusion Matrix for Label: Security
[[ 0 14]
 [ 0 26]]




# Task 2: Entity/Keyword Extraction with a Domain Knowledge Base

## 1. Dictionary Lookup:  Using domain_knowledge.json to search for known competitor names, product features, or pricing keywords in the text.

In [None]:
import json
import pandas as pd
import re
from sklearn.metrics import classification_report

# Load the dataset
# Re-reading the CSV rebinds `df`: columns that were only added in memory
# earlier in the notebook (e.g. "cleaned_text") are not part of this frame.
df = pd.read_csv("calls_dataset.csv")

from pathlib import Path

# Load the domain knowledge base
# The knowledge base lives in domain_knowledge.json. If the file does not
# exist yet it is created from the defaults below, so the notebook both
# produces the file and genuinely loads from it; an existing (possibly
# hand-edited) file is never overwritten.
DOMAIN_KNOWLEDGE_PATH = Path("domain_knowledge.json")
DEFAULT_DOMAIN_KNOWLEDGE = {
    "competitors": ["CompetitorX", "CompetitorY", "CompetitorZ"],
    "features": ["analytics", "AI engine", "data pipeline"],
    "pricing_keywords": ["discount", "renewal cost", "budget", "pricing model"]
}

if not DOMAIN_KNOWLEDGE_PATH.exists():
    DOMAIN_KNOWLEDGE_PATH.write_text(
        json.dumps(DEFAULT_DOMAIN_KNOWLEDGE, indent=2), encoding="utf-8"
    )

# Mapping of category name -> list of keywords to look for
domain_knowledge = json.loads(DOMAIN_KNOWLEDGE_PATH.read_text(encoding="utf-8"))


 ## 2. NER or Advanced Extraction:

 ### a. A simple rule-based approach (regex or keyword expansion).

In [None]:
# Function to extract entities using the dictionary
def dictionary_lookup(text, knowledge_base):
    """Find knowledge-base keywords in ``text`` by case-insensitive substring match.

    Because this is a substring test, a keyword also matches inside longer
    words (e.g. "discount" is found in "discounts").

    Args:
        text: Snippet to search. Non-string values (e.g. NaN) are treated as
            empty text and therefore match nothing.
        knowledge_base: Mapping of category name -> iterable of keywords.

    Returns:
        Dict of category -> list of matched keywords, in knowledge-base order
        and with the keyword's original casing. The keys "competitors",
        "features" and "pricing_keywords" are always present; any additional
        category found in ``knowledge_base`` is added as well.
    """
    extracted_entities = {
        "competitors": [],
        "features": [],
        "pricing_keywords": []
    }
    # Lowercase once instead of once per keyword.
    haystack = text.lower() if isinstance(text, str) else ""
    for category, keywords in knowledge_base.items():
        # setdefault: categories beyond the three defaults no longer raise KeyError.
        matches = extracted_entities.setdefault(category, [])
        for keyword in keywords:
            if keyword.lower() in haystack:
                matches.append(keyword)
    return extracted_entities

# Run the substring-based lookup over every snippet and preview the result
df["dictionary_entities"] = [
    dictionary_lookup(snippet, domain_knowledge) for snippet in df["text_snippet"]
]
print(df[["text_snippet", "dictionary_entities"]].head())


                                        text_snippet  \
0  We need more information about the AI engine b...   
1  We love the analytics, but CompetitorX has a c...   
2  We need more information about the AI engine b...   
3  Can you offer any discounts on your current pr...   
4  Our compliance team is worried about data hand...   

                                 dictionary_entities  
0  {'competitors': [], 'features': ['AI engine'],...  
1  {'competitors': ['CompetitorX'], 'features': [...  
2  {'competitors': [], 'features': ['AI engine'],...  
3  {'competitors': [], 'features': [], 'pricing_k...  
4  {'competitors': [], 'features': [], 'pricing_k...  


In [None]:
import re

def regex_lookup(text, knowledge_base):
    """Find knowledge-base keywords in ``text`` as whole words/phrases (case-insensitive).

    Unlike a plain substring test, a keyword only matches when it is not
    glued to other word characters, so "discount" does NOT match "discounts".

    Args:
        text: Snippet to search. Non-string values (e.g. NaN) are treated as
            empty text and therefore match nothing.
        knowledge_base: Mapping of category name -> iterable of keywords.

    Returns:
        Dict of category -> list of matched keywords, in knowledge-base order
        and with the keyword's original casing. The keys "competitors",
        "features" and "pricing_keywords" are always present; any additional
        category found in ``knowledge_base`` is added as well.
    """
    extracted_entities = {
        "competitors": [],
        "features": [],
        "pricing_keywords": []
    }
    haystack = text if isinstance(text, str) else ""
    for category, keywords in knowledge_base.items():
        # setdefault: categories beyond the three defaults no longer raise KeyError.
        matches = extracted_entities.setdefault(category, [])
        for keyword in keywords:
            # Lookarounds instead of \b: identical for keywords that begin and
            # end with a word character, but they also work for keywords with
            # punctuation at the edge (e.g. "C++"), where \b can never match.
            pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
            if re.search(pattern, haystack, re.IGNORECASE):
                matches.append(keyword)
    return extracted_entities

# Run the whole-word regex lookup over every snippet and preview the result
df["regex_entities"] = [
    regex_lookup(snippet, domain_knowledge) for snippet in df["text_snippet"]
]
print(df[["text_snippet", "regex_entities"]].head())


                                        text_snippet  \
0  We need more information about the AI engine b...   
1  We love the analytics, but CompetitorX has a c...   
2  We need more information about the AI engine b...   
3  Can you offer any discounts on your current pr...   
4  Our compliance team is worried about data hand...   

                                      regex_entities  
0  {'competitors': [], 'features': ['AI engine'],...  
1  {'competitors': ['CompetitorX'], 'features': [...  
2  {'competitors': [], 'features': ['AI engine'],...  
3  {'competitors': [], 'features': [], 'pricing_k...  
4  {'competitors': [], 'features': [], 'pricing_k...  


### b. A pre-trained NER model: using spaCy to extract entities.

In [None]:
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy

# Load spaCy model
# `nlp` is read as a global by ner_extraction below, so the pipeline is loaded
# once rather than on every call.
nlp = spacy.load("en_core_web_sm")

# Function to extract named entities
def ner_extraction(text):
    """Run the spaCy pipeline on ``text`` and bucket selected entity types.

    Args:
        text: Snippet passed to the module-level ``nlp`` pipeline.

    Returns:
        ``{"ORG": [...], "PRODUCT": [...]}`` holding the entity texts in
        document order. Entities with any other label are ignored.
    """
    # spaCy label -> output bucket.
    # NOTE(review): NORP entities are filed under "PRODUCT" - confirm this
    # mapping is intended for the domain.
    bucket_for_label = {"ORG": "ORG", "PRODUCT": "PRODUCT", "NORP": "PRODUCT"}

    entities = {"ORG": [], "PRODUCT": []}  # Customize based on domain
    for ent in nlp(text).ents:
        bucket = bucket_for_label.get(ent.label_)
        if bucket is not None:
            entities[bucket].append(ent.text)
    return entities

# Run the spaCy-based extraction over every snippet and preview the result
ner_results = df["text_snippet"].map(ner_extraction)
df["ner_entities"] = ner_results
print(df[["text_snippet", "ner_entities"]].head())


                                        text_snippet  \
0  We need more information about the AI engine b...   
1  We love the analytics, but CompetitorX has a c...   
2  We need more information about the AI engine b...   
3  Can you offer any discounts on your current pr...   
4  Our compliance team is worried about data hand...   

                     ner_entities  
0  {'ORG': ['AI'], 'PRODUCT': []}  
1      {'ORG': [], 'PRODUCT': []}  
2  {'ORG': ['AI'], 'PRODUCT': []}  
3      {'ORG': [], 'PRODUCT': []}  
4      {'ORG': [], 'PRODUCT': []}  


## 3. Combine both approaches to produce a final set of extracted entities.

In [None]:
# Combine both approaches
def combine_entities(dict_entities, ner_entities):
    """Merge dictionary-lookup results with NER results into one entity dict.

    NER "ORG" entities are added to "competitors" and NER "PRODUCT" entities
    to "features"; other NER categories are ignored.

    Args:
        dict_entities: Mapping of domain category -> list of matched keywords.
            It is NOT modified (neither the dict nor the lists inside it).
        ner_entities: Mapping of NER category -> list of entity texts.

    Returns:
        New dict of category -> list without duplicates, keeping first-seen
        order (dictionary matches first, then NER additions).
    """
    ner_to_domain = {"ORG": "competitors", "PRODUCT": "features"}

    # Copy every list: a shallow dict.copy() shares the inner lists, so
    # extending them also changed the caller's `dict_entities` and made
    # repeated runs keep appending.
    combined = {category: list(items) for category, items in dict_entities.items()}

    for ner_category, items in ner_entities.items():
        target = ner_to_domain.get(ner_category)  # Map NER category to domain knowledge
        if target is not None:
            # setdefault: tolerate a dict_entities that lacks the target key.
            combined.setdefault(target, []).extend(items)

    # dict.fromkeys removes duplicates while keeping a deterministic order
    # (list(set(...)) returned the items in arbitrary order).
    return {category: list(dict.fromkeys(items)) for category, items in combined.items()}

# Merge the dictionary matches with the NER matches, row by row
df["combined_entities"] = [
    combine_entities(dict_found, ner_found)
    for dict_found, ner_found in zip(df["dictionary_entities"], df["ner_entities"])
]
print(df[["text_snippet", "combined_entities"]].head())


                                        text_snippet  \
0  We need more information about the AI engine b...   
1  We love the analytics, but CompetitorX has a c...   
2  We need more information about the AI engine b...   
3  Can you offer any discounts on your current pr...   
4  Our compliance team is worried about data hand...   

                                   combined_entities  
0  {'competitors': ['AI'], 'features': ['AI engin...  
1  {'competitors': ['CompetitorX'], 'features': [...  
2  {'competitors': ['AI'], 'features': ['AI engin...  
3  {'competitors': [], 'features': [], 'pricing_k...  
4  {'competitors': [], 'features': [], 'pricing_k...  


In [None]:
# Example: Assume 'true_entities' is a column containing ground truth
from sklearn.metrics import precision_score, recall_score

# Binarize extracted and true entities for each category
# This part depends on having labeled ground truth data
# Adapt to match available annotations


## 4. Output the extracted entities for each snippet in a structured format.

In [None]:
import json

# Save results to CSV
# The entity dicts are serialised as JSON text so the column is machine
# readable in any language; writing the dicts directly stores their Python
# repr instead. (For these str/list/dict values the JSON text can still be
# parsed with ast.literal_eval.) The in-memory column is left untouched.
export_frame = df[["id", "text_snippet", "combined_entities"]].assign(
    combined_entities=df["combined_entities"].map(
        lambda entities: json.dumps(entities, ensure_ascii=False)
    )
)
export_frame.to_csv("extracted_entities.csv", index=False)
print("Extracted entities saved to extracted_entities.csv.")

print(df[["id", "text_snippet", "combined_entities"]])

Extracted entities saved to extracted_entities.csv.
      id                                       text_snippet  \
0      1  We need more information about the AI engine b...   
1      2  We love the analytics, but CompetitorX has a c...   
2      3  We need more information about the AI engine b...   
3      4  Can you offer any discounts on your current pr...   
4      5  Our compliance team is worried about data hand...   
..   ...                                                ...   
195  196  CompetitorY provides similar features at a low...   
196  197  CompetitorY provides similar features at a low...   
197  198  We love the analytics, but CompetitorX has a c...   
198  199  Our compliance team is worried about data hand...   
199  200  CompetitorY provides similar features at a low...   

                                     combined_entities  
0    {'competitors': ['AI'], 'features': ['AI engin...  
1    {'competitors': ['CompetitorX'], 'features': [...  
2    {'competitors':

# Save the trained model and vectorizer (pickle)


In [None]:
import pickle


# Persist the fitted objects needed at inference time: the classifier and the
# vectorizer that turns text into its input features.
# NOTE(review): confirm that `vectorizer` is the one `classifier` was trained
# with - both names are rebound by several cells above.
artifacts = (
    (classifier, "model.pkl", "Model saved as model.pkl"),
    (vectorizer, "vectorizer.pkl", "Vectorizer saved as vectorizer.pkl"),
)
for artifact, pkl_path, done_message in artifacts:
    with open(pkl_path, "wb") as f:
        pickle.dump(artifact, f)
    print(done_message)


Model saved as model.pkl
Vectorizer saved as vectorizer.pkl


In [None]:
import os
import sys
import pickle

# Pickle the notebook's file name into projectname.pickle in the current
# working directory and show where it was written.
projectabspathname = os.path.abspath('projectname.pickle')
print(projectabspathname)
projectname = 'GTM_buddy_Task1.ipynb'
# Context manager: the file is closed even if pickle.dump raises.
with open(projectabspathname, 'wb') as projectpickle:
    pickle.dump(projectname, projectpickle)

/content/projectname.pickle


In [None]:
from google.colab import files

# Download the file written by the previous cell; reusing its computed path
# avoids hard-coding the Colab working directory ("/content").
files.download(projectabspathname)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>