<a href="https://colab.research.google.com/github/Ashu598/conversation_analysis/blob/main/biztel_oops.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Implementing OOP Principles for the Data Pipeline

In [None]:
from math import pi
import numpy as np
import pandas as pd
import json
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook relies on (tokenizer models,
# stopword list, WordNet), in the same order as before.
for _resource in ("punkt_tab", "stopwords", "punkt", "wordnet"):
    nltk.download(_resource)

class DataPipeline:
    """Load a conversation JSON export into a flat, one-row-per-turn DataFrame.

    The JSON file is expected to be an object keyed by conversation id, where
    each value may carry ``article_url``, ``config``, ``conversation_rating``
    and a ``content`` list of turns.  Each turn must provide ``message``,
    ``agent``, ``sentiment``, ``knowledge_source`` (an iterable of strings)
    and ``turn_rating``.
    """

    # Column order of the frame produced by load_data.  Passing it explicitly
    # to the DataFrame constructor keeps the columns present even when the
    # export contains no turns at all.
    _COLUMNS = (
        "conversation_id",
        "article_url",
        "config",
        "message",
        "agent",
        "sentiment",
        "knowledge_source",
        "turn_rating",
        "agent_1_rating",
        "agent_2_rating",
    )

    # Column name -> {label: integer code}.  Series.map turns any label that
    # is absent from its table into NaN.
    # NOTE(review): the agent-rating tables have no "Poor" entry, unlike
    # turn_rating, so such a label would become NaN - confirm against the data.
    _CATEGORY_CODES = {
        "config": {"A": 1, "B": 2, "C": 3, "D": 4},
        "agent": {"agent_1": 1, "agent_2": 2},
        "sentiment": {
            "Fearful": 0,
            "Angry": 1,
            "Disgusted": 2,
            "Sad": 3,
            "Neutral": 4,
            "Surprised": 5,
            "Happy": 6,
        },
        "turn_rating": {
            "Poor": 0,
            "Not Good": 1,
            "Passable": 2,
            "Good": 3,
            "Excellent": 4,
        },
        "agent_1_rating": {"Not Good": 1, "Passable": 2, "Good": 3, "Excellent": 4},
        "agent_2_rating": {"Not Good": 1, "Passable": 2, "Good": 3, "Excellent": 4},
    }

    def __init__(self, file_path: str):
        """Remember the JSON path; no file access happens until load_data."""
        self.file_path = file_path
        self.df = None  # populated by load_data()

    def load_data(self) -> pd.DataFrame:
        """Read the JSON file and flatten it to one row per conversation turn.

        Returns:
            The DataFrame, which is also stored on ``self.df``.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if the file is not valid JSON.
            KeyError: if a turn lacks one of its required keys.
        """
        with open(self.file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        rows = []
        for conv_id, conv_data in data.items():
            # Conversation-level fields are repeated on every turn row.
            article_url = conv_data.get("article_url", "")
            config = conv_data.get("config", "")
            conversation_rating = conv_data.get("conversation_rating", {})

            for turn in conv_data.get("content", []):
                rows.append({
                    "conversation_id": conv_id,
                    "article_url": article_url,
                    "config": config,
                    "message": turn["message"],
                    "agent": turn["agent"],
                    "sentiment": turn["sentiment"],
                    # Collapse the list of sources into one display string.
                    "knowledge_source": ", ".join(turn["knowledge_source"]),
                    "turn_rating": turn["turn_rating"],
                    "agent_1_rating": conversation_rating.get("agent_1", ""),
                    "agent_2_rating": conversation_rating.get("agent_2", ""),
                })

        self.df = pd.DataFrame(rows, columns=list(self._COLUMNS))
        return self.df

    # Convert Categorical Variables into Numerical Representations
    def convert_numerical_columns(self) -> pd.DataFrame:
        """Replace categorical labels with integer codes, in place on ``self.df``.

        Labels that are not listed in the code table for their column become
        NaN.  The method is not idempotent: a second call maps the already
        encoded integers and yields NaN everywhere.

        Returns:
            ``self.df`` with the encoded columns.

        Raises:
            RuntimeError: if load_data has not been called yet.
        """
        if self.df is None:
            raise RuntimeError(
                "load_data() must be called before convert_numerical_columns()"
            )

        # Operate on the instance's own frame rather than a module-level
        # ``df``, so the class works outside the notebook's global namespace.
        for column, codes in self._CATEGORY_CODES.items():
            self.df[column] = self.df[column].map(codes)
        return self.df


# Initialize Pipeline
# Build the pipeline for the dataset file at its Colab path.
pipeline = DataPipeline("/content/BiztelAI_DS_Dataset_Mar'25.json")
# Flatten the JSON into one row per conversation turn.
df = pipeline.load_data()
# Encode the categorical columns as integers; `df` is rebound to the result.
df = pipeline.convert_numerical_columns()
# Message text cleaning is applied in a separate cell further down.
# Preview the first rows of the encoded frame.
print(df.head())


                          conversation_id  \
0  t_d004c097-424d-45d4-8f91-833d85c2da31   
1  t_d004c097-424d-45d4-8f91-833d85c2da31   
2  t_d004c097-424d-45d4-8f91-833d85c2da31   
3  t_d004c097-424d-45d4-8f91-833d85c2da31   
4  t_d004c097-424d-45d4-8f91-833d85c2da31   

                                         article_url  config  \
0  https://www.washingtonpost.com/sports/colleges...       3   
1  https://www.washingtonpost.com/sports/colleges...       3   
2  https://www.washingtonpost.com/sports/colleges...       3   
3  https://www.washingtonpost.com/sports/colleges...       3   
4  https://www.washingtonpost.com/sports/colleges...       3   

                                             message  agent  sentiment  \
0  Did you know that the University of Iowa's loc...      1        NaN   
1  I think I did hear something about that.  I im...      2        4.0   
2  So, it would be in the visiting team's locker ...      1        NaN   
3  Right.  Teams do all kinds of things to bothe

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Invariants shared by every preprocess_text call.  Building them once avoids
# re-evaluating the stopword list for each token and re-creating the
# lemmatizer for each message.
_NON_ALNUM_RE = re.compile(r"[^a-zA-Z0-9]")
_ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Return a normalized, space-joined version of *text*.

    Steps, in order: lowercase, replace every character outside
    ``[a-zA-Z0-9]`` with a space, tokenize with ``word_tokenize``, drop
    English stopwords, and lemmatize each remaining token.

    Args:
        text: The raw message string.

    Returns:
        The surviving lemmatized tokens joined by single spaces; an empty
        string when nothing survives.
    """
    text = text.lower()
    text = _NON_ALNUM_RE.sub(" ", text)
    tokens = word_tokenize(text)
    # frozenset membership is O(1); the original scanned a list per token.
    return " ".join(
        _LEMMATIZER.lemmatize(token)
        for token in tokens
        if token not in _ENGLISH_STOPWORDS
    )

# Store the cleaned form of each message in a new "cleaned_message" column.
df["cleaned_message"] = df["message"].apply(preprocess_text)

In [None]:
# Print each column's dtype and non-null count for the final frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11760 entries, 0 to 11759
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   conversation_id   11760 non-null  object 
 1   article_url       11760 non-null  object 
 2   config            11760 non-null  int64  
 3   message           11760 non-null  object 
 4   agent             11760 non-null  int64  
 5   sentiment         6218 non-null   float64
 6   knowledge_source  11760 non-null  object 
 7   turn_rating       11648 non-null  float64
 8   agent_1_rating    11739 non-null  float64
 9   agent_2_rating    11760 non-null  int64  
 10  cleaned_message   11760 non-null  object 
dtypes: float64(3), int64(3), object(5)
memory usage: 1010.8+ KB
