In [17]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import pickle

# Load the tokenizer and the encoder from the same pretrained checkpoint.
# Both are module-level names that get_embedding() reads when it is called.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

def get_embedding(text):
    """Encode `text` into a sentence embedding and return it as a NumPy array.

    The text is tokenized (truncated to 128 tokens), passed through the
    module-level `model` with gradients disabled, and the token vectors of
    the last hidden state are averaged over the sequence dimension.

    The average is weighted by the tokenizer's attention mask so that padding
    positions do not contribute. For a single string there is no padding, so
    this matches a plain mean over tokens; for a list of strings of different
    lengths it keeps pad tokens from skewing the shorter entries.

    Args:
        text: A string (or list of strings) accepted by the tokenizer.

    Returns:
        numpy.ndarray with one row per input text.
    """
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
        outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Broadcast the mask over the hidden dimension: 1 for real tokens, 0 for padding.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        # clamp guards against division by zero if a row had no unmasked tokens.
        embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return embeddings.cpu().numpy()

# Load the dataset
dataset_path = r'D:\YEAR 4\SEM 7\NLP\LAB\PROJECT\codes\datasets\agri.csv'
data = pd.read_csv(dataset_path)

# Generate embeddings, keyed by question text.
# Only the distinct questions are encoded: a repeated question would be
# re-encoded just to overwrite the same dictionary key, and walking the
# column directly avoids the per-row Series that iterrows() builds.
# drop_duplicates() keeps first occurrences in order, so the resulting
# dictionary has the same keys in the same order as a row-by-row pass.
unique_questions = data['question'].drop_duplicates()
question_embeddings = {}
for question in tqdm(unique_questions, total=len(unique_questions), desc="Generating Embeddings"):
    question_embeddings[question] = get_embedding(question)

# Save embeddings and questions to disk.
# The complete DataFrame is stored next to the embeddings as a 2-tuple.
save_path = r'D:\YEAR 4\SEM 7\NLP\LAB\PROJECT\codes\Saved_state\embeddings.pkl'
with open(save_path, 'wb') as f:
    pickle.dump((question_embeddings, data), f)

print("Preprocessing complete and state saved.")


Generating Embeddings: 100%|█████████████████████████████████████████████████████| 22615/22615 [15:58<00:00, 23.59it/s]

Preprocessing complete and state saved.





In [21]:
import torch
from transformers import AutoTokenizer, AutoModel
from googletrans import Translator
from sklearn.metrics.pairwise import cosine_similarity
from gtts import gTTS
import playsound
import pickle

# Load saved embeddings and dataset
# NOTE(review): pickle.load can execute arbitrary code embedded in the file.
# Only load a pickle that was produced by a trusted run of the preprocessing
# step; do not point load_path at a file obtained from elsewhere.
load_path = r'D:\YEAR 4\SEM 7\NLP\LAB\PROJECT\codes\Saved_state\embeddings.pkl'
with open(load_path, 'rb') as f:
    # The file is expected to hold a 2-tuple: (question -> embedding mapping, dataset).
    question_embeddings, data = pickle.load(f)

# Load the multilingual model for embedding generation
# (module-level names read by get_embedding() when it is called).
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Function to convert sentences to embeddings
def get_embedding(text):
    """Encode `text` into a sentence embedding and return it as a torch tensor.

    The text is tokenized (truncated to 128 tokens), passed through the
    module-level `model` with gradients disabled, and the token vectors of
    the last hidden state are averaged over the sequence dimension.

    The average is weighted by the tokenizer's attention mask so that padding
    positions do not contribute. For a single string there is no padding, so
    this matches a plain mean over tokens; for a list of strings of different
    lengths it keeps pad tokens from skewing the shorter entries.

    Args:
        text: A string (or list of strings) accepted by the tokenizer.

    Returns:
        torch.Tensor with one row per input text.
    """
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
        outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Broadcast the mask over the hidden dimension: 1 for real tokens, 0 for padding.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        # clamp guards against division by zero if a row had no unmasked tokens.
        embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return embeddings

# Shared googletrans client; translate_to_english() calls its translate() method.
translator = Translator()

def translate_to_english(text, src_lang):
    """Return `text` in English.

    When `src_lang` is 'en' the text is handed back unchanged and no
    translation request is made. For any other code the text is sent to the
    module-level `translator` with automatic source detection (`src='auto'`);
    `src_lang` itself is only used to decide whether to translate.
    """
    if src_lang == 'en':
        return text
    translated = translator.translate(text, src='auto', dest='en')
    return translated.text

def find_closest_question(query, src_lang):
    """Find the stored question most similar to `query` and return it with its answer.

    The query is translated to English, embedded, and compared by cosine
    similarity against every entry of the module-level `question_embeddings`.
    All stored embeddings are stacked into one matrix so that a single
    cosine_similarity call scores every question, instead of one call per
    stored question.

    Args:
        query: The user's question text.
        src_lang: Language code of `query`; 'en' skips translation.

    Returns:
        (closest_question, answer): the best-matching stored question and the
        first 'answers' value of the rows in `data` whose 'question' equals it.

    Raises:
        ValueError: if `question_embeddings` is empty.
    """
    # Local import: the import list of this cell does not bring numpy into scope.
    import numpy as np

    if not question_embeddings:
        raise ValueError("No question embeddings are loaded.")

    query_eng = translate_to_english(query, src_lang)
    query_emb = get_embedding(query_eng)

    questions = list(question_embeddings)
    # Assumes each stored embedding is a 2-D (1, dim) array, as required by the
    # per-item cosine_similarity call this replaces -- TODO confirm against the pickle.
    matrix = np.vstack([question_embeddings[q] for q in questions])
    scores = cosine_similarity(query_emb, matrix)[0]

    # argmax returns the first maximum, matching max() over the dict in insertion order.
    closest_question = questions[int(scores.argmax())]
    answer = data[data['question'] == closest_question]['answers'].iloc[0]
    return closest_question, answer

def text_to_speech(text, lang):
    """Synthesize `text` with gTTS in language `lang`, save it as an MP3 and play it.

    The audio is always written to the same fixed file, which is then handed
    to playsound.

    NOTE(review): the path has no separator between 'codes' and 'speech.mp3',
    so the file is created as 'codesspeech.mp3' inside the PROJECT folder --
    confirm whether 'codes/speech.mp3' was intended.
    """
    audio_path = 'D:/YEAR 4/SEM 7/NLP/LAB/PROJECT/codesspeech.mp3'
    gTTS(text=text, lang=lang).save(audio_path)
    playsound.playsound(audio_path)

def chatbot_query():
    """Run the interactive question/answer loop until the user types 'exit'.

    Each round asks for a question and a language code, looks up the closest
    stored question, prints that question and its answer, and then speaks the
    answer using the language code the user entered. Typing 'exit' (in any
    letter case) as the question ends the loop.
    """
    question_prompt = "Enter your agricultural question (type 'exit' to quit): "

    question = input(question_prompt)
    while question.lower() != 'exit':
        src_lang = input("Enter the language code (en, ml, te, kn, hi): ")
        matched_question, matched_answer = find_closest_question(question, src_lang)
        print("Question:", matched_question)
        print("Answer:", matched_answer)
        text_to_speech(matched_answer, src_lang)
        question = input(question_prompt)

    print("Exiting the chatbot.")

# Start the interactive chatbot loop when this code runs as the main module
# (running the cell satisfies this check; importing the code does not).
if __name__ == "__main__":
    chatbot_query()


Enter your agricultural question (type 'exit' to quit):  എന്താണ് വിള ഭ്രമണം
Enter the language code (en, ml, te, kn, hi):  ml


Question: what is crop rotation
Answer: Crop rotation is the practice of growing a series of different crops in the same area over several seasons


Enter your agricultural question (type 'exit' to quit):  exit


Exiting the chatbot.


In [7]:
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def load_interaction_logs(log_path):
    """
    Read the chatbot interaction log stored as CSV at `log_path`.

    Returns the parsed contents as a pandas DataFrame.
    """
    interactions = pd.read_csv(log_path)
    return interactions

def calculate_language_accuracy(log_df):
    """
    Calculate the accuracy of the language detection.

    Compares the 'Detected Language' column against the 'Actual Language'
    column using sklearn's accuracy_score.

    Returns:
        The accuracy, or None (after printing a notice) when either of the
        two columns is absent from `log_df`.
    """
    if 'Actual Language' not in log_df.columns:
        print("Actual Language data is not available.")
        return None
    # Check the second column too: indexing it blindly raises KeyError when the
    # log has ground-truth labels but no detected-language column.
    if 'Detected Language' not in log_df.columns:
        print("Detected Language data is not available.")
        return None
    return accuracy_score(log_df['Actual Language'], log_df['Detected Language'])

def answer_relevance_metrics(log_df):
    """
    Report the mean of the logged 'Relevance Score' column.

    Returns the column mean, or None (after printing a notice) when the log
    has no 'Relevance Score' column.
    """
    if 'Relevance Score' not in log_df.columns:
        print("Relevance scores are not logged.")
        return None
    return log_df['Relevance Score'].mean()

def print_classification_report(log_df):
    """
    Print the classification report for language detection.

    The report compares 'Detected Language' against 'Actual Language'. When
    either column is missing from `log_df`, a notice is printed instead;
    checking both avoids a KeyError on logs that only carry one of them.
    """
    required_columns = ('Actual Language', 'Detected Language')
    if all(column in log_df.columns for column in required_columns):
        print(classification_report(log_df['Actual Language'], log_df['Detected Language']))
    else:
        print("Insufficient data to generate report.")

def main():
    """Load the conversation log and print the available evaluation metrics.

    Metrics are computed and reported one after another: language detection
    accuracy, average relevance score, then the classification report. A
    metric whose helper returns None is not printed here.
    """
    interactions = load_interaction_logs(
        r'D:\YEAR 4\SEM 7\NLP\LAB\PROJECT\codes\conversation_logs.csv'
    )

    # Language detection accuracy
    detection_accuracy = calculate_language_accuracy(interactions)
    if detection_accuracy is not None:
        print(f"Language Detection Accuracy: {detection_accuracy:.2f}")

    # Answer relevance
    mean_relevance = answer_relevance_metrics(interactions)
    if mean_relevance is not None:
        print(f"Average Relevance Score: {mean_relevance:.2f}")

    # Per-language precision/recall breakdown
    print_classification_report(interactions)


if __name__ == '__main__':
    main()


Actual Language data is not available.
Relevance scores are not logged.
Insufficient data to generate report.


In [3]:
import pandas as pd

def load_and_check_csv(csv_file_path):
    """Load a CSV and print a structural summary of it.

    Prints, in order: the DataFrame info, the first five rows, a
    Present/Absent line for each column the evaluation expects, and the
    number of non-null entries for the label and relevance columns when they
    exist. Any exception raised along the way is caught and reported as a
    single printed line; nothing is returned.
    """
    try:
        frame = pd.read_csv(csv_file_path)

        # Overall structure
        print("DataFrame Info:")
        frame.info()

        # Sample of the content
        print("\nFirst 5 Rows:")
        print(frame.head())

        # Which of the expected columns does the file actually have?
        present = frame.columns.tolist()
        print("\nNecessary Columns Check:")
        for name in ('Question', 'Answer', 'Detected Language', 'Actual Language', 'Relevance Score'):
            status = 'Present' if name in present else 'Absent'
            print(f"{name}: {status}")

        # How much usable data is there for each metric?
        if {'Actual Language', 'Detected Language'}.issubset(frame.columns):
            print("\nEntries with Actual Language Labels:", frame['Actual Language'].notnull().sum())
        if 'Relevance Score' in frame.columns:
            print("Entries with Relevance Scores:", frame['Relevance Score'].notnull().sum())

    except Exception as e:
        print(f"An error occurred: {e}")

# Location of the conversation log CSV to inspect (machine-specific absolute path).
csv_file_path = 'D:/YEAR 4/SEM 7/NLP/LAB/PROJECT/codes/conversation_logs.csv'
# Run the structural check; its report is printed to the cell output.
load_and_check_csv(csv_file_path)


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Question           4 non-null      object
 1   Answer             4 non-null      object
 2   Source Language    4 non-null      object
 3   Question Language  4 non-null      object
 4   Timestamp          4 non-null      object
dtypes: object(5)
memory usage: 292.0+ bytes

First 5 Rows:
                   Question  \
0     what are fertilizers?   
1  രാസവളങ്ങൾ എന്തൊക്കെയാണ്?   
2   എന്താണ് കീട നിയന്ത്രണം?   
3     What is pest control?   

                                              Answer Source Language  \
0  Fertilizers are substances that are added to s...              en   
1  വിളകളുടെ വളർച്ചയും ഗുണനിലവാരവും മെച്ചപ്പെടുത്ത...              ml   
2  ഒരു ജീവനക്കാരുടെ നിയന്ത്രണം അല്ലെങ്കിൽ മാനേജ്മ...              ml   
3  is the regulation or management of a species d..

In [8]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

def load_interaction_logs(log_path):
    """
    Parse the chatbot interaction log (a CSV file) found at `log_path`.

    Returns a pandas DataFrame with the file's contents.
    """
    log_frame = pd.read_csv(log_path)
    return log_frame

def calculate_language_accuracy(log_df):
    """
    Score detected languages against the actual ones.

    Uses sklearn's accuracy_score on the 'Actual Language' and
    'Detected Language' columns. If either column is absent, a notice is
    printed and None is returned.
    """
    available = log_df.columns
    if 'Actual Language' not in available or 'Detected Language' not in available:
        print("Actual Language data or Detected Language data is not available.")
        return None
    return accuracy_score(log_df['Actual Language'], log_df['Detected Language'])

def answer_relevance_metrics(log_df):
    """
    Average the logged relevance scores.

    Returns the mean of the 'Relevance Score' column, or None (after printing
    a notice) when that column is not present in `log_df`.
    """
    if 'Relevance Score' not in log_df.columns:
        print("Relevance scores are not logged.")
        return None
    scores = log_df['Relevance Score']
    return scores.mean()

def print_classification_report(log_df):
    """
    Print sklearn's classification report for language detection.

    The report needs both the 'Actual Language' and 'Detected Language'
    columns; if either is missing a notice is printed instead.
    """
    needed = {'Actual Language', 'Detected Language'}
    if not needed.issubset(log_df.columns):
        print("Insufficient data to generate a classification report.")
        return
    print(classification_report(log_df['Actual Language'], log_df['Detected Language']))

def main():
    """Load the conversation log and report each evaluation metric.

    For language detection accuracy and answer relevance, either the value or
    an explicit "unable to calculate" line is printed. The classification
    report is printed last.
    """
    interactions = load_interaction_logs(
        r'D:\YEAR 4\SEM 7\NLP\LAB\PROJECT\codes\conversation_logs.csv'
    )

    # Language detection accuracy
    detection_accuracy = calculate_language_accuracy(interactions)
    if detection_accuracy is None:
        print("Unable to calculate language detection accuracy due to missing data.")
    else:
        print(f"Language Detection Accuracy: {detection_accuracy:.2f}")

    # Answer relevance
    mean_relevance = answer_relevance_metrics(interactions)
    if mean_relevance is None:
        print("Unable to calculate answer relevance due to missing data.")
    else:
        print(f"Average Relevance Score: {mean_relevance:.2f}")

    # Per-language precision/recall breakdown
    print_classification_report(interactions)


if __name__ == '__main__':
    main()


Actual Language data or Detected Language data is not available.
Unable to calculate language detection accuracy due to missing data.
Relevance scores are not logged.
Unable to calculate answer relevance due to missing data.
Insufficient data to generate a classification report.


In [13]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

def load_data(csv_file_path):
    """Read the chatbot interaction log CSV at `csv_file_path` into a DataFrame."""
    log_frame = pd.read_csv(csv_file_path)
    return log_frame

def calculate_language_accuracy(log_df):
    """Return the language detection accuracy, or None if a needed column is missing.

    Accuracy is sklearn's accuracy_score of 'Detected Language' against
    'Actual Language'. Nothing is printed when the columns are absent.
    """
    for required in ('Actual Language', 'Detected Language'):
        if required not in log_df.columns:
            return None
    return accuracy_score(log_df['Actual Language'], log_df['Detected Language'])

def evaluate_answer_quality(log_df):
    """Return the mean of the 'Relevance Score' column, or None if the column is absent."""
    if 'Relevance Score' not in log_df.columns:
        return None
    relevance = log_df['Relevance Score']
    return relevance.mean()

def calculate_error_rate(log_df):
    """Calculate the error rate based on correct outputs logged.

    The error rate is 1 minus the fraction of rows whose 'Correct Output'
    value is truthy.

    Two cases are handled explicitly because a plain cast to bool miscounts
    them as correct:
      * missing values (NaN casts to True) are excluded from the rate;
      * text values such as "False", "no" or "0" (any non-empty string casts
        to True) are interpreted as incorrect.
    All other values keep their ordinary truthiness.

    Returns:
        The error rate as a float, or None when the 'Correct Output' column
        is absent or has no non-missing entries.
    """
    if 'Correct Output' not in log_df.columns:
        return None

    # Rows without a logged verdict carry no information about correctness.
    verdicts = log_df['Correct Output'].dropna()
    if verdicts.empty:
        return None

    false_tokens = {'false', 'f', 'no', 'n', '0', ''}

    def _is_correct(value):
        # Strings need parsing; bool("False") would be True.
        if isinstance(value, str):
            return value.strip().lower() not in false_tokens
        return bool(value)

    return 1 - verdicts.map(_is_correct).astype(bool).mean()

def print_detailed_report(log_df):
    """Print sklearn's classification report for language detection.

    Does nothing (and prints nothing) unless both the 'Actual Language' and
    'Detected Language' columns are present.
    """
    have_both = {'Actual Language', 'Detected Language'} <= set(log_df.columns)
    if not have_both:
        return
    report = classification_report(log_df['Actual Language'], log_df['Detected Language'])
    print(report)

def main():
    """Compute all log metrics, print a one-line summary for each, then the report.

    All three metrics are computed first. Each summary line shows the value
    to two decimals, or an "unavailable" message when the helper returned None.
    """
    log_df = load_data(r'D:\YEAR 4\SEM 7\NLP\LAB\PROJECT\codes\conversation_logs.csv')

    language_accuracy = calculate_language_accuracy(log_df)
    answer_quality = evaluate_answer_quality(log_df)
    error_rate = calculate_error_rate(log_df)

    if language_accuracy is None:
        print("Language detection data unavailable.")
    else:
        print(f"Language Detection Accuracy: {language_accuracy:.2f}")

    if answer_quality is None:
        print("Answer relevance scores unavailable.")
    else:
        print(f"Average Answer Relevance Score: {answer_quality:.2f}")

    if error_rate is None:
        print("Error data unavailable.")
    else:
        print(f"Error Rate: {error_rate:.2f}")

    print_detailed_report(log_df)


if __name__ == '__main__':
    main()


Language Detection Accuracy: 1.00
Average Answer Relevance Score: 5.00
Error Rate: 0.00
              precision    recall  f1-score   support

          en       1.00      1.00      1.00        12
          ml       1.00      1.00      1.00         6
          te       1.00      1.00      1.00        11

    accuracy                           1.00        29
   macro avg       1.00      1.00      1.00        29
weighted avg       1.00      1.00      1.00        29

