In [None]:
#In terminal 
#spacy download ja_core_news_lg
#spacy download en_core_web_md


# Data.csv Description

This document provides a detailed description of the `Data.csv` file, which is part of our research on addressing concerns related to sexual health among cancer survivors. The dataset is structured as follows:

## Columns Overview

- **Q**: Questions regarding sexual health concerns generated based on epidemiological surveys among cancer survivors.
- **Bot1**: Responses generated by the GPT-3 model.
- **Bot2**: Responses from DocsBot, a GPT-based generative AI, adhering to two Clinical Guidelines.
- **Q_key_E**: Keyword category of the question. Categories include 'sexual functioning', 'sexual response', 'body image', 'intimacy', and 'others'. These were determined by querying GPT to classify the question into these categories.
- **Bot1_Pharma**: Indicates whether the response from Bot1 is related to pharmacological aspects. A value of '1' denotes a pharmaceutical-related response, while '0' indicates otherwise. This was determined by querying GPT-3.5 to classify the response.
- **Bot2_Pharma**: Similar to Bot1_Pharma, but for responses generated by Bot2.
- **Bot1_Consult**: Indicates whether the response from Bot1 is related to consultation aspects. A value of '1' denotes a consultation-related response, while '0' indicates otherwise. This was also determined by querying GPT-3.5.
- **Bot2_Consult**: Similar to Bot1_Consult, but for responses generated by Bot2.

## Bot2 were generated using advanced AI models DocsBot

-https://docsbot.ai/


In [58]:
import pandas as pd
from textblob import TextBlob
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Load spaCy's English NLP model
nlp = spacy.load('en_core_web_md')

# Get English stop words
stop_words =spacy.lang.en.stop_words.STOP_WORDS

# Function to preprocess text
def preprocess(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if token.text not in stop_words])

# Read the excel file
df = pd.read_csv('./data.csv' ,encoding='shift_jis')

# Preprocess the 'Bot1' and 'Bot2' columns
df['Bot1_processed'] = df['Bot1'].apply(preprocess)
df['Bot2_processed'] = df['Bot2'].apply(preprocess)

# Calculate similarity
df['similarity'] = df.apply(lambda row: nlp(row['Bot1_processed']).similarity(nlp(row['Bot2_processed'])), axis=1)

mean = df['similarity'].mean()
median = df['similarity'].median()
max_value = df['similarity'].max()
min_value = df['similarity'].min()
std_dev = df['similarity'].std()

print(f"Mean: {mean}")
print(f"Median: {median}")
print(f"Max: {max_value}")
print(f"Min: {min_value}")
print(f"Standard Deviation: {std_dev}")

Mean: 0.9263853625351025
Median: 0.9382914894312326
Max: 0.9849092104231264
Min: 0.7707879489963919
Standard Deviation: 0.04182552851013595


In [59]:
# Sentiment
def get_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity


df['Bot1s'] = df['Bot1'].apply(get_sentiment)
df['Bot2s'] = df['Bot2'].apply(get_sentiment)

In [60]:
import numpy as np
from scipy.stats import fisher_exact

filtered_df = df[df['Q_key_E'].isin(['Sexual Response', 'Sexual Functioning'])]
filtered_df.set_index('Q_key_E', inplace=True)
frequency = filtered_df.mean() * 100

  frequency = filtered_df.mean() * 100


In [61]:
df = filtered_df
# Calculate the occurrence rate of 1s for each column
frequency_bot1_pharma = df['Bot1_Pharma'].mean()
frequency_bot1_consult = df['Bot1_Consult'].mean()

# Calculate Odds Ratio and P-value
# Create a 2x2 contingency table
table = [
    [df['Bot1_Consult'].sum(), len(df) - df['Bot1_Consult'].sum()],[df['Bot1_Pharma'].sum(), len(df) - df['Bot1_Pharma'].sum()]
]

odds_ratio, p_value = fisher_exact(table)

# Calculate the 95% confidence interval
ci_lower = np.exp(np.log(odds_ratio) - 1.96 * np.sqrt(1/table[0][0] + 1/table[0][1] + 1/table[1][0] + 1/table[1][1]))
ci_upper = np.exp(np.log(odds_ratio) + 1.96 * np.sqrt(1/table[0][0] + 1/table[0][1] + 1/table[1][0] + 1/table[1][1]))

print(f"Occurrence rate of Bot1_Pharma: {frequency_bot1_pharma}")
print(f"Occurrence rate of Bot1_Consult: {frequency_bot1_consult}")
print(f"Odds Ratio: {odds_ratio}")
print(f"95% CI: ({ci_lower}, {ci_upper})")
print(f"P-value: {p_value}")

Occurrence rate of Bot1_Pharma: 0.05405405405405406
Occurrence rate of Bot1_Consult: 0.7297297297297297
Odds Ratio: 47.25
95% CI: (9.548556254497212, 233.81152506155055)
P-value: 1.51320909124418e-09


In [62]:
# Calculate the occurrence rate of 1s for each column
frequency_bot2_pharma = df['Bot2_Pharma'].mean()
frequency_bot2_consult = df['Bot2_Consult'].mean()

# Calculate Odds Ratio and P-value
# Create a 2x2 contingency table
table = [
    [df['Bot2_Consult'].sum(), len(df) - df['Bot2_Consult'].sum()],[df['Bot2_Pharma'].sum(), len(df) - df['Bot2_Pharma'].sum()]
]

odds_ratio, p_value = fisher_exact(table)

# Calculate the 95% confidence interval
ci_lower = np.exp(np.log(odds_ratio) - 1.96 * np.sqrt(1/table[0][0] + 1/table[0][1] + 1/table[1][0] + 1/table[1][1]))
ci_upper = np.exp(np.log(odds_ratio) + 1.96 * np.sqrt(1/table[0][0] + 1/table[0][1] + 1/table[1][0] + 1/table[1][1]))

print(f"Occurrence rate of Bot2_Pharma: {frequency_bot2_pharma}")
print(f"Occurrence rate of Bot2_Consult: {frequency_bot2_consult}")
print(f"Odds Ratio: {odds_ratio}")
print(f"95% CI: ({ci_lower}, {ci_upper})")
print(f"P-value: {p_value}")

Occurrence rate of Bot2_Pharma: 0.02702702702702703
Occurrence rate of Bot2_Consult: 0.7297297297297297
Odds Ratio: 97.2
95% CI: (11.721322385274918, 806.0387462654369)
P-value: 1.319941709649264e-10
