In [4]:
import pandas as pd
from rapidfuzz import fuzz, process
from jiwer import wer
from rapidfuzz.distance import Levenshtein

Similarity and accuracy metrics for the first-pass transcribed sentences

In [5]:
# Compare the first-pass transcriptions with the manually corrected reference
# sentences: per-sentence metrics are written to a CSV, aggregate metrics and the
# closest mismatches are printed.

# NOTE: machine-specific absolute directory; change this one constant to run elsewhere.
DATA_DIR = r"C:\Users\AsifAK\Desktop\Code_Detect_Dementia"

# Corrected (reference) sentences and first-pass transcriptions, one CSV per class
corrected_dementia_path = rf"{DATA_DIR}\CorrectSen - dementia.csv"
corrected_non_dementia_path = rf"{DATA_DIR}\CorrectSen - NonDementia.csv"

transcribed_dementia_path = rf"{DATA_DIR}\Dementia_Transcriptions.csv"
transcribed_non_dementia_path = rf"{DATA_DIR}\NonDementia_Transcriptions.csv"

# Load datasets into DataFrames
corrected_dementia_df = pd.read_csv(corrected_dementia_path)
corrected_non_dementia_df = pd.read_csv(corrected_non_dementia_path)

transcribed_dementia_df = pd.read_csv(transcribed_dementia_path)
transcribed_non_dementia_df = pd.read_csv(transcribed_non_dementia_path)

# Stack dementia rows first, then non-dementia, so both frames share row order
corrected_df = pd.concat([corrected_dementia_df, corrected_non_dementia_df]).reset_index(drop=True)
transcribed_df = pd.concat([transcribed_dementia_df, transcribed_non_dementia_df]).reset_index(drop=True)

# Rows are paired purely by position, so only the common prefix can be compared.
# Previously a longer corrected set raised KeyError inside the loop.
n_pairs = min(len(corrected_df), len(transcribed_df))
if len(corrected_df) != len(transcribed_df):
    print("Warning: The datasets have different numbers of sentences!")
    print(f"Only the first {n_pairs} sentence pairs will be compared.")
else:
    print("Datasets are aligned for comparison.")

reference_sentences = corrected_df['sentences'].iloc[:n_pairs]
hypothesis_sentences = transcribed_df['transcribed_text'].iloc[:n_pairs]

# Compare sentences
results = []
skipped_rows = []
for index, (corrected_sentence, transcribed_sentence) in enumerate(
    zip(reference_sentences, hypothesis_sentences)
):
    # Empty CSV cells are read as float NaN, which has no .strip().
    # A blank reference is skipped too: jiwer is expected to reject empty
    # references (TODO confirm for the installed version).
    if (
        not isinstance(corrected_sentence, str)
        or not isinstance(transcribed_sentence, str)
        or not corrected_sentence.strip()
    ):
        skipped_rows.append(index)
        continue

    results.append({
        'Corrected Sentence': corrected_sentence,
        'Transcribed Sentence': transcribed_sentence,
        # 1 when the texts are identical after trimming surrounding whitespace
        'Exact Match': int(corrected_sentence.strip() == transcribed_sentence.strip()),
        'Similarity Score (%)': fuzz.ratio(corrected_sentence, transcribed_sentence),
        'Edit Distance': Levenshtein.distance(corrected_sentence, transcribed_sentence),
        'Word Error Rate (WER)': wer(corrected_sentence, transcribed_sentence),
    })

if skipped_rows:
    print(f"Skipped {len(skipped_rows)} row(s) with missing or empty text: {skipped_rows}")

# Explicit columns keep the lookups below valid even if every row was skipped
results_df = pd.DataFrame(results, columns=[
    'Corrected Sentence',
    'Transcribed Sentence',
    'Exact Match',
    'Similarity Score (%)',
    'Edit Distance',
    'Word Error Rate (WER)',
])

# Save the results (utf-8-sig writes a BOM ahead of the Bangla text)
results_path = rf"{DATA_DIR}\Comparison_Results.csv"
results_df.to_csv(results_path, index=False, encoding='utf-8-sig')
print(f"Comparison results saved to: {results_path}")

# Calculate overall metrics
total_sentences = len(results_df)
exact_match_accuracy = results_df['Exact Match'].mean() * 100
average_similarity = results_df['Similarity Score (%)'].mean()
average_wer = results_df['Word Error Rate (WER)'].mean()

# Display overall metrics
print(f"Total Sentences: {total_sentences}")
print(f"Exact Match Accuracy: {exact_match_accuracy:.2f}%")
print(f"Average Similarity Score: {average_similarity:.2f}%")
print(f"Average Word Error Rate (WER): {average_wer:.2f}")

# Show the near misses: non-identical pairs, most similar first
mismatched_df = results_df[results_df['Exact Match'] == 0].sort_values(by='Similarity Score (%)', ascending=False)
print("Top 5 Mismatched Sentences:")
print(mismatched_df.head(5))

Datasets are aligned for comparison.
Comparison results saved to: C:\Users\AsifAK\Desktop\Code_Detect_Dementia\Comparison_Results.csv
Total Sentences: 340
Exact Match Accuracy: 6.47%
Average Similarity Score: 86.74%
Average Word Error Rate (WER): 0.46
Top 5 Mismatched Sentences:
                                    Corrected Sentence  \
199  আজকে পুরোনো বন্ধুদের সাথে দেখা হয়েছে, অনেক কথ...   
338  তুমি পড়ালেখার পাশাপাশি ইন্টার্নশিপ করো, কাজে ...   
276    এইবার ছুটিতে বন্ধুদের সাথে পাহাড়ে বেড়াতে যাব।   
312    খুব ব্যস্ত দিন কাটালাম, এখন একটু বিশ্রাম দরকার।   
122       তোমার পোশাকের ধরন আমার খুব একটা ভালো লাগেনি।   

                                  Transcribed Sentence  Exact Match  \
199  আজকে পুরনো বন্ধুদের সাথে দেখা হয়েছে, অনেক কথা...            0   
338  তুমি পড়ালেখার পাশাপাশি ইন্টারনশিপ করো, কাজে আ...            0   
276   এইবার ছুটিতে বন্ধুদের সাথে পাহাড়ে বেড়াতে যাবে।            0   
312     খুব ব্যস্ত দিন কাটালাম এখন একটু বিশ্রাম দরকার।            0   
122      তোমার প

Similarity and accuracy metrics for the matched transcribed sentences

In [6]:
# Compare the final ("matched") transcriptions with the manually corrected
# reference sentences: per-sentence metrics are written to a CSV, aggregate
# metrics and the closest mismatches are printed.
#
# The cell previously ended with a pasted copy of the first-pass comparison.
# That copy reloaded the non-final transcriptions, overwrote results_df and the
# summary variables, and rewrote Comparison_Results.csv. It has been removed so
# this cell leaves only the matched results in memory.

# NOTE: machine-specific absolute directory; change this one constant to run elsewhere.
DATA_DIR = r"C:\Users\AsifAK\Desktop\Code_Detect_Dementia"

# Corrected (reference) sentences and final transcriptions, one CSV per class
corrected_dementia_path = rf"{DATA_DIR}\CorrectSen - dementia.csv"
corrected_non_dementia_path = rf"{DATA_DIR}\CorrectSen - NonDementia.csv"

transcribed_dementia_path = rf"{DATA_DIR}\Final_Dementia_Transcriptions.csv"
transcribed_non_dementia_path = rf"{DATA_DIR}\Final_NonDementia_Transcriptions.csv"

# Load datasets into DataFrames
corrected_dementia_df = pd.read_csv(corrected_dementia_path)
corrected_non_dementia_df = pd.read_csv(corrected_non_dementia_path)

transcribed_dementia_df = pd.read_csv(transcribed_dementia_path)
transcribed_non_dementia_df = pd.read_csv(transcribed_non_dementia_path)

# Stack dementia rows first, then non-dementia, so both frames share row order
corrected_df = pd.concat([corrected_dementia_df, corrected_non_dementia_df]).reset_index(drop=True)
transcribed_df = pd.concat([transcribed_dementia_df, transcribed_non_dementia_df]).reset_index(drop=True)

# Rows are paired purely by position, so only the common prefix can be compared.
# Previously a longer corrected set raised KeyError inside the loop.
n_pairs = min(len(corrected_df), len(transcribed_df))
if len(corrected_df) != len(transcribed_df):
    print("Warning: The datasets have different numbers of sentences!")
    print(f"Only the first {n_pairs} sentence pairs will be compared.")
else:
    print("Datasets are aligned for comparison.")

reference_sentences = corrected_df['sentences'].iloc[:n_pairs]
hypothesis_sentences = transcribed_df['transcribed_text'].iloc[:n_pairs]

# Compare sentences
results = []
skipped_rows = []
for index, (corrected_sentence, transcribed_sentence) in enumerate(
    zip(reference_sentences, hypothesis_sentences)
):
    # Empty CSV cells are read as float NaN, which has no .strip().
    # A blank reference is skipped too: jiwer is expected to reject empty
    # references (TODO confirm for the installed version).
    if (
        not isinstance(corrected_sentence, str)
        or not isinstance(transcribed_sentence, str)
        or not corrected_sentence.strip()
    ):
        skipped_rows.append(index)
        continue

    results.append({
        'Corrected Sentence': corrected_sentence,
        'Transcribed Sentence': transcribed_sentence,
        # 1 when the texts are identical after trimming surrounding whitespace
        'Exact Match': int(corrected_sentence.strip() == transcribed_sentence.strip()),
        'Similarity Score (%)': fuzz.ratio(corrected_sentence, transcribed_sentence),
        'Edit Distance': Levenshtein.distance(corrected_sentence, transcribed_sentence),
        'Word Error Rate (WER)': wer(corrected_sentence, transcribed_sentence),
    })

if skipped_rows:
    print(f"Skipped {len(skipped_rows)} row(s) with missing or empty text: {skipped_rows}")

# Explicit columns keep the lookups below valid even if every row was skipped
results_df = pd.DataFrame(results, columns=[
    'Corrected Sentence',
    'Transcribed Sentence',
    'Exact Match',
    'Similarity Score (%)',
    'Edit Distance',
    'Word Error Rate (WER)',
])

# Save the results (utf-8-sig writes a BOM ahead of the Bangla text)
results_path = rf"{DATA_DIR}\Matched_Comparison_Results.csv"
results_df.to_csv(results_path, index=False, encoding='utf-8-sig')
print(f"Comparison results saved to: {results_path}")

# Calculate overall metrics
total_sentences = len(results_df)
exact_match_accuracy = results_df['Exact Match'].mean() * 100
average_similarity = results_df['Similarity Score (%)'].mean()
average_wer = results_df['Word Error Rate (WER)'].mean()

# Display overall metrics
print(f"Total Sentences: {total_sentences}")
print(f"Exact Match Accuracy: {exact_match_accuracy:.2f}%")
print(f"Average Similarity Score: {average_similarity:.2f}%")
print(f"Average Word Error Rate (WER): {average_wer:.2f}")

# Show the near misses: non-identical pairs, most similar first
mismatched_df = results_df[results_df['Exact Match'] == 0].sort_values(by='Similarity Score (%)', ascending=False)
print("Top 5 Mismatched Sentences:")
print(mismatched_df.head(5))

Datasets are aligned for comparison.
Comparison results saved to: C:\Users\AsifAK\Desktop\Code_Detect_Dementia\Matched_Comparison_Results.csv
Total Sentences: 340
Exact Match Accuracy: 45.29%
Average Similarity Score: 98.10%
Average Word Error Rate (WER): 0.10
Top 5 Mismatched Sentences:
                                    Corrected Sentence  \
327  শুনলাম এ বারের বই মেলায় থিলার জনরার বেশ কিছু ব...   
323  শুক্রবার দুপুরে পরিবারের সবাই মিলে বাইরে খেতে ...   
203  বিকেলে বাসায় কিছু মেহমান আসবে, তাদের জন্য রান...   
105  সামনে পহেলা বৈশাখে আমরা বাসার সবাই পাঞ্জাবি আর...   
108  আমার বড় বোন ঈদে একটা শাড়ি চেয়েছিল, কিনতে ভু...   

                                  Transcribed Sentence  Exact Match  \
327  শুন্লাম এ বারের বই মেলায় থিলার জনরার বেশ কিছু ...            0   
323  শুক্রবার দুপুরে পরিবারে সবাই মিলে বাইরে খেতে গ...            0   
203  বিকেলে বাসায় কিছু মেহমান আসবে তাদের জন্য রান্...            0   
105  সামনে পহেলা বৈশাখে আমরা বাসার সবাই, পাঞ্জাবি আ...            0   
108  আম

In [10]:
import pandas as pd
import re
from collections import Counter

# Count how often each Bangla word occurs in the transcriptions of each class
# and save the combined table as a CSV.

# Load the dataset
# Replace 'Merged_Dataset.csv' with the actual file path of your dataset
df = pd.read_csv('Merged_Dataset.csv')

# One or more consecutive characters from the Bengali Unicode block (U+0980-U+09FF)
BANGLA_WORD_PATTERN = re.compile(r'[\u0980-\u09FF]+')


def _bangla_tokens(sentences):
    """Return every Bangla-script token found in `sentences`, in reading order."""
    return [
        token
        for sentence in sentences
        for token in BANGLA_WORD_PATTERN.findall(sentence)
    ]


# Check if the necessary columns exist
if 'transcribed_text' in df.columns and 'class' in df.columns:
    # Separate the data by class. astype(str) guards against a cell that pandas
    # parsed as a number, which re.findall would reject.
    dementia_sentences = df[df['class'] == 'Dementia']['transcribed_text'].dropna().astype(str).tolist()
    non_dementia_sentences = df[df['class'] == 'Non-Dementia']['transcribed_text'].dropna().astype(str).tolist()

    # Tokenize words for each class
    dementia_words = _bangla_tokens(dementia_sentences)
    non_dementia_words = _bangla_tokens(non_dementia_sentences)

    # Count word frequencies for each class
    dementia_word_counts = Counter(dementia_words)
    non_dementia_word_counts = Counter(non_dementia_words)

    # Create a unified DataFrame over every word seen in either class
    all_words = set(dementia_word_counts.keys()).union(set(non_dementia_word_counts.keys()))
    # Set iteration order for strings changes between kernel runs, so fix it first
    sorted_words = sorted(all_words)
    data = {
        'Word': sorted_words,
        'Dementia_Frequency': [dementia_word_counts.get(word, 0) for word in sorted_words],
        'Non_Dementia_Frequency': [non_dementia_word_counts.get(word, 0) for word in sorted_words]
    }

    # Most frequent first; 'Word' breaks frequency ties so the CSV row order is
    # identical on every run.
    word_frequencies_df = pd.DataFrame(data).sort_values(
        by=['Dementia_Frequency', 'Non_Dementia_Frequency', 'Word'],
        ascending=[False, False, True],
    )

    # Save the DataFrame to a CSV file
    output_file = 'word_frequencies_by_class.csv'
    word_frequencies_df.to_csv(output_file, index=False, encoding='utf-8-sig')

    print(f"Word frequencies by class have been saved to '{output_file}' successfully.")
else:
    print("The necessary columns 'transcribed_text' or 'class' are not found in the dataset.")


Word frequencies by class have been saved to 'word_frequencies_by_class.csv' successfully.
