#Application


In [2]:
import spacy
import pandas as pd
import re
from google.colab import drive
from tabulate import tabulate

# Mount Google Drive
drive.mount('/content/drive')

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Define file paths
csv_file_path = "/content/drive/MyDrive/comparison_results1.csv"
business_file_path = '/content/drive/MyDrive/nj_business.json'
review_file_path = '/content/drive/MyDrive/nj_reviews.json'

# Define comparison patterns
positive_patterns = [r'\bbetter than\b', r'\bmore (?:\S+ ){1,5}than\b', r'\bsuperior to\b', r'\bexceeds (?:\S+ ){0,5}in\b', r'\ba better choice than\b', r'\boutperforms\b', r'\ba more favorable option than\b']
negative_patterns = [r'\bworse than\b', r'\bnot as (?:\S+ ){0,5}as\b', r'\bless (?:\S+ ){1,5}than\b', r'\binferior to\b', r'\bfalls short of\b', r'\ba worse option than\b', r'\bnowhere near as (?:\S+ ){0,5}as\b', r'\bunderperforms\b']
prefer_pattern = [r'\bprefer (?:\S+ ){0,5}over\b']

# Function to find nearest noun in a 5-token window
def get_nearest_noun_in_window(sentence, keyword_token):
    window_start = max(0, keyword_token.i - 5)
    window_end = min(len(sentence), keyword_token.i + 6)
    window_tokens = sentence[window_start:window_end]
    subject_a, subject_b = "/", "/"
    for token in reversed(window_tokens[:keyword_token.i - window_start]):
        if token.pos_ in ["NOUN", "PROPN", "PRON"]:
            subject_a = token.text
            break
    for token in window_tokens[keyword_token.i - window_start + 1:]:
        if token.pos_ in ["NOUN", "PROPN", "PRON"]:
            subject_b = token.text
            break
    return subject_a, subject_b

# Extract comparison sentences and save to CSV
def extract_comparison_sentences():
    nj_reviews = pd.read_json(review_file_path, orient='records')
    comparison_results = []
    for _, row in nj_reviews.iterrows():
        review_text = row['text']
        business_id = row['business_id']
        review_id = row['review_id']
        matches = detect_comparison_sentences(review_text)
        for match in matches:
            comparison_results.append({
                'business_id': business_id,
                'review_id': review_id,
                'subject_a': match['subject_a'],
                'subject_b': match['subject_b'],
                'comparison_type': match['comparison_type'],
                'matched_keyword': match['matched_keyword'],
                'text': match['context']
            })
    comparison_df = pd.DataFrame(comparison_results)
    comparison_df['valid_comparison'] = (comparison_df['subject_a'] != "/") & (comparison_df['subject_b'] != "/")
    comparison_df.sort_values(by='valid_comparison', ascending=False, inplace=True)
    comparison_df.drop(columns=['valid_comparison'], inplace=True)
    comparison_df.to_csv(csv_file_path, index=False)
    print(f"Sorted results saved to {csv_file_path}")
    print("First 100 Comparison Sentence Results (sorted and formatted):")
    print(tabulate(comparison_df.head(100), headers='keys', tablefmt='grid'))

# Display basic statistics on comparison extraction results
def display_statistics():
    loaded_comparison_df = pd.read_csv(csv_file_path)
    valid_subjects_df = loaded_comparison_df[
        (loaded_comparison_df['subject_a'] != "/") &
        (loaded_comparison_df['subject_b'] != "/") &
        (loaded_comparison_df['subject_a'].str.len() > 1) &
        (loaded_comparison_df['subject_b'].str.len() > 1) &
        (~loaded_comparison_df['subject_a'].str.match(r'^[%\'\-\.]', na=False)) &
        (~loaded_comparison_df['subject_b'].str.match(r'^[%\'\-\.]', na=False))
    ]
    print("Statistical Summary of Comparison Extraction Results:")
    print("-----------------------------------------------------")
    comparison_type_counts = loaded_comparison_df['comparison_type'].value_counts()
    print("Comparison Type Counts:\n", comparison_type_counts)
    unique_keywords = loaded_comparison_df['matched_keyword'].nunique()
    print(f"\nNumber of Unique Keywords Extracted: {unique_keywords}")
    top_keywords = loaded_comparison_df['matched_keyword'].value_counts().head(10)
    print("\nTop 10 Most Frequent Comparison Keywords:")
    print(top_keywords)
    valid_comparisons = loaded_comparison_df[(loaded_comparison_df['subject_a'] != "/") & (loaded_comparison_df['subject_b'] != "/")].shape[0]
    invalid_comparisons = loaded_comparison_df[(loaded_comparison_df['subject_a'] == "/") | (loaded_comparison_df['subject_b'] == "/")].shape[0]
    print(f"\nValid Comparisons: {valid_comparisons}")
    print(f"Invalid Comparisons: {invalid_comparisons}")
    subject_summary = valid_subjects_df[['subject_a', 'subject_b']].apply(pd.Series.value_counts).fillna(0)
    subject_summary['total_count'] = subject_summary['subject_a'] + subject_summary['subject_b']
    sorted_subject_summary = subject_summary.sort_values(by='total_count', ascending=False)
    print("\nFrequency of Subjects in Comparisons (Top 10):")
    print(sorted_subject_summary[['subject_a', 'subject_b', 'total_count']].head(10))
    print("\nBasic statistics generated successfully.")

# Interactive filtering tool
def comparison_app():
    loaded_comparison_df = pd.read_csv(csv_file_path)
    def display_menu():
        print("\n" + "="*50)
        print("          🔍 Comparison Results Analysis Tool")
        print("="*50)
        print("Options:")
        print("   1️⃣  View all positive comparisons")
        print("   2️⃣  View all negative comparisons")
        print("   3️⃣  Filter by keyword (e.g., 'not as')")
        print("   4️⃣  Filter by subject (e.g., 'pizza')")
        print("   5️⃣  Display a range of rows (e.g., rows 10 to 20)")
        print("   6️⃣  Exit")
        print("="*50)

    def filter_by_comparison_type(df, comparison_type):
        filtered_df = df[df['comparison_type'] == comparison_type]
        print(tabulate(filtered_df[['subject_a', 'subject_b', 'comparison_type', 'matched_keyword', 'text']].head(10), headers='keys', tablefmt='grid'))

    def filter_by_keyword(df, keyword):
        filtered_df = df[df['matched_keyword'].str.contains(keyword, case=False, na=False)]
        print(tabulate(filtered_df[['subject_a', 'subject_b', 'comparison_type', 'matched_keyword', 'text']].head(10), headers='keys', tablefmt='grid'))

    def filter_by_subject(df, subject):
        filtered_df = df[(df['subject_a'].str.contains(subject, case=False, na=False)) | (df['subject_b'].str.contains(subject, case=False, na=False))]
        print(tabulate(filtered_df[['subject_a', 'subject_b', 'comparison_type', 'matched_keyword', 'text']].head(10), headers='keys', tablefmt='grid'))

    def display_row_range(df, start, end):
        if 0 <= start < len(df) and 0 < end <= len(df):
            print(tabulate(df.iloc[start:end][['subject_a', 'subject_b', 'comparison_type', 'matched_keyword', 'text']], headers='keys', tablefmt='grid'))
        else:
            print("The specified range is out of bounds.")

    while True:
        display_menu()
        choice = input("Select an option (1-6): ")
        if choice == '1':
            filter_by_comparison_type(loaded_comparison_df, 'positive')
        elif choice == '2':
            filter_by_comparison_type(loaded_comparison_df, 'negative')
        elif choice == '3':
            keyword = input("Enter the keyword to filter by (e.g., 'not as'): ")
            filter_by_keyword(loaded_comparison_df, keyword)
        elif choice == '4':
            subject = input("Enter the subject to filter by (e.g., 'pizza'): ")
            filter_by_subject(loaded_comparison_df, subject)
        elif choice == '5':
            try:
                start = int(input("Enter the starting row number (e.g., 10): "))
                end = int(input("Enter the ending row number (e.g., 20): "))
                display_row_range(loaded_comparison_df, start, end + 1)
            except ValueError:
                print("Please enter valid integer numbers for row range.")
        elif choice == '6':
            print("Exiting the program. Goodbye! 👋")
            break
        else:
            print("Invalid option, please try again.")

# Main function to control workflow
def main():
    while True:
        print("\nWelcome to the Comparison Analysis Tool!")
        print("Please select an option:")
        print("1️⃣ Extract comparison sentences from reviews")
        print("2️⃣ Display basic statistics on comparison extraction results")
        print("3️⃣ Try the Comparison Results Analysis Tool")
        print("4️⃣ Exit")
        choice = input("Enter the option number: ")

        if choice == '1':
            extract_comparison_sentences()
        elif choice == '2':
            display_statistics()
        elif choice == '3':
            comparison_app()
        elif choice == '4':
            print("Exiting the program. Goodbye! 👋")
            break
        else:
            print("Invalid selection, please try again.")

if __name__ == "__main__":
    main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Welcome to the Comparison Analysis Tool!
Please select an option:
1️⃣ Extract comparison sentences from reviews
2️⃣ Display basic statistics on comparison extraction results
3️⃣ Try the Comparison Results Analysis Tool
4️⃣ Exit
Enter the option number: 2
Statistical Summary of Comparison Extraction Results:
-----------------------------------------------------
Comparison Type Counts:
 comparison_type
positive    9541
negative    2201
Name: count, dtype: int64

Number of Unique Keywords Extracted: 3297

Top 10 Most Frequent Comparison Keywords:
matched_keyword
better than            4809
worse than              415
not as good as          365
more expensive than     306
Better than             243
more often than         104
superior to              96
more times than          68
more important than      62
more money than          47
Name: count, dtype: int6