In [None]:
import spacy
from spacy.matcher import Matcher
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import pandas as pd

# Download the NLTK 'stopwords' corpus and the 'punkt' tokenizer data.
# NOTE(review): none of the code visible in this notebook appears to use these
# NLTK resources - confirm before removing the downloads.
nltk.download('stopwords')
nltk.download('punkt')

# Load spaCy's small English pipeline. The functions defined below share this
# module-level `nlp` object for tokenisation and rule-based matching.
nlp = spacy.load("en_core_web_sm")

# Phrases whose presence suggests a chi-squared question (duplicates from the
# original hand-written pattern list removed).  Each phrase is turned into a
# token pattern with the pipeline's own tokenizer, so hyphenated terms such as
# "chi-square" or "goodness-of-fit" are split exactly the way they are split in
# the text being searched.
_CHI_SQUARED_PHRASES = (
    "chi squared",
    "association",
    "chi-square",
    "chisquare",
    "observed",
    "expected",
    "poisson",
    "fits",
    "fit",
    "independent",
    "chi2",
    "χ2",
    "pearson's chi-squared",
    "pearson chi-square",
    "contingency",
    "goodness-of-fit",
    "independence",
    "categorical",
    "χ-square",
    "chi",
    "chi2-square",
    "chi-sq",
    "χ2-square",
    "χ-square analysis",
    "pearson's",
    "pearson",
    "contingency analysis",
    "goodness of fit analysis",
    "independence analysis",
    "association analysis",
    "chi-squared homogeneity",
    "categorical variable analysis",
    "chi-square p-value",
    "chi-squared significance",
    "chi-square critical region",
    "categorical relationship",
    "χ2 goodness-of-fit",
    "χ-square of independence",
    "chi-square analysis",
    "χ2 contingency",
    "discrete",
    "chi-squared",
    "chi-square statistic",
    "chi-square critical value",
    "χ-square distribution",
    "chi-squared for independence",
    "chi-squared contingency",
    "chi-square distribution",
    "categorical analysis",
    "crosstab",
    "chi-square goodness-of-fit",
    "chi-square significance",
    "chi-square critical values",
    "chi-square degrees freedom",
    "chi-square independence",
    "categorical association",
    "chi-square null hypothesis",
    "chi-square alternative hypothesis",
    "chi-square hypothesis",
    "chi-squared for goodness-of-fit",
    "chi-square contingency",
    "chi-square statistical",
    "pearson chi-squared statistic",
    "contingency independence",
    "association chi-square",
    "categorical independence",
    "χ2 for homogeneity",
    "chi-squared for categorical variables",
    "χ-square for non-parametric",
    "chi-square for observed and expected",
)

# Phrases naming other statistical tests; any hit vetoes the chi-squared flag.
_OTHER_TEST_PHRASES = (
    "t-test",
    "anova",
    "regression",
    "correlation",
    "z-test",
    "f-test analysis variance",
    "wilcoxon rank-sum",
    "mann-whitney u",
    "kruskal-wallis",
    "mcnemar",
    "logistic regression",
    "cox proportional-hazards",
    "mantel-haenszel",
    "kolmogorov-smirnov",
)

# Holds (vocab, chi_squared_matcher, other_tests_matcher) once built.
_MATCHER_CACHE = {}


def _build_phrase_matcher(label, phrases):
    """Return a Matcher whose patterns are the tokenised, lower-cased phrases."""
    matcher = Matcher(nlp.vocab)
    patterns = []
    for phrase in phrases:
        # Tokenise the phrase with the same tokenizer used on the input text so
        # that each pattern lines up with real token boundaries.
        pattern = [{"LOWER": token.lower_} for token in nlp.make_doc(phrase)]
        if pattern:
            patterns.append(pattern)
    matcher.add(label, patterns)
    return matcher


def _get_matchers():
    """Return the cached (chi-squared, other-tests) matchers, building them once."""
    cached = _MATCHER_CACHE.get("matchers")
    # Rebuild if `nlp` was reloaded (a new pipeline carries a new vocab object).
    if cached is None or cached[0] is not nlp.vocab:
        cached = (
            nlp.vocab,
            _build_phrase_matcher("ChiSquaredTest", _CHI_SQUARED_PHRASES),
            _build_phrase_matcher("OtherTests", _OTHER_TEST_PHRASES),
        )
        _MATCHER_CACHE["matchers"] = cached
    return cached[1], cached[2]


def is_chi_squared_test_question(preprocessed_text):
    """Decide whether a question looks like a chi-squared test question.

    The text is tokenised with the module-level spaCy pipeline ``nlp`` and
    searched (case-insensitively) for two keyword sets: chi-squared related
    phrases and names of other statistical tests.

    Args:
        preprocessed_text: The question text. Non-string values (for example
            NaN coming from an empty spreadsheet cell) and blank strings are
            treated as "no match".

    Returns:
        bool: True only if at least one chi-squared phrase is found AND no
        phrase naming another test is found; False otherwise.
    """
    if not isinstance(preprocessed_text, str) or not preprocessed_text.strip():
        return False

    chi_squared_matcher, other_tests_matcher = _get_matchers()

    # Only token texts are needed for LOWER-based matching, so tokenising is
    # enough; the tagger/parser/NER stages are skipped.
    doc = nlp.make_doc(preprocessed_text)

    chi_squared_matches = chi_squared_matcher(doc)
    other_tests_matches = other_tests_matcher(doc)

    # A chi-squared question must mention chi-squared vocabulary and must not
    # mention any competing test.
    return len(other_tests_matches) == 0 and len(chi_squared_matches) > 0

# Sample question about regression goodness of fit, used as a negative example
# for the chi-squared classifier.
question = "Researchers collected data on the sales of a product and various advertising expenditures. They want to assess how well a linear regression model can predict sales based on advertising spending. Using a sample of 100 observations, they fitted a regression model. What statistical measures or techniques can they use to evaluate the goodness of fit of their regression model?"

result = is_chi_squared_test_question(question)

# Report the classifier's verdict for the sample question.
print(
    "The question is related to the chi-squared test and not any other test."
    if result
    else "The question is not related to the chi-squared test or may be related to other tests."
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


The question is not related to the chi-squared test or may be related to other tests.


# **Test the classifier on the dataset**

In [None]:
# Load the question dataset from the Excel workbook.
file_path = '/content/Dataset.xlsx'
df = pd.read_excel(file_path)

# Start with every row flagged 0; rows classified as chi-squared become 1.
df['ChiSquaredFlag'] = 0

# Classify each question, store the flag and echo it.
for index, question in df['Question'].items():
    result = is_chi_squared_test_question(question)

    if result:
        df.at[index, 'ChiSquaredFlag'] = 1
    else:
        df.at[index, 'ChiSquaredFlag'] = 0

    # The printed row number is the index label offset by 2.
    flag_value = df.at[index, 'ChiSquaredFlag']
    print(f"Row {index + 2}: ChiSquaredFlag = {flag_value}")

FileNotFoundError: [Errno 2] No such file or directory: '/content/Dataset.xlsx'

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Preprocess text
def preprocess_text(text):
    """Normalise *text* and return its non-stop-word tokens.

    The text is lower-cased, every character in ``string.punctuation`` is
    deleted, the result is run through the spaCy pipeline ``nlp``, and tokens
    whose text is in spaCy's ``STOP_WORDS`` are dropped.

    Returns:
        list[str]: The remaining token texts, in their original order.
    """
    punctuation_remover = str.maketrans("", "", string.punctuation)
    cleaned = text.lower().translate(punctuation_remover)

    kept = []
    for tok in nlp(cleaned):
        if tok.text in STOP_WORDS:
            continue
        kept.append(tok.text)
    return kept

# Check if the question is related to the chi-squared test
def is_chi_squared_test_question(text):
    """Heuristically flag *text* as a chi-squared test question.

    The text is preprocessed with ``preprocess_text``, the surviving tokens are
    re-joined with spaces and parsed again with ``nlp``. The function returns
    True when any of the following holds, and False otherwise:

    * a token with dependency label ``prep`` whose text contains "percent";
    * a token with dependency label ``advcl`` whose text contains "higher";
    * the preprocessed token list contains "categorical" or "frequency".
    """
    tokens = preprocess_text(text)
    parsed = nlp(" ".join(tokens))

    # Dependency-based cues.
    dependency_hit = any(
        (tok.dep_ == "prep" and "percent" in tok.text)
        or (tok.dep_ == "advcl" and "higher" in tok.text)
        for tok in parsed
    )
    if dependency_hit:
        return True

    # Keyword cues on the preprocessed token list.
    return "categorical" in tokens or "frequency" in tokens

# Sample question: compares a local soft-drink preference distribution against
# national market-share figures.
question = "According to Beverage Digest/Maxwell Report, the distribution of market share for the top seven soft drinks in the United States was: Coca-Cola 17.9%, Pepsi-Cola 11.5%, Diet Coke 9.7%, Mountain Dew 6.3%, Diet Pepsi 6.1%, Sprite 5.7%, and Dr. Pepper 5.6%. Suppose a marketing analyst wants to determine whether this distribution fits that of her geographic region. She randomly surveys 1726 people and asks them to name their favorite soft drink. The responses are: Coca-Cola 314, Pepsi 219, Diet Coke 212,Mountain Dew 121, Diet Pepsi 98, Sprite 93, Dr.Pepper 88, and others 581. She then tests to determine whether the local distribution of soft drink preferences is the same or different from the national figures, using a = .05. What does she find?"

# Run the heuristic classifier on the sample question and report the outcome.
result = is_chi_squared_test_question(question)

print(
    "The question is related to the chi-squared test."
    if result
    else "The question is not related to the chi-squared test."
)


The question is not related to the chi-squared test.


In [None]:
import re

def detect_chi_square_test_type(text):
    """Classify *text* as describing a goodness-of-fit or independence test.

    Each test type has a list of regular expressions that are searched
    case-insensitively in *text*. Goodness-of-fit patterns are checked first,
    so they win when both kinds of cue are present.

    Returns:
        'Goodness-of-Fit Test', 'Test of Independence', or None when no
        pattern matches.
    """
    # (label, patterns) pairs in priority order.
    signatures = (
        (
            'Goodness-of-Fit Test',
            (
                r'chi-square goodness-of-fit test',
                r'observed frequencies',
                r'expected frequencies',
            ),
        ),
        (
            'Test of Independence',
            (
                r'chi-square test of independence',
                r'contingency table',
                r'independence',
                r'association',
            ),
        ),
    )

    for label, patterns in signatures:
        if any(re.search(p, text, re.IGNORECASE) for p in patterns):
            return label

    # No clear indication either way.
    return None

# Example usage: one text describing a goodness-of-fit test and one describing
# a test of independence.
text_goodness_of_fit = "Use a chi-square goodness-of-fit test to determine whether the observed frequencies are distributed the same as the expected frequencies. 53,37,32,38 68,42,33,22 4 'Id - 1,2,3,4,5,6 fo - 53,37,32,28,18,15 fe - 68,42,33,22,10,8' g-o-f"
text_independence = "Conduct a chi-square test of independence to assess whether there is a significant association between variable X and variable Y."

# Detect chi-square test type for each example
test_type_goodness_of_fit = detect_chi_square_test_type(text_goodness_of_fit)
test_type_independence = detect_chi_square_test_type(text_independence)

# Display results. The second line must report the independence result; it
# previously re-printed the goodness-of-fit result by mistake.
print("Test Type - Goodness of Fit:", test_type_goodness_of_fit)
print("Test Type - Independence:", test_type_independence)
