In [2]:
import spacy
from spacy.matcher import Matcher
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import pandas as pd

nltk.download('stopwords')
nltk.download('punkt')

nlp = spacy.load("en_core_web_sm")

# Key phrases that point at a chi-squared test, written the way they appear in
# running text.  They are converted to token patterns with the pipeline's own
# tokenizer (see _build_matcher) instead of being hand-split into tokens: a
# hand-written single-token pattern such as {"LOWER": "chi-square"} can never
# match, because spaCy's English tokenizer splits a hyphen between letters
# into separate tokens ("chi", "-", "square").
# NOTE(review): several phrases omit stop words ("chi-square degrees freedom"),
# presumably because the input is expected to be stop-word-stripped - confirm
# against the caller.
_CHI_SQUARED_PHRASES = (
    # Names and spellings of the test itself.
    "chi",
    "chi squared",
    "chi-squared",
    "chi-square",
    "chisquare",
    "chi-sq",
    "chi2",
    "chi2-square",
    "χ2",
    "χ2-square",
    "χ-square",
    "pearson",
    "pearson's",
    "pearson chi-square",
    "pearson's chi-squared",
    "pearson chi-squared statistic",
    # Single keywords associated with the test.
    "association",
    "categorical",
    "contingency",
    "crosstab",
    "discrete",
    "expected",
    "fit",
    "goodness-of-fit",
    "independence",
    "independent",
    "observed",
    "poisson",
    # Multi-word phrases.
    "association analysis",
    "association chi-square",
    "categorical analysis",
    "categorical association",
    "categorical independence",
    "categorical relationship",
    "categorical variable analysis",
    "contingency analysis",
    "contingency independence",
    "goodness of fit analysis",
    "independence analysis",
    "chi-square alternative hypothesis",
    "chi-square analysis",
    "chi-square contingency",
    "chi-square critical region",
    "chi-square critical value",
    "chi-square critical values",
    "chi-square degrees freedom",
    "chi-square distribution",
    "chi-square for observed and expected",
    "chi-square goodness-of-fit",
    "chi-square hypothesis",
    "chi-square independence",
    "chi-square null hypothesis",
    "chi-square p-value",
    "chi-square significance",
    "chi-square statistic",
    "chi-square statistical",
    "chi-squared contingency",
    "chi-squared for categorical variables",
    "chi-squared for goodness-of-fit",
    "chi-squared for independence",
    "chi-squared homogeneity",
    "chi-squared significance",
    "χ-square analysis",
    "χ-square distribution",
    "χ-square for non-parametric",
    "χ-square of independence",
    "χ2 contingency",
    "χ2 for homogeneity",
    "χ2 goodness-of-fit",
)

# Phrases naming other statistical tests; any hit disqualifies the question.
_OTHER_TEST_PHRASES = (
    "t-test",
    "anova",
    "regression analysis",
    "correlation analysis",
    "z-test",
    "f-test analysis variance",
    "wilcoxon rank-sum test",
    "mann-whitney u test",
    "kruskal-wallis test",
    "paired t-test",
    "mcnemar test",
    "logistic regression",
    "cox proportional-hazards model",
    "mantel-haenszel test",
    "kolmogorov-smirnov test",
)

# Holds the built matchers together with the vocab they were built for, so the
# patterns are compiled once instead of on every call.
_matcher_cache = {}


def _build_matcher(label, phrases):
    """Return a Matcher holding one case-insensitive token pattern per phrase."""
    matcher = Matcher(nlp.vocab)
    # Tokenize each phrase exactly as input text will be tokenized, so that
    # hyphenated and possessive terms line up token-for-token with the text.
    patterns = [
        [{"LOWER": token.lower_} for token in nlp.make_doc(phrase)]
        for phrase in phrases
    ]
    matcher.add(label, patterns)
    return matcher


def _get_matchers():
    """Return (chi-squared matcher, other-tests matcher), rebuilt only if nlp.vocab changed."""
    if _matcher_cache.get("vocab") is not nlp.vocab:
        _matcher_cache["chi_squared"] = _build_matcher(
            "ChiSquaredTest", _CHI_SQUARED_PHRASES
        )
        _matcher_cache["other_tests"] = _build_matcher(
            "OtherTests", _OTHER_TEST_PHRASES
        )
        _matcher_cache["vocab"] = nlp.vocab
    return _matcher_cache["chi_squared"], _matcher_cache["other_tests"]


def is_chi_squared_test_question(preprocessed_text):
    """Decide whether a question is about the chi-squared test and no other test.

    The text is tokenized and scanned, case-insensitively, for the phrases in
    ``_CHI_SQUARED_PHRASES`` and ``_OTHER_TEST_PHRASES``.

    Args:
        preprocessed_text: The question text to classify.

    Returns:
        True if at least one chi-squared phrase occurs and no phrase naming
        another statistical test occurs; False otherwise.
    """
    chi_squared_matcher, other_tests_matcher = _get_matchers()

    # Only the lower-cased token text is matched, so the tokenizer alone is
    # enough; this also keeps the text tokenization identical to the one used
    # to build the patterns.
    doc = nlp.make_doc(preprocessed_text)

    return bool(chi_squared_matcher(doc)) and not other_tests_matcher(doc)

# Demo: classify one sample question and report the verdict.
question = "Are gender and education level dependent at a 5% level of significance? In other words, given the data collected above, is there a relationship between the gender of an individual and the level of education that they have obtained?"

result = is_chi_squared_test_question(question)

# Pick the message for the verdict, then emit it with a single print call.
print(
    "The question is related to the chi-squared test and not any other test."
    if result
    else "The question is not related to the chi-squared test or may be related to other tests."
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


The question is not related to the chi-squared test or may be related to other tests.
