In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np
import string

In [2]:
math_definitions = [{"Term": "Matrix", 
                     "Definition": "A rectangular array of numbers arranged in rows and columns.", 
                     "Category": "Algebra","Example": "A 2x2 matrix: [[1, 2], [3, 4]]",
                     "Formula": "No specific formula; matrix operations include addition, multiplication, etc.",
                     "Difficulty Level": "Medium",
                     "Related Terms": ["Determinant", "Eigenvalue", "Vector"],
                     "Year Introduced": "1850","Field of Study": "Linear Algebra"
                     },
                    {"Term": "Derivative", 
                     "Definition": "The rate at which a function is changing at any given point.", 
                     "Category": "Calculus",
                     "Example": "The derivative of f(x) = x^2 is f'(x) = 2x.",
                     "Formula": "f'(x) = lim(h→0) [f(x+h) - f(x)] / h",
                     "Difficulty Level": "Medium",
                     "Related Terms": ["Integral", "Limit", "Differentiation"],
                     "Year Introduced": "1675",
                     "Field of Study": "Calculus"
                     },
                    {"Term": "Prime Number", 
                     "Definition": "A natural number greater than 1 that has no positive divisors other than 1 and itself.", 
                     "Category": "Number Theory",
                     "Example": "The number 7 is prime because it can only be divided by 1 and 7.",
                     "Formula": "No specific formula; prime numbers are identified through divisibility rules.",
                     "Difficulty Level": "Easy",
                     "Related Terms": ["Composite Number", "Factorization", "Divisibility"],
                     "Year Introduced": "Ancient",
                     "Field of Study": "Number Theory"
                     },
                    {"Term": "Pythagorean Theorem",
                     "Definition": "A fundamental relation in Euclidean geometry among the three sides of a right triangle.", 
                     "Category": "Geometry",
                     "Example": "In a right triangle with sides a=3, b=4, the hypotenuse c=5 (3^2 + 4^2 = 5^2).",
                     "Formula": "c^2 = a^2 + b^2",
                     "Difficulty Level": "Easy",
                     "Related Terms": ["Right Triangle", "Hypotenuse", "Trigonometry"],
                     "Year Introduced": "circa 500 BCE",
                     "Field of Study": "Geometry"
                     },
                    {"Term": "Integral",
                     "Definition": "A mathematical object that represents the area under a curve.",
                     "Category": "Calculus",
                     "Example": "The integral of f(x) = x is (1/2)x^2 + C.",
                     "Formula": "∫f(x) dx",
                     "Difficulty Level": "Hard",
                     "Related Terms": ["Derivative", "Limit", "Antiderivative"],
                     "Year Introduced": "1675",
                     "Field of Study": "Calculus"
                     },
                    {"Term": "Vector", 
                     "Definition": "A quantity that has both magnitude and direction.",
                     "Category": "Algebra",
                     "Example": "A vector v = (3, 4) has a magnitude of 5 and direction in a 2D plane.",
                     "Formula": "Magnitude: |v| = √(x^2 + y^2)",
                     "Difficulty Level": "Medium",
                     "Related Terms": ["Scalar", "Matrix", "Dot Product"],
                     "Year Introduced": "1881",
                     "Field of Study": "Linear Algebra"
                     },
                    {"Term": "Polynomial", 
                     "Definition": "An expression consisting of variables and coefficients.",
                     "Category": "Algebra",
                     "Example": "f(x) = 2x^3 - 4x + 1 is a polynomial.",
                     "Formula": "General form: a_n*x^n + a_(n-1)*x^(n-1) + ... + a_1*x + a_0",
                     "Difficulty Level": "Medium","Related Terms": ["Monomial", "Degree", "Root"],
                     "Year Introduced": "Ancient",
                     "Field of Study": "Algebra"
                     },
                    {"Term": "Limit", 
                     "Definition": "The value that a function approaches as the input approaches some value.",
                     "Category": "Calculus",
                     "Example": "lim(x→0) (sin(x)/x) = 1",
                     "Formula": "lim(x→a) f(x)",
                     "Difficulty Level": "Medium",
                     "Related Terms": ["Derivative", "Continuity", "Asymptote"],
                     "Year Introduced": 1821,
                     "Field of Study": "Calculus"},
                    {"Term": "Set", 
                    "Definition": "A collection of distinct objects considered as an object in its own right.", 
                    "Category": "Set Theory",
                    "Example": "The set of prime numbers less than 10 is {2, 3, 5, 7}.",
                    "Formula": "No specific formula; operations include union, intersection, etc.",
                    "Difficulty Level": "Easy",
                    "Related Terms": ["Subset", "Union", "Intersection"],
                    "Year Introduced": "1874",
                    "Field of Study": "Set Theory"
                    },
                    {"Term": "Factorial", 
                    "Definition": "The product of all positive integers less than or equal to a given positive integer.", 
                    "Category": "Combinatorics",
                    "Example": "5! = 5 * 4 * 3 * 2 * 1 = 120",
                    "Formula": "n! = n * (n-1) * ... * 2 * 1",
                    "Difficulty Level": "Easy",
                    "Related Terms": ["Permutation", "Combination", "Binomial Theorem"],
                    "Year Introduced": "1677",
                    "Field of Study": "Combinatorics"
                    },
                    {"Term": "Binomial Theorem", 
                    "Definition": "A formula that provides the expansion of powers of a binomial.",
                    "Category": "Algebra",
                    "Example": "(a + b)^2 = a^2 + 2ab + b^2",
                    "Formula": "(a + b)^n = Σ (n choose k) * a^(n-k) * b^k",
                    "Difficulty Level": "Medium",
                    "Related Terms": ["Factorial", "Combination", "Polynomial"],
                    "Year Introduced": "1665",
                    "Field of Study": "Algebra"
                    },
    {
        "Term": "Permutation", 
        "Definition": "An arrangement of objects in a specific order.",
        "Category": "Combinatorics",
        "Example": "The permutations of the set {1, 2} are {1, 2} and {2, 1}.",
        "Formula": "P(n, k) = n! / (n-k)!",
        "Difficulty Level": "Medium",
        "Related Terms": ["Combination", "Factorial", "Set"],
        "Year Introduced": "1800",
        "Field of Study": "Combinatorics"
    },
    {
        "Term": "Combination", 
        "Definition": "A selection of items without considering the order.",
        "Category": "Combinatorics",
        "Example": "The combinations of the set {1, 2, 3} taken 2 at a time are {1, 2}, {1, 3}, and {2, 3}.",
        "Formula": "C(n, k) = n! / [k!(n-k)!]",
        "Difficulty Level": "Medium",
        "Related Terms": ["Permutation", "Factorial", "Binomial Theorem"],
        "Year Introduced": "1600",
        "Field of Study": "Combinatorics"
    },
    {
        "Term": "Quadratic Equation", 
        "Definition": "A second-order polynomial equation in a single variable.",
        "Category": "Algebra",
        "Example": "The equation x^2 - 4x + 4 = 0 is a quadratic equation.",
        "Formula": "ax^2 + bx + c = 0",
        "Difficulty Level": "Medium",
        "Related Terms": ["Polynomial", "Discriminant", "Roots"],
        "Year Introduced": "Ancient",
        "Field of Study": "Algebra"
    },
    {
        "Term": "Asymptote", 
        "Definition": "A line that a curve approaches but never touches.",
        "Category": "Geometry",
        "Example": "The line y = 0 is a horizontal asymptote of the curve y = 1/x.",
        "Formula": "No specific formula; identified through limits.",
        "Difficulty Level": "Medium",
        "Related Terms": ["Limit", "Infinity", "Hyperbola"],
        "Year Introduced": "Ancient",
        "Field of Study": "Geometry"
    },
    {
        "Term": "Exponentiation", 
        "Definition": "The operation of raising one number to the power of another.",
        "Category": "Algebra",
        "Example": "2^3 = 2 * 2 * 2 = 8",
        "Formula": "a^b, where a is the base and b is the exponent.",
        "Difficulty Level": "Easy",
        "Related Terms": ["Logarithm", "Power", "Root"],
        "Year Introduced": "Ancient",
        "Field of Study": "Algebra"
    },
    {
        "Term": "Vector Space", 
        "Definition": "A collection of vectors that can be added together and multiplied by scalars.",
        "Category": "Linear Algebra",
        "Example": "The set of all 2D vectors forms a vector space.",
        "Formula": "No specific formula; operations include vector addition and scalar multiplication.",
        "Difficulty Level": "Hard",
        "Related Terms": ["Vector", "Linear Combination", "Basis"],
        "Year Introduced": "1888",
        "Field of Study": "Linear Algebra"
    },
    {
        "Term": "Hyperbola", 
        "Definition": "A type of smooth curve formed by intersecting a double cone.",
        "Category": "Geometry",
        "Example": "The equation x^2/a^2 - y^2/b^2 = 1 represents a hyperbola.",
        "Formula": "(x^2/a^2) - (y^2/b^2) = 1",
        "Difficulty Level": "Medium",
        "Related Terms": ["Ellipse", "Parabola", "Asymptote"],
        "Year Introduced": "Ancient",
        "Field of Study": "Geometry"
    },
    {
        "Term": "Differential Equation", 
        "Definition": "An equation involving derivatives of a function.",
        "Category": "Calculus",
        "Example": "The equation dy/dx = 3x^2 is a differential equation.",
        "Formula": "F(x, y, y') = 0, where y' is the derivative of y with respect to x.",
        "Difficulty Level": "Hard",
        "Related Terms": ["Derivative", "Integral", "Initial Value Problem"],
        "Year Introduced": "1671",
        "Field of Study": "Calculus"
    },
    {
        "Term": "Bayes' Theorem", 
        "Definition": "A formula that describes how to update the probabilities of hypotheses when given evidence.",
        "Category": "Statistics",
        "Example": "If a test for a disease is 99% accurate and 1% of people have the disease, Bayes' Theorem can calculate the probability that someone who tested positive actually has the disease.",
        "Formula": "P(A|B) = [P(B|A) * P(A)] / P(B)",
        "Difficulty Level": "Hard",
        "Related Terms": ["Probability", "Conditional Probability", "Hypothesis Testing"],
        "Year Introduced": "1763",
        "Field of Study": "Statistics"
    }
]

In [5]:
df = pd.DataFrame(math_definitions)
df

Unnamed: 0,Term,Definition,Category,Example,Formula,Difficulty Level,Related Terms,Year Introduced,Field of Study
0,Matrix,A rectangular array of numbers arranged in row...,Algebra,"A 2x2 matrix: [[1, 2], [3, 4]]",No specific formula; matrix operations include...,Medium,"[Determinant, Eigenvalue, Vector]",1850,Linear Algebra
1,Derivative,The rate at which a function is changing at an...,Calculus,The derivative of f(x) = x^2 is f'(x) = 2x.,f'(x) = lim(h→0) [f(x+h) - f(x)] / h,Medium,"[Integral, Limit, Differentiation]",1675,Calculus
2,Prime Number,A natural number greater than 1 that has no po...,Number Theory,The number 7 is prime because it can only be d...,No specific formula; prime numbers are identif...,Easy,"[Composite Number, Factorization, Divisibility]",Ancient,Number Theory
3,Pythagorean Theorem,A fundamental relation in Euclidean geometry a...,Geometry,"In a right triangle with sides a=3, b=4, the h...",c^2 = a^2 + b^2,Easy,"[Right Triangle, Hypotenuse, Trigonometry]",circa 500 BCE,Geometry
4,Integral,A mathematical object that represents the area...,Calculus,The integral of f(x) = x is (1/2)x^2 + C.,∫f(x) dx,Hard,"[Derivative, Limit, Antiderivative]",1675,Calculus
5,Vector,A quantity that has both magnitude and direction.,Algebra,"A vector v = (3, 4) has a magnitude of 5 and d...",Magnitude: |v| = √(x^2 + y^2),Medium,"[Scalar, Matrix, Dot Product]",1881,Linear Algebra
6,Polynomial,An expression consisting of variables and coef...,Algebra,f(x) = 2x^3 - 4x + 1 is a polynomial.,General form: a_n*x^n + a_(n-1)*x^(n-1) + ... ...,Medium,"[Monomial, Degree, Root]",Ancient,Algebra
7,Limit,The value that a function approaches as the in...,Calculus,lim(x→0) (sin(x)/x) = 1,lim(x→a) f(x),Medium,"[Derivative, Continuity, Asymptote]",1821,Calculus
8,Set,A collection of distinct objects considered as...,Set Theory,"The set of prime numbers less than 10 is {2, 3...","No specific formula; operations include union,...",Easy,"[Subset, Union, Intersection]",1874,Set Theory
9,Factorial,The product of all positive integers less than...,Combinatorics,5! = 5 * 4 * 3 * 2 * 1 = 120,n! = n * (n-1) * ... * 2 * 1,Easy,"[Permutation, Combination, Binomial Theorem]",1677,Combinatorics


In [6]:
sen = input('search:')

In [7]:
string.punctuation
def remove_punctuation(text):
    punctuationfree = "".join([i for i in text if i not in string.punctuation])
    return punctuationfree

In [19]:
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    output = ' '.join(filtered_words)
    return output

In [20]:
lemmatizer = WordNetLemmatizer()
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
pos_tagged = nltk.pos_tag(nltk.word_tokenize(sen.lower()))
print(pos_tagged)
wordnet_tagged = list(map(lambda x: (x[0],pos_tagger(x[1])),pos_tagged))
print(wordnet_tagged)
lemmatized_sentence = []
for word, tag in wordnet_tagged:
    if tag is None:
        lemmatized_sentence.append(word)
    else:
        lemmatized_sentence.append(lemmatizer.lemmatize(word,tag))
lemmatized_sentence = ' '.join(lemmatized_sentence)
print(lemmatized_sentence)

[('longest', 'JJS'), ('permutation', 'NN'), ('equation', 'NN'), ('sol', 'NN'), ('formula', 'NN')]
[('longest', 'a'), ('permutation', 'n'), ('equation', 'n'), ('sol', 'n'), ('formula', 'n')]
long permutation equation sol formula


In [21]:
from textblob import TextBlob
def correct_spelling(text):
    blob = TextBlob(text)
    corrected_text = blob.correct()
    return str(corrected_text)

In [22]:
sen1 = correct_spelling(lemmatized_sentence)
sen2 = remove_punctuation(sen1.lower())
sen3 = remove_stopwords(sen2)
tokenized = word_tokenize(sen3) 
tokenized

['long', 'permutations', 'equation', 'sol', 'formula']

In [23]:
def find_most_similar_column_and_rows(tokenized, df, columns):
    def get_text_similarity(text1, text2):
        words1 = word_tokenize(text1.lower())
        words2 = word_tokenize(text2.lower())
        synsets1 = [wordnet.synsets(word) for word in words1]
        synsets2 = [wordnet.synsets(word) for word in words2]
        max_similarity = 0
        for synset1_list in synsets1:
            for synset1 in synset1_list:
                for synset2_list in synsets2:
                    for synset2 in synset2_list:
                        simi = synset1.path_similarity(synset2)
                        if simi is not None and simi > max_similarity:
                            max_similarity = simi
        return max_similarity
    all_rows = [] 
    for word in tokenized:
        synset1 = wordnet.synsets(word)
        column_similarities = {col: 0 for col in columns}
        column_counts = {col: 0 for col in columns}
        column_rows = {col: [] for col in columns}  
        for col in columns:
            total_similarity = 0
            count = 0
            rows_with_high_similarity = []
            if isinstance(df[col].iloc[0], list):
                for idx, term_list in enumerate(df[col]):
                    if not isinstance(term_list, list):
                        continue
                    for x in term_list:
                        x = str(x)
                        synset2 = wordnet.synsets(x)
                        for s1 in synset1:
                            for s2 in synset2:
                                simi = s1.path_similarity(s2)
                                if simi is not None and simi > 0.5:
                                    total_similarity += simi
                                    count += 1
                                    rows_with_high_similarity.append(idx)
                                    break
            elif isinstance(df[col].iloc[0], str):
                for idx, text in enumerate(df[col]):
                    text = str(text)
                    similarity = get_text_similarity(word, text)
                    if similarity > 0.5:
                        total_similarity += similarity
                        count += 1
                        rows_with_high_similarity.append(idx)
            else:
                for idx, x in enumerate(df[col]):
                    x = str(x)
                    synset2 = wordnet.synsets(x)
                    for s1 in synset1:
                        for s2 in synset2:
                            simi = s1.path_similarity(s2)
                            if simi is not None and simi > 0.5:
                                total_similarity += simi
                                count += 1
                                rows_with_high_similarity.append(idx)
                                break

            avg_similarity = total_similarity / count if count > 0 else 0
            column_similarities[col] = avg_similarity
            column_counts[col] = count
            column_rows[col] = list(set(rows_with_high_similarity))
        best_column = max(column_similarities, key=column_similarities.get)
        best_rows = column_rows[best_column]
        rows_df = df.iloc[best_rows].reset_index(drop=True)
        all_rows.append(rows_df) 
    df1 = pd.concat(all_rows, ignore_index=True)

    return df1

In [24]:
df1 = find_most_similar_column_and_rows(tokenized, df, df.columns)
df1

Unnamed: 0,Term,Definition,Category,Example,Formula,Difficulty Level,Related Terms,Year Introduced,Field of Study
0,Permutation,An arrangement of objects in a specific order.,Combinatorics,"The permutations of the set {1, 2} are {1, 2} ...","P(n, k) = n! / (n-k)!",Medium,"[Combination, Factorial, Set]",1800,Combinatorics
1,Differential Equation,An equation involving derivatives of a function.,Calculus,The equation dy/dx = 3x^2 is a differential eq...,"F(x, y, y') = 0, where y' is the derivative of...",Hard,"[Derivative, Integral, Initial Value Problem]",1671,Calculus
2,Quadratic Equation,A second-order polynomial equation in a single...,Algebra,The equation x^2 - 4x + 4 = 0 is a quadratic e...,ax^2 + bx + c = 0,Medium,"[Polynomial, Discriminant, Roots]",Ancient,Algebra
3,Binomial Theorem,A formula that provides the expansion of power...,Algebra,(a + b)^2 = a^2 + 2ab + b^2,(a + b)^n = Σ (n choose k) * a^(n-k) * b^k,Medium,"[Factorial, Combination, Polynomial]",1665,Algebra
4,Bayes' Theorem,A formula that describes how to update the pro...,Statistics,If a test for a disease is 99% accurate and 1%...,P(A|B) = [P(B|A) * P(A)] / P(B),Hard,"[Probability, Conditional Probability, Hypothe...",1763,Statistics
5,Polynomial,An expression consisting of variables and coef...,Algebra,f(x) = 2x^3 - 4x + 1 is a polynomial.,General form: a_n*x^n + a_(n-1)*x^(n-1) + ... ...,Medium,"[Monomial, Degree, Root]",Ancient,Algebra
