In [1]:
import json
from collections import Counter
import numpy as np
import pandas as pd
import regex as re
import nltk
from nltk.data import find
import gensim
import sklearn
from sympy.parsing.sympy_parser import parse_expr

In [2]:
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable across runs.
np.random.seed(0)
# Make sure the NLTK 'word2vec_sample' package is available locally;
# Text2SQLParser.__init__ looks up 'models/word2vec_sample/pruned.word2vec.txt' from it.
nltk.download('word2vec_sample')

[nltk_data] Downloading package word2vec_sample to
[nltk_data]     C:\Users\chatu\AppData\Roaming\nltk_data...
[nltk_data]   Package word2vec_sample is already up-to-date!


True

In [None]:
class Text2SQLParser:
	"""
	Basic Text2SQL parser that classifies user questions into "categories" of SQL queries.

	Two prediction paths are exposed: a weighted-keyword heuristic
	(`predict_label_using_keywords`) and an ML classifier path whose methods are
	still unimplemented placeholders in this file.
	"""

	# Keyword -> weight tables, one per category. Higher weights mark more
	# specific/stronger indicators. Overlapping entries (e.g. "group by" and
	# "group") both fire on the same text, so their weights add up.
	_KEYWORD_WEIGHTS = {
		"comparison": {
			"greater than": 2, "less than": 2, "equal to": 2, "more than": 2, "fewer than": 2,
			"between": 1.5, "compare": 1, "highest": 1.5, "lowest": 1.5, "maximum": 2,
			"minimum": 2, "largest": 1.5, "smallest": 1.5, "where": 1.5, "exceed": 1,
			"condition": 1.5, "filter": 1.5, "than": 1, "not equal": 2, "whose": 1,
		},
		"grouping": {
			"group by": 3, "grouped by": 3, "for each": 2, "per": 1.5, "group": 2,
			"average of": 2, "sum of": 2, "count of": 2, "grouped": 2,
			"categories": 1, "summarize": 1.5, "aggregate": 2, "having": 2,
			"average": 2, "sum": 2, "count": 2, "total": 1,
		},
		"ordering": {
			"order by": 3, "sort by": 3, "arrange by": 3, "rank": 2, "top": 2,
			"ascending": 2, "descending": 2, "highest to lowest": 2.5,
			"lowest to highest": 2.5, "ordered": 2, "sorted": 2, "order": 1.5,
			"sort": 1.5, "arrange": 1.5, "limit": 2, "first": 1.5,
		},
		"multi_table": {
			"join": 3, "both tables": 3, "across tables": 3, "multiple tables": 3,
			"related to": 2, "connection between": 2, "linking": 2, "relationship": 2,
			"from both": 2, "inner join": 3, "outer join": 3, "left join": 3,
			"tables": 1.5, "foreign key": 3, "two tables": 3,
		},
	}

	# Order in which categories win when their scores tie (more complex operations first).
	_TIE_BREAK_ORDER = ("multi_table", "grouping", "ordering", "comparison")

	# Category returned when no keyword matches at all.
	_DEFAULT_LABEL = "comparison"

	def __init__(self):
		"""
		Set up data locations and load the word2vec sample vectors.

		Loads the NLTK 'word2vec_sample' vectors into a gensim KeyedVectors model,
		so that package must already be downloaded.
		"""
		self.parser_files = "data/semantic-parser"
		self.word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
		self.word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(self.word2vec_sample, binary=False)

		self.train_file = "sql_train.tsv"
		self.test_file = "sql_val.tsv"

	def load_data(self):
		"""
		Load the train and test TSV files and collect the set of training labels.

		Populates `self.train_df`, `self.test_df` and `self.ls_labels` (the unique
		values of the training "Label" column, in order of first appearance).

		Parameters
		----------

		Returns
		-------
		"""
		self.train_df = pd.read_csv(f"{self.parser_files}/{self.train_file}", sep="\t")
		self.test_df = pd.read_csv(f"{self.parser_files}/{self.test_file}", sep="\t")

		self.ls_labels = list(self.train_df["Label"].unique())

	def predict_label_using_keywords(self, question):
		"""
		Predicts the label for the question using custom-defined weighted keywords.

		Each category's score is the sum of the weights of its keywords that occur
		in the lower-cased question. The highest-scoring category wins; ties are
		resolved by `_TIE_BREAK_ORDER`, and a question with no matches falls back
		to "comparison".

		Parameters
		----------
		question: str
			The question whose label is to be predicted.

		Returns
		-------
		label: str
			The predicted label.
		"""
		text = question.lower()

		# NOTE(review): matching is plain substring containment, so short keywords
		# also fire inside longer words (e.g. "count" in "country", "per" in
		# "person"). Kept as-is because the weights were chosen against this behavior.
		scores = {
			category: sum(weight for kw, weight in weights.items() if kw in text)
			for category, weights in self._KEYWORD_WEIGHTS.items()
		}

		best = max(scores.values())
		if best == 0:
			return self._DEFAULT_LABEL

		# Scanning in priority order returns the unique winner when there is one
		# and applies the tie-break when several categories share the top score.
		for category in self._TIE_BREAK_ORDER:
			if scores[category] == best:
				return category

		# Only reachable if a category is missing from _TIE_BREAK_ORDER.
		return max(scores, key=scores.get)

	@staticmethod
	def _extract_question(raw_question):
		"""Return the bare question text from a raw "prefix: question | extra" string."""
		pieces = raw_question.split(":")
		# The question sits between the first and second colon; when there is no
		# colon-delimited prefix at all, use the whole string instead of raising IndexError.
		segment = pieces[1] if len(pieces) > 1 else pieces[0]
		return segment.split("|")[0].strip()

	def evaluate_accuracy(self, prediction_function_name):
		"""
		Gives label wise accuracy of your model on `self.test_df`.

		Requires `load_data` to have been called (reads `self.test_df` and
		`self.ls_labels`).

		Parameters
		----------
		prediction_function_name: Callable
			The function used for predicting labels. It is called with the
			extracted question text and must return a label.

		Returns
		-------
		accs: dict
			The accuracy (in percent) of predicting each label in `self.ls_labels`.
			A label with no examples in the test set is reported as 0.0.
		main_acc: float
			The overall accuracy in percent (0.0 for an empty test set).
		"""
		correct = Counter()
		total = Counter()

		# Iterate the columns directly so the result does not depend on the
		# DataFrame having a default 0..n-1 index.
		for raw_question, gold_label in zip(self.test_df["Question"], self.test_df["Label"]):
			q = self._extract_question(raw_question)
			if prediction_function_name(q) == gold_label:
				correct[gold_label] += 1
			total[gold_label] += 1

		accs = {}
		for label in self.ls_labels:
			# Labels come from the training set and may be absent from the test set.
			seen = total[label]
			accs[label] = (correct[label] / seen) * 100 if seen else 0.0

		n_examples = sum(total.values())
		n_correct = sum(correct.values())
		main_acc = 100 * n_correct / n_examples if n_examples else 0.0
		return accs, main_acc

	def get_sentence_representation(self, sentence):
		"""
		Gives the average word2vec representation of a sentence.

		NOTE: not implemented yet -- currently returns a 300-dimensional zero
		vector regardless of the input.

		Parameters
		----------
		sentence: str
			The sentence whose representation is to be returned.

		Returns
		-------
		sentence_vector: np.ndarray
			The representation of the sentence.
		"""
		# Fill in your code here
		sentence_vector = np.zeros(300)

		return sentence_vector

	def init_ml_classifier(self):
		"""
		Initializes the ML classifier.

		NOTE: not implemented yet -- currently sets `self.classifier` to None.

		Parameters
		----------

		Returns
		-------
		"""
		# Fill in your code here
		self.classifier = None

	def train_label_ml_classifier(self):
		"""
		Train the classifier.

		NOTE: not implemented yet -- currently does nothing.

		Parameters
		----------

		Returns
		-------
		"""
		# Fill in your code here
		pass

	def predict_label_using_ml_classifier(self, question):
		"""
		Predicts the label of the question using the classifier.

		NOTE: not implemented yet -- currently returns an empty string for every
		question.

		Parameters
		----------
		question: str
			The question whose label is to be predicted.

		Returns
		-------
		predicted_label: str
			The predicted label.
		"""
		# Fill in your code here
		predicted_label = ""

		return predicted_label
