In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
import pandas as pd
import spacy

In [5]:
# Read the labelled question dataset from the Excel workbook into a DataFrame
dataset_path = 'TrainData/QuestionClassificationDataset.xlsx'
train_data = pd.read_excel(dataset_path)

In [6]:
# Text preprocessing function
_NLP_CACHE = {}


def _get_nlp(model_name='en_core_web_sm'):
    """Return the spaCy pipeline for ``model_name``, loading it on first use only."""
    if model_name not in _NLP_CACHE:
        _NLP_CACHE[model_name] = spacy.load(model_name)
    return _NLP_CACHE[model_name]


def preprocess_text(text):
    """Normalize a question string and reduce every token to its lemma.

    The text is lower-cased, every '?' is removed, and the result is run
    through the spaCy ``en_core_web_sm`` pipeline; the lemmas of all tokens
    are then joined with single spaces.

    Args:
        text: The raw question; must provide ``str.lower``.

    Returns:
        The space-joined lemmas of the normalized text.
    """
    # Normalize text
    processed_text = text.lower().replace('?', '')

    # lemmatization -- reuse the cached pipeline instead of calling
    # spacy.load() on every invocation: the load is repeated work whose
    # result does not depend on the input text.
    doc = _get_nlp()(processed_text)
    return ' '.join(token.lemma_ for token in doc)

# Store a normalized, lemmatized copy of every question in a new column
raw_questions = train_data['Question']
train_data['processed_question'] = raw_questions.apply(preprocess_text)

In [7]:
# Serialize the processed frame first, then persist the JSON text to disk
serialized_frame = train_data.to_json()
with open('train_data_processed.json', 'w') as json_file:
    json_file.write(serialized_frame)

In [8]:
# Preprocessed question text and the labels to predict
questions = train_data['processed_question']
y = train_data['Type']

# Split data BEFORE fitting the vectorizer so that no test-set vocabulary or
# document-frequency statistics leak into the features. The split is
# stratified because the classes are imbalanced, and seeded so that a rerun
# reproduces the same report.
questions_train, questions_test, y_train, y_test = train_test_split(
    questions, y, test_size=0.2, stratify=y, random_state=42
)

# Feature extraction: learn the TF-IDF vocabulary and IDF weights on the
# training split only, then apply that fitted transform to the held-out split.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(questions_train)
X_test = vectorizer.transform(questions_test)
# Whole-corpus matrix, kept so the name `X` stays available to other cells;
# it is built with the train-only vocabulary.
X = vectorizer.transform(questions)

# Hyperparameter tuning
param_grid = {'n_neighbors': [3, 5, 7], 'metric': ['cosine', 'euclidean']}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid.fit(X_train, y_train)

# Evaluation on the held-out split
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))

                    precision    recall  f1-score   support

historicalQuestion       1.00      1.00      1.00       283
     otherQuestion       1.00      1.00      1.00        52

          accuracy                           1.00       335
         macro avg       1.00      1.00      1.00       335
      weighted avg       1.00      1.00      1.00       335



In [10]:
# Persist both fitted artefacts: the tuned grid-search classifier and the
# TF-IDF vectorizer that produced its input features
import joblib

joblib.dump(value=grid, filename='question_classifier_knn.pkl')
joblib.dump(value=vectorizer, filename='vectorizer.joblib')

['vectorizer.joblib']