In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import random
import dill

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

True

In [2]:
df = pd.read_csv('dataset_faq_final.csv')
print(df.shape)  
print(df.head(10))  

(128, 3)
                                            question                  intent  \
0  What are the admission requirements for underg...  admission_requirements   
1                 How can I apply for a scholarship?        scholarship_info   
2  What is the application deadline for the fall ...    application_deadline   
3                        Do you offer financial aid?           financial_aid   
4                                Can I apply online?     application_process   
5             Are there any entrance exams required?           entrance_exam   
6  What documents are needed for international st...  international_students   
7                       Is there an application fee?         application_fee   
8       How do I check the status of my application?      application_status   
9                          Can I defer my admission?      admission_deferral   

                                            response  
0  The admission requirements for undergraduate p...  


In [3]:
print("\nUnique intent:")
print(df['intent'].unique())


Unique intent:
['admission_requirements' 'scholarship_info' 'application_deadline'
 'financial_aid' 'application_process' 'entrance_exam'
 'international_students' 'application_fee' 'application_status'
 'admission_deferral' 'programs_offered' 'admission_process_time'
 'campus_tours' 'housing' 'gpa_requirement' 'transfer_credits'
 'orientation' 'refund_policy' 'part_time_study' 'contact_info'
 'late_application' 'online_courses' 'language_instruction'
 'age_restrictions' 'multiple_programs' 'student_faculty_ratio'
 'visa_assistance' 'extracurricular_activities' 'scholarship_deadline'
 'transcript_submission' 'course_withdrawal' 'international_programs'
 'program_change' 'tuition_fees' 'internships' 'academic_probation'
 'language_proficiency' 'housing_application' 'student_support'
 'dress_code' 'credit_load' 'exchange_programs' 'student_id'
 'attendance_policy' 'program_withdrawal_refund' 'library_hours'
 'evening_classes' 'lab_fees' 'disability_services'
 'cross_department_courses' 

In [4]:
intent_list = df['intent'].unique().tolist()

In [5]:
nltk.download('wordnet')

def generate_keywords(intent_names):
    keywords_mapping = {}

    for intent in intent_names:
        words = intent.split('_')
        synonyms = set()

        for word in words:
            for syn in wn.synsets(word):
                for lemma in syn.lemmas():
                    lemma_name = lemma.name().replace('_', ' ')
                    if lemma_name.lower() not in words and len(lemma_name.split()) < 4:
                        synonyms.add(lemma_name.lower())

        keywords_mapping[intent] = list(set(words + list(synonyms)))



    return keywords_mapping

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sampa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
class AdmissionChatbot:
    def __init__(self, dataset_path='dataset_faq_final.csv'):
        self.df = pd.read_csv(dataset_path)
        self.responses = self.df.groupby('intent')['response'].apply(list).to_dict()
        self.stop_words = set(stopwords.words('english'))

        self.vectorizer = TfidfVectorizer()
        self.classifier = LogisticRegression(max_iter=1000)
        self.pipeline = Pipeline([
            ('tfidf', self.vectorizer),
            ('clf', self.classifier)
        ])
        self.train_model()

        # Rule-based keyword-intent mapping
        self.keywords_mapping = generate_keywords(intent_list)
        print("Generated keywords mapping:", self.keywords_mapping)

    
    def preprocess(self, sentence):
        tokens = word_tokenize(sentence.lower())
        filtered_words = [word for word in tokens if word not in self.stop_words and word.isalnum()]
        return ' '.join(filtered_words)

    def train_model(self):
        questions = self.df['question'].apply(self.preprocess)
        labels = self.df['intent']
        self.pipeline.fit(questions, labels)

    def predict_intent(self, query):
        cleaned_input = self.preprocess(query)
        predicted_intent = self.pipeline.predict([cleaned_input])[0]
        return predicted_intent

    def get_intent_by_rule(self, query):
        query_lower = query.lower()
        for intent_key, keyword_list in self.keywords_mapping.items():
            if any(keyword in query_lower for keyword in keyword_list):
                return intent_key
        return None

    def get_response(self, query):
        # try model-based prediction
        predicted_intent = self.predict_intent(query)
        if predicted_intent in self.responses:
            return random.choice(self.responses[predicted_intent])

        # If model fails, then rule-based fallback
        rule_based_intent = self.get_intent_by_rule(query)
        if rule_based_intent and rule_based_intent in self.responses:
            return random.choice(self.responses[rule_based_intent])

        # Final 
        return "I'm sorry, I don't have an answer for that."


In [7]:
# Model Export
if __name__ == "__main__":
    bot = AdmissionChatbot()    
       
with open("model.dill", "wb") as f:
    dill.dump(bot, f)



Generated keywords mapping: {'admission_requirements': ['entrance money', 'entrance fee', 'necessary', 'admission', 'necessity', 'admission price', 'entree', 'requisite', 'accession', 'demand', 'admission charge', 'admission fee', 'price of admission', 'admittance', 'access', 'requirements', 'requirement', 'prerequisite', 'essential'], 'scholarship_info': ['learning', 'scholarship', 'erudition', 'information', 'learnedness', 'encyclopaedism', 'eruditeness', 'encyclopedism', 'info'], 'application_deadline': ['deadline', 'application', 'application program', 'diligence', 'practical application', 'covering', 'coating', 'applications programme', 'lotion'], 'financial_aid': ['attention', 'tending', 'help', 'economic aid', 'assist', 'fiscal', 'assistance', 'financial', 'financial aid', 'care', 'aid'], 'application_process': ['cognitive operation', 'diligence', 'coating', 'outgrowth', 'march', 'lotion', 'unconscious process', 'procedure', 'treat', 'work on', 'applications programme', 'process