In [11]:
!pip install -q streamlit

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.0/83.0 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [12]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import streamlit as st

In [3]:
# Fetch the NLTK resources this notebook relies on; nltk skips the transfer
# for any package that is already present and up to date.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# Read the admission CSV export into a DataFrame.
admission_csv_path = '/content/admission_data - Sheet1.csv'
data = pd.read_csv(admission_csv_path)

In [5]:
# Extract the parallel query/response lists from the DataFrame.
# Rows missing either field are dropped *together* so the two lists stay
# index-aligned, and values are coerced to str because queries are tokenized
# and responses are embedded in display strings later in the notebook.
# NOTE(review): replies are read from the 'Intent' column — confirm that this
# column holds the response text rather than an intent label.
complete_rows = data.dropna(subset=['Query', 'Intent'])
admission_queries = complete_rows['Query'].astype(str).tolist()
admission_responses = complete_rows['Intent'].astype(str).tolist()


In [6]:
# Shared text-processing tools used by the preprocessing step:
#   stemmer    - Porter stemmer applied to each token
#   stop_words - English stop words, held in a set for fast membership tests
#   tokenizer  - splits text into runs of word characters (drops punctuation)
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(pattern=r'\w+')

In [7]:
def preprocess_text(text):
    """Normalize text into a space-joined string of stemmed content words.

    Steps: split ``text`` with the module-level ``tokenizer``, lowercase each
    token, discard tokens found in ``stop_words``, then stem what remains with
    the module-level ``stemmer``.

    Args:
        text: The raw string to normalize.

    Returns:
        A single string of the surviving stemmed tokens separated by spaces
        (empty if nothing survives).
    """
    lowered = [token.lower() for token in tokenizer.tokenize(text)]
    # Filter stop words BEFORE stemming: the stemmer rewrites some stop words
    # (e.g. "this" -> "thi"), and the rewritten form no longer matches the
    # stop-word set, so filtering afterwards lets them slip through.
    content_tokens = [token for token in lowered if token not in stop_words]
    return ' '.join(stemmer.stem(token) for token in content_tokens)

# Normalize every stored admission query with the same preprocessing that is
# applied to incoming user queries.
preprocessed_queries = list(map(preprocess_text, admission_queries))

In [8]:

# Fit a TF-IDF model on the preprocessed admission queries and keep the
# resulting matrix (one row per stored query) for similarity look-ups.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(raw_documents=preprocessed_queries)


In [9]:
# Function to handle user queries
def handle_admission_query(user_query, threshold=0.2):
    """Return the stored response whose query best matches ``user_query``.

    The query is preprocessed, projected into the fitted TF-IDF space and
    compared against every stored admission query by cosine similarity.

    Args:
        user_query: The raw question typed by the user.
        threshold: Minimum cosine similarity (exclusive) the best match must
            exceed to be accepted. Defaults to 0.2, the previously hard-coded
            cut-off.

    Returns:
        The entry of ``admission_responses`` paired with the most similar
        stored query, or a fixed apology message when no stored query scores
        above ``threshold`` (or when the query is empty after preprocessing).
    """
    fallback = "I'm sorry, I don't have information about that specific query."

    preprocessed_user_query = preprocess_text(user_query)
    if not preprocessed_user_query:
        # Nothing left to match on (blank input or only stop words); such a
        # query can only produce an all-zero vector, so skip the look-up.
        return fallback

    tfidf_user_query = vectorizer.transform([preprocessed_user_query])

    # One query row in, one row of scores out: take that row explicitly so the
    # argmax below is an index into the stored queries.
    scores = cosine_similarity(tfidf_user_query, tfidf_matrix)[0]

    best_index = int(scores.argmax())
    if scores[best_index] > threshold:
        return admission_responses[best_index]
    return fallback

In [13]:
# Streamlit UI: page heading.
# The captured output of this cell shows Streamlit's hint to launch the script
# with `streamlit run`; nothing is rendered when executed in the kernel.
st.title(body="Admission Chatbot")

2024-04-05 09:08:58.262 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [15]:
# Read the user's question and answer it when the button is pressed.
user_query = st.text_input("Enter your question:")
if st.button("Ask"):
    if user_query.strip():
        response = handle_admission_query(user_query)
        # f-string rather than `+` so a non-string response value (e.g. a
        # numeric or missing CSV cell) cannot raise a TypeError here.
        st.text(f"Chatbot: {response}")
    else:
        # Blank submissions are not worth matching; ask for input instead of
        # showing the "no information" reply.
        st.warning("Please enter a question first.")
