## Importing Libraries

In [1]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Preprocessing Text

In [2]:
# Sample paragraph (the classic "Lorem Ipsum" filler text) used in the cells
# below to try out the two preprocessing functions.
t = """Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, 
when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into 
electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, 
and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.
"""

In [3]:
def preprocess_without_stopwords(text):
    """Normalize ``text`` to a string of stemmed tokens with English stopwords removed.

    Processing order:
      1. delete every character that is neither a word character nor whitespace,
      2. lowercase and tokenize with ``nltk.word_tokenize``,
      3. drop tokens that appear in NLTK's English stopword list,
      4. lemmatize each token with ``WordNetLemmatizer``,
      5. stem each lemma with ``PorterStemmer``.

    Args:
        text: Raw input string.

    Returns:
        The processed tokens joined by single spaces.

    Note:
        Relies on NLTK's tokenizer, stopword and WordNet data being installed
        locally (nothing in this notebook downloads them).
    """
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    # Build the stopword collection once per call instead of once per token,
    # and keep it in a set so each membership test is constant-time.
    english_stopwords = set(stopwords.words("english"))
    text = re.sub(r"[^\w\s]", "", text)
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in english_stopwords]
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens]
    stemmed = [stemmer.stem(token) for token in lemmatized]
    return " ".join(stemmed)

In [4]:
# Run the stopword-removing pipeline on the sample paragraph to inspect its output.
preprocess_without_stopwords(t)

'lorem ipsum simpli dummi text print typeset industri lorem ipsum industri standard dummi text ever sinc 1500 unknown printer took galley type scrambl make type specimen book surviv five centuri also leap electron typeset remain essenti unchang popularis 1960 releas letraset sheet contain lorem ipsum passag recent desktop publish softwar like aldu pagemak includ version lorem ipsum'

In [5]:
def preprocess_with_stopwords(text):
    """Return ``text`` as space-separated stemmed tokens, stopwords included.

    Punctuation (anything that is not a word character or whitespace) is
    removed, the remainder is lowercased and split with
    ``nltk.word_tokenize``, and every token is lemmatized with WordNet and
    then reduced with the Porter stemmer. No token is filtered out.

    Args:
        text: Raw input string.

    Returns:
        A single string of the processed tokens joined by spaces.
    """
    wordnet = WordNetLemmatizer()
    porter = PorterStemmer()
    without_punctuation = re.sub(r"[^\w\s]", "", text)
    words = nltk.word_tokenize(without_punctuation.lower())
    # Lemmatize first, then stem the lemma, one token at a time.
    return " ".join(porter.stem(wordnet.lemmatize(word)) for word in words)

In [6]:
# Run the stopword-keeping pipeline on the same sample paragraph for comparison.
preprocess_with_stopwords(t)

'lorem ipsum is simpli dummi text of the print and typeset industri lorem ipsum ha been the industri standard dummi text ever sinc the 1500 when an unknown printer took a galley of type and scrambl it to make a type specimen book it ha surviv not onli five centuri but also the leap into electron typeset remain essenti unchang it wa popularis in the 1960 with the releas of letraset sheet contain lorem ipsum passag and more recent with desktop publish softwar like aldu pagemak includ version of lorem ipsum'

## Loading Data for the Chatbot

In [7]:
# Load the table the chatbot answers from; later cells read its
# "Questions" and "Answers" columns. The path is relative to the working directory.
data = pd.read_csv("data.csv")
# Bare last expression so the notebook displays the loaded DataFrame.
data

Unnamed: 0,Questions,Answers
0,Education Qualification Marks Results study,"""Class 10 From Chandrakant Patil English mediu..."
1,Skills,"""Programming Languages : Python | Database Too..."
2,Contact number email mail,"""Mobile: +917899678022 | Email: akash.hiremath..."
3,GitHub Repository Projects Project,https://github.com/AkashHiremath856/
4,Blogs blog Experience,No prior experience but check out my work http...
5,Portfolio Website,https://akashhiremath856.github.io/MyPortfolio...
6,Personal Information about bio,"""Akash\nHighly motivated and detail-oriented, ..."


In [8]:
# Keyword strings to match against and the replies they map to.
questions_list = data["Questions"]
answers_list = data["Answers"]

# A custom tokenizer replaces the default regex tokenization, so token_pattern
# is set to None explicitly; leaving the default makes scikit-learn warn that
# the pattern will not be used.
vectorizer = TfidfVectorizer(tokenizer=nltk.word_tokenize, token_pattern=None)
# One TF-IDF row per question, in the same order as `questions_list`.
X = vectorizer.fit_transform([preprocess_without_stopwords(q) for q in questions_list])



In [9]:
def get_response(text, threshold=0.5):
    """Print the stored answer whose question is most similar to ``text``.

    The query is normalized with ``preprocess_with_stopwords``, transformed
    with the fitted ``vectorizer``, and compared against every row of ``X``
    using cosine similarity. The answer belonging to the highest-scoring
    question is printed, provided that score reaches ``threshold``.

    Args:
        text: The user's query string.
        threshold: Minimum cosine similarity required to answer.
            Defaults to 0.5.

    Returns:
        None. The answer is written to standard output; nothing is printed
        when no question reaches ``threshold``.
    """
    preprocessed_text = preprocess_with_stopwords(text)
    vectorized_text = vectorizer.transform([preprocessed_text])
    # cosine_similarity returns a (1, n_questions) array for a single query.
    similarities = cosine_similarity(vectorized_text, X)[0]
    # Pick the best-scoring question rather than the first one that merely
    # clears the threshold.
    best_index = int(np.argmax(similarities))
    if similarities[best_index] >= threshold:
        # Rows of X were built from data["Questions"] in order, so the same
        # position selects the matching answer.
        print(data["Answers"].iloc[best_index])


# Example query: prints the answer of the matching question, if any is similar enough.
get_response("email")

"Mobile: +917899678022 | Email: akash.hiremath25@gmail.com| LinkedIn: www.linkedin.com/in/akash-hiremath25"
