In [7]:
import nltk
import streamlit as st
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity      
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# import spacy
# Shared WordNet lemmatizer instance; preprocess_text uses it to reduce words to their base form.
lemmatizer = nltk.stem.WordNetLemmatizer()
# One-time setup: uncomment and run these on a fresh environment if the NLTK
# resources are missing.
# NOTE(review): 'stopwords' is not referenced by any cell visible here -- confirm it is still needed.
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

In [41]:
# Load the raw dialogue transcript; column 0 becomes the speaker label and
# column 1 the utterance.
df = pd.read_csv(
    'Samsung Dialog.txt',
    header=None,
    sep=':',
)
df.head(10)

# pd.read_csv reads a delimited text file into a DataFrame (the delimiter is a comma by default).
# sep=':' sets the delimiter to a colon, because each line of the transcript separates the speaker from the utterance with ':'.
# header=None tells pandas that the file has no header row, so the columns are labelled 0 and 1.

Unnamed: 0,0,1
0,Customer,"Hi, I'm looking to buy a new phone, and I'm i..."
1,Sales Agent,"Great, we have a wide range of Samsung phones..."
2,Customer,"Well, I want a phone with a good camera, long..."
3,Sales Agent,Absolutely. We have a lot of great options th...
4,Customer,"No, I haven't. Tell me more about it."
5,Sales Agent,The Galaxy S21 Ultra has a 108-megapixel came...
6,Customer,That sounds great. How much does it cost?
7,Sales Agent,"The Galaxy S21 Ultra starts at $1,199, but we..."
8,Customer,"Okay, I'm interested. But I have a few more q..."
9,Sales Agent,The Galaxy S21 Ultra comes with a standard on...


In [46]:
# Split the transcript by speaker: column 0 holds the speaker label, so a boolean
# mask on it selects the customer rows and the sales-agent rows respectively.
cust = df[df[0] == 'Customer']
sales = df[df[0] == 'Sales Agent']

In [47]:
# Keep only the utterance column of each speaker frame, named after the speaker,
# and renumber the rows from 0 so that question i lines up with reply i.
# rename() is used without inplace=True: `cust` and `sales` are filtered slices of
# `df`, and mutating a slice in place is the chained-assignment pattern pandas warns
# about. Building new frames gives the same result without touching the slice.
cust = cust.rename(columns={1: 'Customer'})[['Customer']].reset_index(drop=True)
sales = sales.rename(columns={1: 'Sales Agent'})[['Sales Agent']].reset_index(drop=True)


In [48]:
# Place the two single-column frames side by side: row i pairs the i-th customer
# utterance with the i-th sales-agent utterance.
datax = pd.concat((cust, sales), axis='columns')

In [71]:
# Text preprocessing: sentence-split, tokenize, lowercase and lemmatize
def preprocess_text(text):
    """Normalize raw text into one string of lowercased, lemmatized words.

    The text is split into sentences and each sentence into word tokens. Tokens
    that are not purely alphanumeric (punctuation, contraction fragments such as
    "'m") are dropped; the rest are lowercased and lemmatized with the shared
    WordNet ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw utterance to normalize.

    Returns
    -------
    str
        The surviving tokens of each sentence joined by single spaces, with the
        sentences themselves joined by a single space.
    """
    # No `global` here: the token list is purely local working state and must not
    # leak into (or be read back from) the notebook namespace.
    processed_sentences = []
    for sentence in nltk.sent_tokenize(text):
        lemmas = [
            lemmatizer.lemmatize(word.lower())
            for word in nltk.word_tokenize(sentence)
            if word.isalnum()
        ]
        processed_sentences.append(' '.join(lemmas))

    return ' '.join(processed_sentences)


# Add a normalized copy of every customer utterance; these strings are what the
# vectorizer is later fitted on.
datax['tokenized Questions'] = [preprocess_text(question) for question in datax['Customer']]
datax.head(10)

Unnamed: 0,Customer,Sales Agent,tokenized Questions
0,"Hi, I'm looking to buy a new phone, and I'm i...","Great, we have a wide range of Samsung phones...",hi i looking to buy a new phone and i interest...
1,"Well, I want a phone with a good camera, long...",Absolutely. We have a lot of great options th...,well i want a phone with a good camera long ba...
2,"No, I haven't. Tell me more about it.",The Galaxy S21 Ultra has a 108-megapixel came...,no i have tell me more about it
3,That sounds great. How much does it cost?,"The Galaxy S21 Ultra starts at $1,199, but we...",that sound great how much doe it cost
4,"Okay, I'm interested. But I have a few more q...",The Galaxy S21 Ultra comes with a standard on...,okay i interested but i have a few more questi...
5,That's good to know. And what about the opera...,"Yes, the Galaxy S21 Ultra runs on Android 11,...",that good to know and what about the operating...
6,"Okay, that's good. But I'm also interested in...",Absolutely. The Galaxy A52 is a great mid-ran...,okay that good but i also interested in some o...
7,That sounds like a good option for me. How mu...,"The Galaxy A52 starts at $399, but again, we ...",that sound like a good option for me how much ...
8,"Okay, I'll think about it. But can you also t...",Of course. The Galaxy Z Fold2 is a really uni...,okay i think about it but can you also tell me...
9,"That sounds really cool, but it also sounds e...","The Galaxy Z Fold2 starts at $1,999, but agai...",that sound really cool but it also sound expen...


In [52]:
# Collect the preprocessed questions into a plain Python list
corpus = list(datax['tokenized Questions'])
corpus

# `corpus` is a Python list holding the values of the 'tokenized Questions' column of the `datax` DataFrame.
# In other words, it is the collection of preprocessed customer questions that user queries are matched against.

['hi i looking to buy a new phone and i interested in samsung phone',
 'well i want a phone with a good camera long battery life and plenty of storage',
 'no i have tell me more about it',
 'that sound great how much doe it cost',
 'okay i interested but i have a few more question what kind of warranty come with the phone',
 'that good to know and what about the operating system doe it come with the latest version of android',
 'okay that good but i also interested in some of the other samsung phone can you tell me more about the galaxy a52',
 'that sound like a good option for me how much doe it cost',
 'okay i think about it but can you also tell me about the galaxy z fold2 i heard a lot about it and i curious',
 'that sound really cool but it also sound expensive how much doe it cost',
 'hmm i not sure that a lot of money for a phone',
 'okay can you tell me more about the galaxy a72',
 'that sound like a great option for me how much doe it cost',
 'okay i definitely consider it but

In [53]:
# --- Vectorisation ---
# Vectorisation converts text into numbers the model can work with. Here every
# preprocessed question in `corpus` becomes one row of a TF-IDF feature matrix.
# The vocabulary and IDF weights are learned from the corpus first, then the same
# fitted vectorizer encodes the corpus.
tfidf_vector = TfidfVectorizer().fit(corpus)
v_corpus = tfidf_vector.transform(corpus)

In [72]:
# TESTING: sample customer query used to walk through the matching pipeline step by step
input_text = "i want to buy a new phone"

In [73]:
# Preprocess the user's input with the same preprocess_text function that was
# applied to the customer questions, so both sides are normalized identically.
user_input_processed = preprocess_text(input_text) 
user_input_processed

'i want to buy a new phone'

In [74]:
# Encode the preprocessed query with the vectorizer already fitted on the corpus
# (transform only, no re-fitting) to get its TF-IDF representation.
v_input = tfidf_vector.transform([user_input_processed])
print(v_input)

# Each printed entry is (row, feature index) followed by that feature's TF-IDF weight.

  (0, 86)	0.5010482537151495
  (0, 84)	0.39242273305294606
  (0, 67)	0.3047233842852462
  (0, 58)	0.5010482537151495
  (0, 11)	0.5010482537151495


In [77]:
# Cosine similarity between the query vector and every question vector in the corpus
most_similar = cosine_similarity(v_input, v_corpus)
# most_similar.argmax()   # uncomment to get the index position of the highest score
most_similar  
# The scores are displayed in full so the highest one can be inspected by eye;
# swap to the argmax() line above to obtain its index instead.

array([[0.57939188, 0.21521396, 0.        , 0.        , 0.06086571,
        0.08912855, 0.05401137, 0.        , 0.        , 0.        ,
        0.07389345, 0.        , 0.        , 0.        , 0.14831135,
        0.        , 0.        ]])

In [78]:
# Position 0 holds the highest similarity score for the test query, so the sales-agent
# reply in row 0 is the answer the bot should give.
datax['Sales Agent'].iloc[0]

' Great, we have a wide range of Samsung phones to choose from. What features are you looking for in a phone?'

In [79]:
#-------------------------------STREAMLIT DESIGN-------------------------------



# instead of running the code one by one, it can just be done in a function

def response(user_input, fallback="Sorry, I don't have an answer for that. Could you rephrase your question?"):
    """Return the sales-agent reply paired with the most similar customer question.

    The input goes through the same preprocessing and TF-IDF encoding as the
    corpus, is compared to every corpus question by cosine similarity, and the
    reply stored alongside the best-scoring question is returned.

    Parameters
    ----------
    user_input : str
        Raw question typed by the user.
    fallback : str, optional
        Message returned when the query has no similarity to any corpus question.

    Returns
    -------
    str
        The matching 'Sales Agent' entry of ``datax``, or ``fallback`` when the
        best similarity score is zero.
    """
    query_vector = tfidf_vector.transform([preprocess_text(user_input)])
    # cosine_similarity returns a 1 x n_questions array; take its only row.
    scores = cosine_similarity(query_vector, v_corpus)[0]
    best_index = scores.argmax()

    # If no query word is in the fitted vocabulary every score is 0 and argmax()
    # would silently pick row 0, i.e. answer an unrelated question with the first
    # reply in the transcript. Return the fallback message instead.
    if scores[best_index] <= 0:
        return fallback

    return datax['Sales Agent'].iloc[best_index]
# Quick check: a paraphrase of the earlier test query should return the same first sales-agent reply
response('i want to buy a new smartphone')

' Great, we have a wide range of Samsung phones to choose from. What features are you looking for in a phone?'

In [81]:
# Canned greetings the bot picks from at random when the user says hello.
chatbot_greeting = [
    "Hello there, welcome to Emjay bot. pls enjoy your usage",
    "Hi user, This bot is created by Emjay, enjoy your usage",
    "Hi hi, How you dey my friend",
    "Alaye mi, Abeg enjoy your usage",
    "Hey Hey, pls enjoy your usage"
]

# Inputs (compared after trimming and lowercasing) treated as a greeting or as a
# request to end the session.
user_greeting = ["hi", "hello there", "hey", "hi there"]
exit_word = ['bye', 'thanks bye', 'exit', 'goodbye']


print('\t\t\t\t\tWelcome To Emjay ChatBot\n\n')
while True:
    try:
        user_q = input('Pls ask your Samsung questions: ')
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the kernel was interrupted: end the session cleanly
        # instead of surfacing a traceback.
        print('Thank you for your usage. Bye')
        break

    # Match greetings / exit words regardless of case and surrounding whitespace,
    # so "Hi" or "bye " are not mistaken for product questions.
    normalized_q = user_q.strip().lower()
    if not normalized_q:
        # Nothing typed: ask again rather than sending an empty query to the matcher.
        continue

    if normalized_q in user_greeting:
        print(random.choice(chatbot_greeting))
    elif normalized_q in exit_word:
        print('Thank you for your usage. Bye')
        break
    else:
        responses = response(user_q)
        print(f'ChatBot: {responses}')

					Welcome To Emjay ChatBot


Hi user, This bot is created by Emjay, enjoy your usage
ChatBot:  Absolutely. The Galaxy A52 is a great mid-range phone that has a lot of the same features as the Galaxy S21 Ultra, but at a more affordable price point. It has a 64-megapixel camera, a 4,500mAh battery, and up to 256GB of storage. It also runs on the latest version of Android and has a really nice, large display.
Thank you for your usage. Bye
