Main

In [None]:
!pip install transformers

In [None]:
!pip install tensorflow-text

In [None]:
!pip install xlsxwriter

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Preparation
- Create a function that creates an Excel file to hold the results of the models (it will be used later).
- Extract the relevant data (the client sentences) from the dataset and store it in a list.

In [None]:
import pandas
 
def creating_excel() -> pandas.DataFrame:
    """Create 'Results.xlsx' with an empty results sheet and return the empty frame.

    The workbook gets a single sheet, 'Sheet1', holding only the header row
    ('Sentence', 'Category', 'Nouns').

    Returns:
        An empty DataFrame with the three result columns; callers fill it row
        by row and save it again themselves.
    """
    # Set the column names. dtype=object so that strings can be written into the
    # (initially empty) columns without a dtype upcast.
    data = {'Sentence': [], 'Category': [], 'Nouns': []}
    output_excel = pandas.DataFrame(data, dtype=object)

    # The context manager closes the writer, which is what actually flushes the
    # workbook to disk; without it the file was never saved.
    with pandas.ExcelWriter('Results.xlsx', engine='xlsxwriter') as writer:
        output_excel.to_excel(writer, sheet_name='Sheet1', index=False)

    print('Excel Created')
    return output_excel

In [None]:
import pandas
from xlsxwriter import Workbook

# Load the chat log from Drive.
input_excel = pandas.read_excel('/content/drive/MyDrive/src/Chatbot dataset.xlsx') # data set

# rows num
n_rows = len(input_excel.index)

# columns num
n_cols_ = len(input_excel.columns)

# Column 2 (positional) is compared against 'client' and column 3 supplies the
# message text. Selecting by position with .iloc[:, k] avoids Series[int]
# positional fallback, which is deprecated in pandas 2.x.
is_client = input_excel.iloc[:, 2] == 'client'

# Sentences written by the client, in file order.
client_message = input_excel[is_client].iloc[:, 3].tolist()


# Unsupervised  classification
- First step: We use named entity recognition with a BERT model to get the nouns in the sentence.
- Second step: We use the smaller-LaBSE (Language-agnostic BERT Sentence Embedding) model to get the sentence embeddings.

# First step
We use named entity recognition with a BERT model to get the nouns in the sentence.

In [None]:
import torch
import numpy as np

from google.colab import drive
drive.mount('/content/drive')


# Load the model and the tokenizer from the downloaded files.
# 'map_location=torch.device('cpu')' is passed because only the CPU is used.
# NOTE(review): torch.load unpickles arbitrary objects -- only load checkpoints you trust.
model = torch.load(r"/content/drive/MyDrive/src/my_model_3.pth", map_location=torch.device('cpu'))
tokenizer = torch.load(r"/content/drive/MyDrive/src/my_tokenizer.pth", map_location=torch.device('cpu'))

# Inference only: switch off training-time behaviour (e.g. dropout) in case the
# checkpoint was pickled in training mode. Assumes the checkpoint is an nn.Module.
model.eval()

# Tag names, indexed by the class index the model predicts. Loop-invariant,
# so it is built once instead of once per sentence.
tag_values = ['DT', 'POS', 'NNS', 'VBG', 'CD', ';', 'JJS', 'NN', 'RP', '.', 'WP', 'PRP', 'CC', 'WRB', 'RBR', 'MD', 'VBZ', 'UH', 'FW', 'PDT',
              'NNP', ':', 'JJ', 'JJR', 'RRB', '$', 'VB', ',', 'VBP', 'PRP$', 'NNPS', '``', 'IN', 'EX', 'TO', 'RB', 'VBN', 'RBS', 'WDT', 'LRB', 'VBD', 'WP$', 'PAD']

all_sentence_nouns = []

# Our input
for sentence in client_message:
  tokenized_sentence = tokenizer.encode(sentence)  # list of numbers represent each word
  input_ids = torch.tensor([tokenized_sentence])  # no .cuda(): CPU only

  with torch.no_grad():
      output = model(input_ids)

  # Most likely tag index for every token position.
  label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)

  # join bpe split tokens
  tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])

  new_tokens, new_labels, nouns_from_sentence = [], [], []

  for token, label_idx in zip(tokens, label_indices[0]):
      if token.startswith("##") and new_tokens:
          # Continuation piece: glue it onto the previous word, keep that word's tag.
          new_tokens[-1] = new_tokens[-1] + token[2:]
      else:
          new_labels.append(tag_values[label_idx])
          new_tokens.append(token)

  # Keep words with a noun tag (NN, NNS, NNP, NNPS); drop special tokens and '?'.
  for token, label in zip(new_tokens, new_labels):
      if 'NN' in label and '[SEP]' not in token and '[CLS]' not in token and '?' not in token:
          nouns_from_sentence.append(token)

  all_sentence_nouns.append(nouns_from_sentence) # [['noun1','noun2',..],[]]

print(f'all_sentence_nouns = {all_sentence_nouns}')
print(f'client_message = {client_message}')


# Second step
We use the smaller-LaBSE (Language-agnostic BERT Sentence Embedding) model to get the sentence embeddings.
We have 3 options:
1. Get the vector for the *whole* sentence.
2. Get the vector for a *concatenated string of the nouns* in the sentence.
3. Get a vector *for each noun in the sentence*, sum the values obtained for each category, then compute the average over all the nouns in the sentence and return the max.

For each option we check which category the sentence belongs to (by computing the cosine similarity between the vector that represents the sentence and the vectors that represent the categories).

# The first option
Get the vector for the *whole* sentence:

In [None]:
import tensorflow as tf
import tensorflow_text  # noqa
import tensorflow_hub as hub
from xlsxwriter import Workbook

# Minimum cosine similarity for a sentence to be assigned to a category.
TRESHOLD = 0.22

# Loading models from tfhub.dev
encoder = hub.KerasLayer("https://tfhub.dev/jeongukjae/smaller_LaBSE_15lang/1")
preprocessor = hub.KerasLayer("https://tfhub.dev/jeongukjae/smaller_LaBSE_15lang_preprocess/1")

# Constructing model to encode texts into high-dimensional vectors
sentences = tf.keras.layers.Input(shape=(), dtype=tf.string, name="sentences")
encoder_inputs = preprocessor(sentences)
sentence_representation = encoder(encoder_inputs)["pooled_output"]
normalized_sentence_representation = tf.nn.l2_normalize(sentence_representation, axis=-1)  # for cosine similarity
model = tf.keras.Model(sentences, normalized_sentence_representation)

# Start Algo

index_category = {0:'Environment and climate resilience',1:'Mobility (transport)',2:'Local identity',3:'Future of work',4:'Land use'}

output_excel = creating_excel()  # create an Excel file
excel_index = 0

# The category embeddings do not depend on the sentence, so encode them once
# instead of once per loop iteration.
categories_sentences = tf.constant(["Environment and climate resilience", "Mobility (transport)", "Local identity", "Future of work", "Land use"])
categories_embeds = model(categories_sentences)

temp = 0 # for break
for sentence in client_message:
  # Encoding the message.
  messages_sentences = tf.constant([sentence])
  messages_embeds = model(messages_sentences)

  # Messages-categories similarity (dot product of L2-normalised vectors)
  result = tf.tensordot(messages_embeds, categories_embeds, axes=[[1], [1]])

  # write the sentence in the excel
  output_excel.at[excel_index, 'Sentence'] = sentence

  # Collect every category above the threshold. Assigning the cell directly
  # inside the loop overwrote earlier matches and kept only the last one.
  category = ''
  for value in result: # one row of 5 similarities per encoded message
    for i,v in enumerate(value): # for each number in the list
      if float(v) > TRESHOLD: # needs to be changed according to the result from ChatGPT
        category += index_category.get(i) + ','
  output_excel.at[excel_index, 'Category'] = category

  excel_index += 1
  # Stop after the sentence at which temp has reached 101.
  if temp > 100:
    break
  temp += 1

output_excel.to_excel("Results.xlsx", index=False)  # save the Excel file



# The second option
Get the vector for a *concatenated string of the nouns* in the sentence:

In [None]:
import tensorflow as tf
import tensorflow_text  # noqa
import tensorflow_hub as hub
from xlsxwriter import Workbook

# Minimum cosine similarity for a sentence to be assigned to a category.
TRESHOLD = 0.3

# Loading models from tfhub.dev
encoder = hub.KerasLayer("https://tfhub.dev/jeongukjae/smaller_LaBSE_15lang/1")
preprocessor = hub.KerasLayer("https://tfhub.dev/jeongukjae/smaller_LaBSE_15lang_preprocess/1")

# Constructing model to encode texts into high-dimensional vectors
sentences = tf.keras.layers.Input(shape=(), dtype=tf.string, name="sentences")
encoder_inputs = preprocessor(sentences)
sentence_representation = encoder(encoder_inputs)["pooled_output"]
normalized_sentence_representation = tf.nn.l2_normalize(sentence_representation, axis=-1)  # for cosine similarity
model = tf.keras.Model(sentences, normalized_sentence_representation)

# Start Algo
index_category = {0:'Environment and climate resilience',1:'Mobility (transport)',2:'Local identity',3:'Future of work',4:'Land use'}

output_excel = creating_excel()  # create an Excel file
excel_index = 0

# The category embeddings do not depend on the sentence, so encode them once.
categories_sentences = tf.constant(["Environment and climate resilience", "Mobility (transport)", "Local identity", "Future of work", "Land use"])
categories_embeds = model(categories_sentences)

# Own counter for the early break. Without this the cell either raised
# NameError on a fresh kernel or reused the counter left by a previous cell
# and stopped after a single row.
temp = 0
for nouns,sentence in zip(all_sentence_nouns,client_message):

  # when list of nouns is empty, continue to the next iteration
  if len(nouns) == 0:
    continue

  # creates a concatenated string of all nouns,
  # e.g. ['places', 'students', 'building'] -> 'places students building'
  conca_string = ' '.join(nouns)

  # Encoding the noun string.
  messages_sentences = tf.constant([conca_string])
  messages_embeds = model(messages_sentences)

  # Messages-categories similarity (dot product of L2-normalised vectors)
  result = tf.tensordot(messages_embeds, categories_embeds, axes=[[1], [1]])

  # write the sentence in the excel
  output_excel.at[excel_index, 'Sentence'] = sentence
  output_excel.at[excel_index, 'Nouns'] = conca_string

  # Comma-terminated list of every category above the threshold.
  category = ''
  for value in result: # one row of 5 similarities per encoded message
    for i,v in enumerate(value): # for each number in the list
      if float(v) > TRESHOLD: # needs to be changed according to the result from ChatGPT
        category += index_category.get(i) + ','
    output_excel.at[excel_index, 'Category'] = category

  excel_index += 1
  # Stop after the sentence at which temp has reached 101.
  if temp > 100:
    break
  temp += 1

output_excel.to_excel("Results.xlsx", index=False)  # save the Excel file



Excel Created


# The third option
Get a vector *for each noun in the sentence*, sum the values obtained for each category, then compute the average over all the nouns in the sentence and return the max:

**example:**

  sentence = "I think there should be many various sitting and studying    places for students, both inside and outside of the building."

  nouns = places students building

  call the model on each noun -> we get [value1,value2,...,value5]

  sum all the values for each category and then return the category with the max value.

  

smaller-LaBSE.py

In [None]:
import tensorflow as tf
import tensorflow_text  # noqa
import tensorflow_hub as hub
#from xlsxwriter import Workbook

# Minimum average cosine similarity for a sentence to be assigned to a category.
TRESHOLD = 0.22

# Loading models from tfhub.dev
encoder = hub.KerasLayer("https://tfhub.dev/jeongukjae/smaller_LaBSE_15lang/1")
preprocessor = hub.KerasLayer("https://tfhub.dev/jeongukjae/smaller_LaBSE_15lang_preprocess/1")

# Constructing model to encode texts into high-dimensional vectors
sentences = tf.keras.layers.Input(shape=(), dtype=tf.string, name="sentences")
encoder_inputs = preprocessor(sentences)
sentence_representation = encoder(encoder_inputs)["pooled_output"]
normalized_sentence_representation = tf.nn.l2_normalize(sentence_representation, axis=-1)  # for cosine similarity
model = tf.keras.Model(sentences, normalized_sentence_representation)

# Start Algo
index_category = {0:'Environment and climate resilience',1:'Mobility (transport)',2:'Local identity',3:'Future of work',4:'Land use'}

output_excel = creating_excel()  # create an Excel file
excel_index = 0

# The category embeddings do not depend on the noun, so encode them once.
categories_sentences = tf.constant(["Environment and climate resilience", "Mobility (transport)", "Local identity", "Future of work", "Land use"])
categories_embeds = model(categories_sentences)

temp = 0 # for break
for nouns,sentence in zip(all_sentence_nouns,client_message):

  # when list of nouns is empty
  if len(nouns) == 0:
    continue

  # creates a concatenated string of all nouns (only written to the 'Nouns' column)
  conca_string = ' '.join(nouns)

  # Per-category similarity totals for THIS sentence. The accumulator used to be
  # created once before the loop, so totals leaked from one sentence into the next.
  sum_result_column = [0.0 for _ in range(len(index_category))]

  for noun in nouns:
    # Encoding a single noun.
    messages_sentences = tf.constant([noun])
    messages_embeds = model(messages_sentences)

    # Noun-categories similarity (dot product of L2-normalised vectors)
    result = tf.tensordot(messages_embeds, categories_embeds, axes=[[1], [1]])

    for value in result: # one row of 5 similarities per encoded noun
      for i,v in enumerate(value): # for each number in the list
        sum_result_column[i] += float(v)

  # write the sentence in the excel
  output_excel.at[excel_index, 'Sentence'] = sentence
  output_excel.at[excel_index, 'Nouns'] = conca_string

  # Compare the AVERAGE similarity over the nouns with the threshold, as the
  # description of this option states; a raw sum grows with the noun count.
  category = ''
  for i,value in enumerate(sum_result_column):
    if value / len(nouns) > TRESHOLD:
      category += index_category.get(i) + ','
  output_excel.at[excel_index, 'Category'] = category

  print(f'temp = {temp}')
  excel_index += 1
  # Stop after the sentence at which temp has reached 61.
  if temp > 60:
    break
  temp += 1

output_excel.to_excel("Results.xlsx", index=False)  # save the Excel file

# Supervised classification
- Use the ChatGPT API to classify the sentences into the right categories

# Algorithm

- Create a function that gets a sentence as input and returns a list of its nouns.

- Create a function that gets a sentence as input and returns the cosine similarity between the sentence and the 5 categories.

- Create a function that gets a sentence, its nouns and its cosine similarities, and returns the classification from ChatGPT for this sentence. It sends a query to ChatGPT with the sentence, the nouns of the sentence and the cosine similarity of the sentence with the 5 categories.

- Train a model using the AdaBoost classifier and TF-IDF embeddings to get the best result.

In [None]:
import torch
import numpy as np
import re

# Holds the loaded (model, tokenizer) pair so the checkpoints are read from
# disk once instead of on every get_nouns() call.
_NER_CACHE = {}

# Tag names, indexed by the class index the model predicts.
_TAG_VALUES = ['DT', 'POS', 'NNS', 'VBG', 'CD', ';', 'JJS', 'NN', 'RP', '.', 'WP', 'PRP', 'CC', 'WRB', 'RBR', 'MD', 'VBZ', 'UH', 'FW', 'PDT',
               'NNP', ':', 'JJ', 'JJR', 'RRB', '$', 'VB', ',', 'VBP', 'PRP$', 'NNPS', '``', 'IN', 'EX', 'TO', 'RB', 'VBN', 'RBS', 'WDT', 'LRB', 'VBD', 'WP$', 'PAD']


def _load_ner_model_and_tokenizer():
    """Return the (model, tokenizer) pair, loading both from Drive on first use."""
    if 'pair' not in _NER_CACHE:
        # map_location=cpu because only the CPU is used.
        # NOTE(review): torch.load unpickles arbitrary objects -- only load trusted files.
        model = torch.load(r"/content/drive/MyDrive/src/my_model_3.pth", map_location=torch.device('cpu'))
        tokenizer = torch.load(r"/content/drive/MyDrive/src/my_tokenizer.pth", map_location=torch.device('cpu'))
        # Inference only: switch off training-time behaviour (e.g. dropout) in
        # case the checkpoint was pickled in training mode.
        model.eval()
        _NER_CACHE['pair'] = (model, tokenizer)
    return _NER_CACHE['pair']


def get_nouns(sentence):
    """Return the nouns of `sentence`, in order of appearance.

    The sentence is tagged token by token with the saved model; word pieces
    (tokens starting with '##') are merged back into whole words, and every
    word whose tag starts with 'NN' (NN, NNS, NNP, NNPS) is returned.

    Args:
        sentence: the text to analyse.

    Returns:
        A list of strings; empty when no noun is found.
    """
    model, tokenizer = _load_ner_model_and_tokenizer()

    # Our input
    tokenized_sentence = tokenizer.encode(sentence)  # list of numbers represent each word
    input_ids = torch.tensor([tokenized_sentence])  # no .cuda(): CPU only

    with torch.no_grad():
        output = model(input_ids)

    # Most likely tag index for every token position.
    label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)

    # join bpe split tokens
    tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])

    new_tokens, new_labels = [], []
    for token, label_idx in zip(tokens, label_indices[0]):
        if token.startswith("##") and new_tokens:
            # Continuation piece: glue it onto the previous word, keep that word's tag.
            new_tokens[-1] = new_tokens[-1] + token[2:]
        else:
            new_labels.append(_TAG_VALUES[label_idx])
            new_tokens.append(token)

    # Check each (label, token) pair on its own. Searching a regex over the
    # joined "label\ttoken\n" text could match across lines: a token containing
    # "NN" followed by the next line's label produced a bogus noun.
    nouns_from_sentence = []
    for token, label in zip(new_tokens, new_labels):
        if not label.startswith('NN'):
            continue
        # Tokens that do not start with a word character ('[CLS]', '[SEP]', '?', ...)
        # are skipped, as before.
        leading_word = re.match(r'\w+', token)
        if leading_word:
            nouns_from_sentence.append(leading_word.group(0))

    return nouns_from_sentence

In [None]:
# Example: extract the nouns of one sentence with get_nouns().
get_nouns("I think there should be many various sitting and studying places for students, both inside and outside of the building.")

['places', 'students', 'building']

In [None]:
import tensorflow as tf
import tensorflow_text  # noqa
import tensorflow_hub as hub
import numpy as np

# Holds the encoder model and the category embeddings so the TF-Hub layers are
# loaded once instead of on every get_c_similarity() call.
_LABSE_CACHE = {}

# The five categories, in the order of the returned similarity values.
_CATEGORY_NAMES = ["Environment and climate resilience", "Mobility (transport)", "Local identity", "Future of work", "Land use"]


def _get_labse_model():
    """Return the sentence-embedding model, building it on first use."""
    if 'model' not in _LABSE_CACHE:
        # Loading models from tfhub.dev
        encoder = hub.KerasLayer("https://tfhub.dev/jeongukjae/smaller_LaBSE_15lang/1")
        preprocessor = hub.KerasLayer("https://tfhub.dev/jeongukjae/smaller_LaBSE_15lang_preprocess/1")

        # Constructing model to encode texts into high-dimensional vectors
        sentences = tf.keras.layers.Input(shape=(), dtype=tf.string, name="sentences")
        encoder_inputs = preprocessor(sentences)
        sentence_representation = encoder(encoder_inputs)["pooled_output"]
        normalized_sentence_representation = tf.nn.l2_normalize(sentence_representation, axis=-1)  # for cosine similarity
        _LABSE_CACHE['model'] = tf.keras.Model(sentences, normalized_sentence_representation)
    return _LABSE_CACHE['model']


def get_c_similarity(sentence):
    """Return the similarity between `sentence` and each of the five categories.

    Sentence and categories are encoded with the L2-normalised smaller-LaBSE
    model and compared with a dot product.

    Args:
        sentence: the text to compare.

    Returns:
        A nested list with one inner list of five values, ordered as
        Environment and climate resilience, Mobility (transport),
        Local identity, Future of work, Land use.
    """
    model = _get_labse_model()

    # The category embeddings never change, so they are encoded only once.
    if 'categories_embeds' not in _LABSE_CACHE:
        _LABSE_CACHE['categories_embeds'] = model(tf.constant(_CATEGORY_NAMES))
    categories_embeds = _LABSE_CACHE['categories_embeds']

    # Encoding the sentence.
    sentence_embeds = model(tf.constant([sentence]))

    # sentence-categories similarity to list
    tensor_list = tf.tensordot(sentence_embeds, categories_embeds, axes=[[1], [1]]).numpy().tolist()

    return tensor_list

In [None]:
# Example: similarity of one sentence to each of the five categories.
get_c_similarity("I think there should be many various sitting and studying places for students, both inside and outside of the building.")

[[0.20263248682022095,
  0.06531611829996109,
  0.0811031311750412,
  0.1793404370546341,
  0.0731455385684967]]

In [None]:
!pip install openai

In [None]:
!pip install rollbar

In [None]:
import os
from getpass import getpass

import openai
import rollbar

# Credentials must never be stored in the notebook. They are read from the
# environment, with an interactive prompt as a fallback (handy on Colab).
# NOTE(review): the token and key that used to be hardcoded here are exposed
# in the file history and should be revoked.
rollbar.init(os.environ.get('ROLLBAR_ACCESS_TOKEN') or getpass('Rollbar access token: '), 'testenv')
# Set up the OpenAI API client
openai.api_key = os.environ.get('OPENAI_API_KEY') or getpass('OpenAI API key: ')

In [None]:
def ask_chatgpt(question):
    """Send `question` to gpt-3.5-turbo and return the reply text.

    Args:
        question: the user message to send.

    Returns:
        The concatenated content of all returned choices (a single one, since
        n=1 is requested); an empty string when no choice is returned.
    """
    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        n=1,
        messages=[
            {"role": "system", "content": "You are a chatbot"},
            {"role": "user", "content": question},
        ])

    # Join after iterating over every choice. The return statement used to sit
    # inside the loop, so it fired on the first choice and yielded None when
    # the list was empty.
    return ''.join(choice.message.content or '' for choice in response.choices)

def gpt_ans(test_sentence, nouns, c_similarity_list):
    """Ask ChatGPT which of the five key areas `test_sentence` falls into.

    Args:
        test_sentence: the sentence to classify.
        nouns: the nouns of the sentence; inserted into the prompt as-is.
        c_similarity_list: the similarity values of the sentence to the five
            categories; inserted into the prompt as-is.

    Returns:
        The reply text. If the request fails, the error is reported to Rollbar
        and an all-'No' answer in the requested format is returned, so that
        callers always receive a string they can parse.
    """
    query = f"""for this sentance: {test_sentence} the nouns are: {nouns} and the cos similarity is: {c_similarity_list} Now I want you to tell me,
 given the nouns in the sentence and the cos similarity, for each of the five key areas, does the sentence fall. please write your answer in the following format:
 1. Environment and climate resilience: Yes/No
 2. Mobility (transport): Yes/No
 3. local identity: Yes/No
 4. future of work: Yes/No
 5. land use: Yes/No
 if you cannot provide an answer for the five key areas, return 'No' for each key area with the format above.
 if there are no nouns in the sentence, still classify each of the five key areas, does the sentence fall with the format above"""
    try:
        return ask_chatgpt(query)
    except Exception:
        # monitor exception using Rollbar
        rollbar.report_exc_info()
        # Returning the exception object itself broke callers that treat the
        # result as text; fall back to the "cannot answer" reply instead.
        return ("1. Environment and climate resilience: No\n"
                "2. Mobility (transport): No\n"
                "3. local identity: No\n"
                "4. future of work: No\n"
                "5. land use: No")

In [None]:
# Example: run the full pipeline on one sentence -- extract its nouns, compute
# its similarity to the categories, then ask ChatGPT for the classification.
# `sen` and `example` are reused by the parse_and_append example further down.
sen = "I think there should be many various sitting and studying places for students, both inside and outside of the building."
nouns = get_nouns(sen)
c_sim = get_c_similarity(sen)
example = gpt_ans(sen, nouns, c_sim)
example

'1. Environment and climate resilience: No\n2. Mobility (transport): No\n3. local identity: No\n4. future of work: No\n5. land use: No'

In [None]:
from sklearn.model_selection import train_test_split

# Split the client sentences into train and test sets: 20% is held out for
# testing, and random_state=42 fixes the shuffle so the split is reproducible.
train_data, test_data = train_test_split(client_message, test_size=0.2, random_state=42)

print("Train data length:", len(train_data))
print("Test data length:", len(test_data))

Train data length: 301
Test data length: 76


In [None]:
# Preview only the first few training sentences instead of dumping the whole list.
train_data[:5]

In [None]:
import pandas as pd

def parse_and_append(output, sentence, df):
    """Parse a classification reply and append one row per positive class to `df`.

    Only lines of the form "<digit>.<key area>: <answer>" are considered. A key
    area counts as selected when its answer is "yes" (case-insensitive, an
    optional trailing period is ignored).

    Args:
        output: the reply text; any other object is converted with str().
        sentence: the sentence the reply refers to.
        df: DataFrame with "sentence" and "class" columns; it is not modified.

    Returns:
        A new DataFrame: `df` followed by one row per selected key area, or by
        a single row with class 'None' when no key area was selected.
    """
    classes = []

    # str() keeps a non-string reply (for example an exception object handed
    # back by a failed API call) from crashing the parser.
    for line in str(output).strip().split('\n'):
        line = line.strip()
        # Only process lines starting with a number followed by a period
        if len(line) >= 2 and line[0].isdigit() and line[1] == '.':
            # partition() tolerates a missing colon or extra colons in the
            # answer, where unpacking split(':') raised ValueError.
            key_area, separator, value = line[2:].partition(':')
            if not separator:
                continue

            if value.strip().rstrip('.').lower() == 'yes':
                classes.append(key_area.strip())

    if not classes:
        classes.append('None')

    new_rows = pd.DataFrame({"sentence": [sentence] * len(classes), "class": classes})
    df = pd.concat([df, new_rows], ignore_index=True)

    return df

In [None]:
# Example: parse the ChatGPT reply obtained above (`example`, for sentence `sen`)
# into an initially empty two-column frame.
example_df = pd.DataFrame(columns=["sentence", "class"])
parse_example = parse_and_append(example, sen, example_df)
parse_example

Unnamed: 0,sentence,class
0,I think there should be many various sitting a...,


In [None]:
# Build the labelled training set: start from an empty frame with the two
# columns parse_and_append() fills.
train_df = pd.DataFrame(columns=["sentence", "class"])

# For each training sentence: extract its nouns, compute its similarity to the
# categories, ask ChatGPT for the classification, and append the parsed rows.
for sen in train_data:
    nouns = get_nouns(sen)
    c_similarity = get_c_similarity(sen)
    output = gpt_ans(sen, nouns, c_similarity)
    # print(output)
    train_df = parse_and_append(output, sen, train_df)

# Display the DataFrame
train_df

In [None]:
# Build the labelled test set: start from an empty frame with the two columns
# parse_and_append() fills.
test_df = pd.DataFrame(columns=["sentence", "class"])

# For each test sentence: extract its nouns, compute its similarity to the
# categories, ask ChatGPT for the classification, and append the parsed rows.
for sen in test_data:
    nouns = get_nouns(sen)
    c_similarity = get_c_similarity(sen)
    output = gpt_ans(sen, nouns, c_similarity)
    # Accumulate into test_df. Passing train_df here made test_df the whole
    # training set plus the rows of the last test sentence only.
    test_df = parse_and_append(output, sen, test_df)

# Display the DataFrame
test_df

Here we split the data into features and labels for the train and test sets.

In [None]:
# Features (sentences) and labels (classes) of the training set as two
# parallel lists, taken straight from the columns instead of looping over rows.
sentences_train = train_df["sentence"].tolist()
classes_train = train_df["class"].tolist()

In [None]:
# Features (sentences) and labels (classes) of the test set as two parallel
# lists, taken straight from the columns instead of looping over rows.
sentences_test = test_df["sentence"].tolist()
classes_test = test_df["class"].tolist()

# Build the model
Here we train the model, where the labels are based on the ChatGPT results.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Create TF-IDF embeddings for the sentences. The vocabulary is learned from
# the training sentences only and then applied to the test sentences.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(sentences_train)
X_test = vectorizer.transform(sentences_test)

# Encode the class labels. The encoder is fitted on the labels of both splits
# so that a class occurring only in the test set does not make transform()
# raise; this does not affect which examples the classifier is trained on.
le = LabelEncoder()
le.fit(list(classes_train) + list(classes_test))
y_train = le.transform(classes_train)
y_test = le.transform(classes_test)

# Train the Adaboost classifier on the training data (fixed seed for
# reproducible results).
clf = AdaBoostClassifier(random_state=42)
clf.fit(X_train, y_train)

# Use the trained classifier to predict the classes for the test data
y_pred = clf.predict(X_test)

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")