In [None]:
import nltk
from nltk.chat.util import Chat, reflections
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import pandas as pd
import os,sys
import jieba

# Raise pandas' max column width to 1000 characters for displayed frames.
pd.set_option('max_colwidth', 1000)

In [None]:
# Load the data: a hand-labelled mapping of customer utterance -> intent category.
# NOTE(review): the keys '嗨' and '您好' each appear twice below; a dict literal
# keeps only one entry per key, so the resulting frame has fewer rows than
# there are lines here.
data = {'你好，請問有問題能在這裡詢問嗎':'迎賓|引導',
        '請問我想購買加值方案一直無法付費':'企業問題',
        '好像是認證的事情':'企業問題',
        '是你們昨天打給我的喔公司統編 8004001':'聯繫客服',
        '昨天你們有打給我':'聯繫客服',
        '我想在台中市東山路附近找工作':'求職問題',
        '你好~':'迎賓|引導',
        '可以麻煩客服打給我嗎':'聯繫客服',
        '嗨':'迎賓|引導',
        '請問刊登職缺是要用電腦才可以嗎':'企業問題',
         '您好':'迎賓|引導',
        '無法看到求職者的聯絡資訊':'企業問題',
        '要求職':'求職問題',
        '請暫停徵才刊登.謝謝':'企業問題',
        '我要開啟徵才職缺':'企業問題',
        '您好 請教一個問題':'迎賓|引導',
        '你好想詢問有貨運司機的工作嗎？ 桃園區 無職業駕照':'求職問題',
        '你好想詢問有貨運司機的工作嗎？':'求職問題',
        '我想找工作':'求職問題',
        '您好我要更改公司資料':'企業問題',
        '我剛已經電話進去詢問了 感謝 用好了':'聯繫客服',
        '您好現在有什麼徵才優惠嗎?':'付款費用|活動方案',
        '索亞精品有限公司請更改公司地址為台北市中正區濟南路一段71號更改電話為（02）26891006':'企業問題',
        '你好':'迎賓|引導',
        '客服':'聯繫客服',
        '您好  我發票要開統編':'付款費用|活動方案',
        '我要先關閉暫停徵才職缺':'企業問題',
        '你好~我們公司的518合約已到期 不再續約應如何處理':'企業問題',
        '您好':'迎賓|引導',
        '已經線上刷卡繳費了 但還不能開通':'付款費用|活動方案',
        '公司已搬遷 要如何更改公司資料':'企業問題',
        '你好我想要詢問刊登價格':'付款費用|活動方案',
        '妳好，我們人找到了，可以暫停徵人嗎？':'企業問題',
        '嗨':'迎賓|引導',
        '我要使用徵才系統':'企業問題',
        '您好，請問企業登入的相關文件是否可以從這邊傳送資料可貴公司，還是必須透過mail呢?':'企業問題',
        '儲值後要怎麼開通':'付款費用|活動方案',
        '可以透過你們幫我開通嗎':'付款費用|活動方案',
        '你好，我剛剛有儲值':'付款費用|活動方案',
        '請問如何關閉職缺':'企業問題'}
# One row per (utterance, label) pair; columns are 'quote' and 'category'.
df = pd.DataFrame(list(data.items()),columns=['quote','category'])

In [None]:
# Inspect the data: show the first ten rows.
df.head(10)

Unnamed: 0,quote,category
0,你好，請問有問題能在這裡詢問嗎,迎賓|引導
1,請問我想購買加值方案一直無法付費,企業問題
2,好像是認證的事情,企業問題
3,是你們昨天打給我的喔公司統編 8004001,聯繫客服
4,昨天你們有打給我,聯繫客服
5,我想在台中市東山路附近找工作,求職問題
6,你好~,迎賓|引導
7,可以麻煩客服打給我嗎,聯繫客服
8,嗨,迎賓|引導
9,請問刊登職缺是要用電腦才可以嗎,企業問題


In [None]:
# Download the NLTK resource named 'averaged_perceptron_tagger' (a POS-tagger model).
nltk.download('averaged_perceptron_tagger')
# Pattern matching: (regex, reply) pairs.
# NOTE(review): `patterns` is not referenced again in the visible notebook.
patterns = [
    (r'你好', '嗨！'),
    (r'(.*)天气如何', '我不是气象专家，但你可以查看天气预报。')
]

# Word tokenization
text = "NLTK is a powerful library for AI chat bot"
important_words = nltk.word_tokenize(text)
print(important_words)

# Sentence splitting (note: `text` is rebound to a new two-sentence sample)
text = """The sun rose in the clear blue sky, casting its warm rays upon the vibrant green landscape.
Birds chirped their melodious tunes as a gentle breeze rustled the leaves of the trees."""
sentences = sent_tokenize(text)
for sentence in sentences:
    print(sentence)

# Part-of-speech tagging
words = word_tokenize("NLTK is a powerful library for AI chat bot")
tags = pos_tag(words)
print(tags)

['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'AI', 'chat', 'bot']
The sun rose in the clear blue sky, casting its warm rays upon the vibrant green landscape.
Birds chirped their melodious tunes as a gentle breeze rustled the leaves of the trees.
[('NLTK', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('powerful', 'JJ'), ('library', 'NN'), ('for', 'IN'), ('AI', 'NNP'), ('chat', 'WP'), ('bot', 'VBD')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
def cutProcess(sting):
    """Segment ``sting`` with ``jieba.lcut`` and return the tokens joined by single spaces."""
    return " ".join(jieba.lcut(sting))

# Replace every utterance with its space-separated segmentation.
# NOTE(review): this overwrites df['quote'] in place, so re-running the cell
# feeds already-segmented text back through cutProcess.
df['quote'] = df['quote'].apply(cutProcess)

# Preview the first five segmented rows.
df.head(5)

Unnamed: 0,quote,category
0,你好 ， 請 問有 問題 能 在 這裡 詢問 嗎,迎賓|引導
1,請問 我 想 購買 加值 方案 一直 無法 付費,企業問題
2,好像 是 認證 的 事情,企業問題
3,是 你 們 昨天 打給 我 的 喔 公司 統編 8004001,聯繫客服
4,昨天 你們 有 打給 我,聯繫客服


In [None]:
# NOTE(review): `data` was the raw dict earlier in the notebook; from here on
# it is an alias of the DataFrame `df`.
data = df
# Cast both columns to unicode-string NumPy arrays.
training_documents = data['quote'].values.astype('U')
labels = data['category'].values.astype('U')

# Split into training and test sets: 70% train, 30% test (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(training_documents, labels, test_size=0.3, random_state=42)


# Bag-of-words counts; this token_pattern also counts single-character tokens.
vectorizer = CountVectorizer(token_pattern='(?u)\\b\\w+\\b') # token_pattern='(?u)\\b\\w+\\b' also counts single-character tokens
# Lower-case x_train is the vectorized matrix; upper-case X_train is the raw text.
x_train = vectorizer.fit_transform(X_train)

# train
classifier = MultinomialNB (alpha=0.01) # change model type here
classifier.fit(x_train, y_train)

In [None]:
# Vectorize the held-out texts with the already-fitted vocabulary and
# display the classifier's score on them.
x_test = vectorizer.transform(X_test)
classifier.score(x_test,y_test)

0.75

In [None]:
def predict(raw_queries,n_top=1):
    """Classify each raw query and return its highest-probability labels.

    Args:
        raw_queries: Iterable of unsegmented query strings; each one is
            segmented with ``cutProcess`` before vectorization.
        n_top: Requested number of ranked labels per query. At least three
            labels are always returned (the historical output size), so
            values below 3 have no effect; larger values return more labels.

    Returns:
        One list per query of ``(label, probability)`` tuples, sorted by
        descending probability, with probabilities rounded to 4 decimals.
    """
    segmented = [cutProcess(s) for s in raw_queries]
    queries = vectorizer.transform(segmented)
    probabilities = classifier.predict_proba(queries).tolist()

    # Preserve the long-standing three-entry result while letting callers ask for more.
    keep = max(n_top, 3)

    ranked = []
    for row in probabilities:
        # Rank across ALL classes first and truncate afterwards. Slicing
        # classes_ before sorting would silently drop every class after the
        # third one, even when it is the most probable.
        ordered = sorted(zip(classifier.classes_, row), key=lambda pair: pair[1], reverse=True)
        ranked.append([(label, round(prob, 4)) for label, prob in ordered[:keep]])
    return ranked

In [None]:
# Sample queries used to exercise predict().
example = ['我有問題','修改公司資料','我想在台中市東山路附近找小雞上工上的工作','要怎麼變更公司電話','您好應徵者為何看不到我們需要出差的項目']

lists = predict(example)

# Print each query next to its ranked (label, probability) list.
for index,qa in enumerate(lists):
    print("question:",example[index])
    print("anser:", qa)

    print()

question: 我有問題
anser: [('付款費用|活動方案', 0.9278), ('求職問題', 0.0122), ('企業問題', 0.0005)]

question: 修改公司資料
anser: [('企業問題', 0.9953), ('付款費用|活動方案', 0.0001), ('求職問題', 0.0001)]

question: 我想在台中市東山路附近找小雞上工上的工作
anser: [('求職問題', 1.0), ('付款費用|活動方案', 0.0), ('企業問題', 0.0)]

question: 要怎麼變更公司電話
anser: [('企業問題', 0.9602), ('付款費用|活動方案', 0.0393), ('求職問題', 0.0)]

question: 您好應徵者為何看不到我們需要出差的項目
anser: [('企業問題', 0.9932), ('付款費用|活動方案', 0.0009), ('求職問題', 0.0006)]



In [None]:
# Read one query from the user and display its ranked labels.
txt = input()
predict([txt])[0]

要怎麼變更公司電話


[('企業問題', 0.9602), ('付款費用|活動方案', 0.0393), ('求職問題', 0.0)]

In [None]:
# Additional unlabelled sample queries.
# NOTE(review): not passed to predict() anywhere in the visible notebook.
raw_queries = ['請問我有點忘了職缺再刊登期間有關閉的次數限制嗎',
               '請把我過期的應徵職缺關閉',
               '想刊登徵人才，能免費試用看看嗎?',
               '我想改公司電話',
               '請問，我最近在廣告看到的訊息，問他們都說不缺人，是系統自動更新，我不知道要怎麼找',
               '我剛剛有繳費，為什麼帳號還沒開通']

In [None]:
""" 分詞優化 """

# 1. Custom dictionary
dic  = '../dict.txt'
if os.path.exists(dic):
    jieba.set_dictionary(dic)
else:
    # jieba raises when the dictionary file is missing, which used to abort
    # this cell; keep jieba's default dictionary instead.
    # NOTE(review): set_dictionary swaps the main dictionary; if the goal is
    # only to add extra words, jieba.load_userdict may be the intended call.
    print(f"jieba dictionary not found, keeping the default: {dic}")

# 2. Stop-word filtering
stop_word = []  # fill with the tokens that should be dropped
# `result` was previously undefined at notebook scope; segment one sample
# query so the filter has something to work on.
result = jieba.lcut(raw_queries[0])
# Build a new list rather than calling remove() on the list being iterated,
# which skips the element following every removal.
result = [word for word in result if word not in stop_word]
result

Exception: jieba: file does not exist: /dict.txt

In [None]:
# Load the pretrained GPT-2 language model and its matching tokenizer.
model_name = "gpt2"  # or use a larger model such as "gpt2-medium" or "gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Read a prompt and let GPT-2 continue it.
input_text = input()
# Calling the tokenizer (rather than .encode) also returns the attention mask,
# which generate() asks for explicitly to give reliable results.
encoded = tokenizer(input_text, return_tensors="pt")
input_ids = encoded["input_ids"]
output = model.generate(
    input_ids,
    attention_mask=encoded["attention_mask"],
    # Same value generate() would otherwise fall back to with a warning.
    pad_token_id=tokenizer.eos_token_id,
    max_length=50,
    num_return_sequences=1,
)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

# Score the user's prompt (not the generated text) with VADER.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

sentiment_scores = sid.polarity_scores(input_text)

# Print the sentiment polarity scores
print("Positive score:", sentiment_scores['pos'])
print("Negative score:", sentiment_scores['neg'])
print("Neutral score:", sentiment_scores['neu'])


hello


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


hello, PA)

The city of Philadelphia is considering a $1.5 million renovation of the former Union Station, which was demolished in the 1970s.

The project is expected to cost $1.5 million, according to the
Positive score: 0.0
Negative score: 0.0
Neutral score: 1.0


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
import re
import json
import random

import nltk
import string
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def stop_words_and_tokenize(text:str):
    """Tokenize ``text`` and return the stemmed, lower-cased content words.

    Tokens whose lower-cased form is an English stop word or a single
    ``string.punctuation`` character are discarded; the remaining tokens are
    lower-cased and stemmed with ``PorterStemmer``.
    """
    tokens = word_tokenize(text)
    excluded = set(stopwords.words('english')) | set(string.punctuation)
    stemmer = PorterStemmer()

    stems = []
    for token in tokens:
        lowered = token.lower()
        if lowered in excluded:
            continue
        stems.append(stemmer.stem(lowered))
    return stems

def load_data(file_path:str):
    """Read a JSON knowledge-base file and return its ``"all_data"`` value.

    Args:
        file_path: Path to a JSON file whose top-level object has an
            ``"all_data"`` key.

    Returns:
        The value stored under ``"all_data"``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        KeyError: If the JSON object has no ``"all_data"`` key.
    """
    # Context manager closes the handle (it was previously left open); JSON
    # text is UTF-8, so do not depend on the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as handle:
        payload: dict = json.load(handle)
    return payload["all_data"]

def unknown(user_text=None, data_path='data2.json'):
    """Ask the user to teach a reply for an unmatched message and store it.

    Prompts for a tag and a response, tokenizes the unmatched message with
    ``stop_words_and_tokenize`` and appends a new entry to the ``"all_data"``
    list of the JSON knowledge base.

    Args:
        user_text: The message that could not be matched. When omitted, the
            module-level ``prompt_text`` is used, as before.
        data_path: Knowledge-base file to update.

    Returns:
        The fixed acknowledgement ``"thank you"``.
    """
    newknowledge = input("What does it mean? can you teach me: ")
    respon = input("How can I responses : ")

    if user_text is None:
        # Legacy behaviour: fall back to the global set by the chat loop.
        user_text = prompt_text
    new_words = stop_words_and_tokenize(user_text)
    # NOTE(review): the stored patterns are stemmed tokens, whereas
    # get_response() matches against unstemmed split words - confirm this
    # mismatch is intended.
    item = {"tag": [newknowledge], "responses": [respon], "patterns": new_words, "keyword": []}

    # Context managers ensure both handles are closed (the read handle used to leak).
    with open(data_path, encoding="utf-8") as handle:
        config = json.load(handle)
    config["all_data"].append(item)
    with open(data_path, 'w', encoding="utf-8") as handle:
        json.dump(config, handle, indent=2)
    return "thank you"

def message_probability(user_message, recognised_words, single_response=False, required_words=[]):
    """Score how well a user message matches one knowledge-base entry.

    Args:
        user_message: Sequence of words from the user's message.
        recognised_words: Pattern words of the entry being scored.
        single_response: When true, the score is returned even if a required
            word is missing.
        required_words: Words that must all occur in ``user_message`` for the
            entry to count (read-only; the shared default is never mutated).

    Returns:
        ``int(100 * hits / len(recognised_words))`` where ``hits`` counts the
        message words found in ``recognised_words``; ``0`` when a required
        word is missing (and ``single_response`` is false) or when
        ``recognised_words`` is empty.
    """
    # An entry without patterns can never match; returning early also avoids
    # dividing by zero below.
    if not recognised_words:
        return 0

    # Count the message words that appear among the entry's patterns
    message_certainty = sum(1 for word in user_message if word in recognised_words)

    # Express the hits as a fraction of the entry's pattern count
    percentage = float(message_certainty) / float(len(recognised_words))

    # Every required word must be present in the message
    has_required_words = all(word in user_message for word in required_words)

    if has_required_words or single_response:
        return int(percentage * 100)
    return 0


def check_all_messages(message):
    """Pick the best knowledge-base reply for a tokenized user message.

    Every entry loaded from ``data2.json`` is scored with
    ``message_probability`` using its ``"patterns"`` and ``"keyword"`` lists;
    scores are keyed by the entry's first response, so entries sharing a first
    response overwrite each other.

    Args:
        message: Sequence of words from the user's message.

    Returns:
        The first response of the highest-scoring entry, or the result of
        ``unknown()`` when the best score is below 1 or there are no entries.
    """
    highest_prob_list = {}

    for entry in load_data("data2.json"):
        highest_prob_list[entry["responses"][0]] = message_probability(
            message, entry["patterns"], False, entry["keyword"]
        )

    # With an empty knowledge base max() would raise ValueError; treat it as
    # "nothing matched" so the bot can still learn its first entry.
    if not highest_prob_list:
        return unknown()

    best_match = max(highest_prob_list, key=highest_prob_list.get)
    print(highest_prob_list)
    print(f'Best match = {best_match} | Score: {highest_prob_list[best_match]}')

    return unknown() if highest_prob_list[best_match] < 1 else best_match

# Used to obtain the bot's reply for one line of user input
def get_response(user_input):
    """Lower-case and split ``user_input``, then return the reply chosen by ``check_all_messages``.

    The input is split on whitespace runs and on any of ``, ; ? ! . -``
    (with trailing whitespace); the resulting word list is printed after the
    reply has been computed.
    """
    lowered = user_input.lower()
    tokens = re.split(r'\s+|[,;?!.-]\s*', lowered)
    reply = check_all_messages(tokens)
    print(tokens)
    return reply

# Run the training/chat loop: read a line, print the bot's reply, repeat.
# NOTE(review): there is no exit condition, so the cell runs until interrupted.
while True:
    # prompt_text must stay a module-level name: unknown() reads it.
    prompt_text=input("You:")
    print(str('Bot: ' + get_response(prompt_text)))

You:hello


FileNotFoundError: [Errno 2] No such file or directory: 'data2.json'