### Load train data

In [4]:
import json
from pathlib import Path

import pandas as pd
from tqdm.auto import tqdm

In [5]:
# Read the raw intent training examples from the JSON file into memory.
with open("data/train_data/intent_train_data.json", "r", encoding="utf-8") as f:
    intent_train_data = json.load(f)

In [6]:
# Peek at the first item under the "building" key to see which fields an item carries.
test = intent_train_data["building"][0]
test

{'query': ['where is the location of B1 building?',
  'what are the directions to B1 building?',
  'how can i get to B1 building?',
  'where is B1 building located?'],
 'intent': 'ask_building_location',
 'entities': [{'B1': 'building'}]}

In [7]:
def construct_data(data_item):
    """Expand one raw training item into a two-column DataFrame.

    Args:
        data_item: Mapping with an ``'intent'`` label and a ``'query'``
            collection of example utterances for that intent.

    Returns:
        A ``pandas.DataFrame`` with columns ``intent`` and ``query`` (in that
        order): one row per query, each carrying the item's intent label.
    """
    queries = data_item['query']
    # Repeat the single intent label so it lines up with every query.
    labels = [data_item['intent']] * len(queries)
    return pd.DataFrame({'intent': labels, 'query': queries})
# Flatten every item of every top-level group into one (intent, query) table.
# The per-item frames are collected first and concatenated once: calling
# pd.concat inside the loop re-copies the accumulated rows on every iteration
# (quadratic work) and starts from an empty frame.
item_frames = [
    construct_data(data_item)
    for group_items in intent_train_data.values()
    for data_item in group_items
]
if item_frames:
    train_df = pd.concat(item_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty table that
    # still has the expected columns.
    train_df = pd.DataFrame(columns=['intent', 'query'])
train_df

Unnamed: 0,intent,query
0,ask_restaurant_location,Where is Xiayicheng located?
1,ask_restaurant_location,Can you give me directions to Xiayicheng?
2,ask_restaurant_location,How do I get to Xiayicheng restaurant?
3,ask_restaurant_location,What's the address of Xiayicheng?
4,ask_restaurant_location,where is Xiayicheng?
...,...,...
3199,greet_welcome,Welcome to XMUM! What's going on?
3200,greet_welcome,"Hi, I'm a new student at Xiamen University!"
3201,greet_welcome,"Hello, I am excited to join XMU today."
3202,greet_welcome,"What's up everyone, I just arrived in Xiamen U..."


In [8]:
# Persist the flattened table; index=False leaves the row index out of the CSV.
train_df.to_csv("data/train_data/intent_train_data.csv", index=False)

### Train intent classifier model

#### Split data into train and test set

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Features are the raw query strings; targets are the intent labels.
X = train_df['query']
y = train_df['intent']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified, so rare intents may be unevenly
# represented in the test set - consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
len(X_train)

2563

In [11]:
# Number of training examples per intent label.
n_train = y_train.value_counts()
n_train

intent
ask_handbook_info                482
ask_restaurant_time              452
ask_restaurant_recommendation    437
ask_restaurant_location          419
ask_facility_time                155
ask_facility_info                139
ask_facility_location            123
ask_building_location             67
greet_welcome                     58
greet_sorry                       56
ask_building_include              50
greet_hello                       45
greet_thanks                      41
greet_goodbye                     39
Name: count, dtype: int64

#### Construct pipeline

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [13]:
# Two-step text classifier: TF-IDF features followed by logistic regression.
# NOTE(review): recent scikit-learn releases may warn about using the
# liblinear solver on multiclass targets - confirm against the installed version.
intent_classifier = Pipeline([
    # Unigrams plus bigrams capture richer phrase information than single words alone.
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    # C is the inverse regularization strength (larger C means weaker regularization).
    ('clf', LogisticRegression(random_state=42, C=5, solver='liblinear')),
])

In [14]:
# Fit the whole pipeline (vectorizer and classifier) on the training split only.
intent_classifier.fit(X_train, y_train)

#### Test the model

In [15]:
from sklearn.metrics import accuracy_score, classification_report

In [16]:
# Predict an intent label for every held-out query.
y_pred = intent_classifier.predict(X_test)

In [17]:
# Turn the dict-form classification report into a table: transpose so each
# report entry is a row and the metrics are columns, rounded to 3 decimals.
report = (
    pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))
    .T
    .round(3)
)
print("\nClassification Report:")
report


Classification Report:


Unnamed: 0,precision,recall,f1-score,support
ask_building_include,1.0,0.667,0.8,12.0
ask_building_location,0.786,0.647,0.71,17.0
ask_facility_info,0.857,0.667,0.75,36.0
ask_facility_location,0.73,0.794,0.761,34.0
ask_facility_time,0.812,0.897,0.852,29.0
ask_handbook_info,0.965,0.991,0.978,110.0
ask_restaurant_location,0.949,0.97,0.96,135.0
ask_restaurant_recommendation,0.935,0.99,0.962,101.0
ask_restaurant_time,0.982,0.991,0.987,112.0
greet_goodbye,0.8,0.364,0.5,11.0


In [18]:
# Attach the training-set size of each intent next to its metrics. pandas aligns
# n_train to the report's index by label, so report rows that have no matching
# intent in n_train receive NaN.
report["n_train"] = n_train
report

Unnamed: 0,precision,recall,f1-score,support,n_train
ask_building_include,1.0,0.667,0.8,12.0,50.0
ask_building_location,0.786,0.647,0.71,17.0,67.0
ask_facility_info,0.857,0.667,0.75,36.0,139.0
ask_facility_location,0.73,0.794,0.761,34.0,123.0
ask_facility_time,0.812,0.897,0.852,29.0,155.0
ask_handbook_info,0.965,0.991,0.978,110.0,482.0
ask_restaurant_location,0.949,0.97,0.96,135.0,419.0
ask_restaurant_recommendation,0.935,0.99,0.962,101.0,437.0
ask_restaurant_time,0.982,0.991,0.987,112.0,452.0
greet_goodbye,0.8,0.364,0.5,11.0,39.0


In [23]:
# Export the report table as LaTeX. Create the output directory first so the
# cell also works on a fresh checkout where "log/" does not exist yet.
Path("log").mkdir(parents=True, exist_ok=True)
report.to_latex("log/intent_report.tex")

在解读上面的分类报告之前，需要先了解四个基本概念（混淆矩阵的四个元素，下面以 ask_time 意图为例）：

- True Positive (TP): 真阳性。真实意图是 ask_time，模型也正确预测为 ask_time。 (做对了)
- False Positive (FP): 假阳性。真实意图不是 ask_time (比如是find_location)，但模型错误地预测为 ask_time。 (误报了)
- False Negative (FN): 假阴性。真实意图是 ask_time，但模型错误地预测为其他意图 (比如find_location)。 (漏报了)
- True Negative (TN): 真阴性。真实意图不是 ask_time，模型也正确地没有预测为 ask_time。 (做对了)

$$Precision=\frac{TP}{TP+FP}$$

$$Recall=\frac{TP}{TP+FN}$$

$$F1=2\cdot\frac{Precision\cdot Recall}{Precision+Recall}$$

#### Save the model

In [21]:
import joblib

In [22]:
# Serialize the fitted pipeline to disk. Create the target directory first so
# the save does not fail when "data/trained_model/" is missing. The original
# string path is passed to joblib.dump so the returned file list is unchanged.
model_path = "data/trained_model/intent_classifier.joblib"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(intent_classifier, model_path)

['data/trained_model/intent_classifier.joblib']