### Load train data

In [5]:
import json
import os

import pandas as pd
from tqdm.auto import tqdm

In [6]:
# Load the raw intent samples from the UTF-8 encoded JSON file; the file handle
# is closed automatically when the `with` block exits.
with open("data/train_data/intent_train_data.json", mode="r", encoding="utf-8") as f:
    intent_train_data = json.loads(f.read())

In [7]:
# Peek at the first sample stored under the "building" key to inspect the record layout.
test=intent_train_data["building"][0]
test

{'query': ['Where is the B1 Activity Building?',
  'How to get to the B1 Activity Building?',
  'What is the location of the B1 Activity Building?',
  'Can you tell me where B1 Activity Building is?'],
 'intent': 'ask_building_location',
 'entities': [{'B1 Activity Building': 'building_name'}]}

In [8]:
def construct_data(data_item):
    """Expand one training sample into a DataFrame with one row per query.

    Args:
        data_item: Mapping that provides an 'intent' value and a 'query'
            collection (the phrasings that share that intent).

    Returns:
        pandas.DataFrame with the columns 'intent' and 'query'; the intent
        value is repeated once for every entry of 'query'.
    """
    intent_label=data_item['intent']
    queries=data_item['query']
    return pd.DataFrame({
        # Repeat the label so both columns have the same length.
        'intent': [intent_label for _ in range(len(queries))],
        'query': queries,
    })
# Build one frame per sample and concatenate them in a single call. Calling
# pd.concat inside the loop copies every previously accumulated row on each
# iteration, which makes the build quadratic in the number of samples.
sample_frames=[]
for v in intent_train_data.values():
    for data_item in v:
        sample_frames.append(construct_data(data_item))
# pd.concat raises ValueError on an empty list, so fall back to the empty
# frame that the incremental version produced when there were no samples.
train_df=pd.concat(sample_frames,ignore_index=True) if sample_frames else pd.DataFrame()
train_df

Unnamed: 0,intent,query
0,ask_business_location,Where is KK便利店 located?
1,ask_business_location,Can you show me how to get to KK便利店?
2,ask_business_location,What's the address of KK便利店?
3,ask_business_location,How do I find KK便利店 on a map?
4,ask_business_location,Where is KIMS SALON located?
...,...,...
1580,ask_handbook_info,How do I get in touch with the university usin...
1581,ask_handbook_info,What are the clubs and societies available at ...
1582,ask_handbook_info,Can I join any club or society at XMUM handbook?
1583,ask_handbook_info,How many clubs and societies are listed in the...


In [9]:
# Persist the flattened (intent, query) table as CSV; index=False leaves out the row index column.
train_df.to_csv("data/train_data/intent_train_data.csv",index=False)

### Train intent classifier model

#### Split data into train and test set

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Features are the raw query strings; labels are the intent names.
X,y=train_df['query'],train_df['intent']
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified, so small intent classes may be
# unevenly represented in the test set — consider whether stratify=y is wanted.
X_train,X_test,y_train,y_test=train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
len(X_train)

1268

In [29]:
# Count the training samples per intent (sorted by frequency, most common first).
n_train=y_train.value_counts()
n_train

intent
ask_restaurant_price             218
ask_restaurant_location          210
ask_restaurant_time              209
ask_restaurant_recommendation    205
ask_handbook_info                116
ask_facility_info                 61
ask_facility_location             54
ask_facility_time                 50
ask_business_info                 43
ask_business_time                 38
ask_business_location             37
ask_building_location             16
ask_building_include              11
Name: count, dtype: int64

#### Construct pipeline

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [13]:
# Text classification pipeline: TF-IDF features followed by logistic regression.
# NOTE(review): recent scikit-learn releases reportedly deprecate the liblinear
# solver for multiclass targets — confirm against the installed version.
intent_classifier=Pipeline(steps=[
    # Using 1-grams and 2-grams captures richer phrase information.
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    # C is the regularization parameter.
    ('clf', LogisticRegression(C=5, solver='liblinear', random_state=42)),
])

In [14]:
# Fit every pipeline step (TF-IDF vectorizer, then classifier) on the training split only.
intent_classifier.fit(X_train,y_train)

#### Test the model

In [15]:
from sklearn.metrics import accuracy_score, classification_report

In [16]:
# Predict an intent label for every query in the held-out test split.
y_pred = intent_classifier.predict(X_test)

In [36]:
# Build the classification report as a DataFrame with one row per report entry
# and values rounded to three decimals.
# zero_division=0 keeps the same numbers as the default ("warn" also reports 0)
# but suppresses the UndefinedMetricWarning raised for labels that never get
# predicted, which otherwise clutters the cell output.
report=pd.DataFrame(
    classification_report(y_test,y_pred,output_dict=True,zero_division=0)
).T.round(3)
print("\nClassification Report:")
report


Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,precision,recall,f1-score,support
ask_building_include,1.0,1.0,1.0,3.0
ask_building_location,0.0,0.0,0.0,4.0
ask_business_info,0.8,0.444,0.571,9.0
ask_business_location,0.333,0.4,0.364,5.0
ask_business_time,0.833,0.357,0.5,14.0
ask_facility_info,0.692,0.818,0.75,11.0
ask_facility_location,0.867,0.722,0.788,18.0
ask_facility_time,0.875,0.438,0.583,16.0
ask_handbook_info,0.96,1.0,0.98,24.0
ask_restaurant_location,0.855,0.983,0.915,60.0


In [39]:
# Attach the per-intent training-sample counts next to the metrics. pandas
# aligns n_train on the index labels, so report rows whose label is not an
# intent present in n_train receive NaN.
report["n_train"]=n_train
report

Unnamed: 0,precision,recall,f1-score,support,n_train
ask_building_include,1.0,1.0,1.0,3.0,11.0
ask_building_location,0.0,0.0,0.0,4.0,16.0
ask_business_info,0.8,0.444,0.571,9.0,43.0
ask_business_location,0.333,0.4,0.364,5.0,37.0
ask_business_time,0.833,0.357,0.5,14.0,38.0
ask_facility_info,0.692,0.818,0.75,11.0,61.0
ask_facility_location,0.867,0.722,0.788,18.0,54.0
ask_facility_time,0.875,0.438,0.583,16.0,50.0
ask_handbook_info,0.96,1.0,0.98,24.0,116.0
ask_restaurant_location,0.855,0.983,0.915,60.0,210.0


In [41]:
# Create the output directory first so the export also works on a fresh
# checkout where log/ does not exist yet.
os.makedirs("log",exist_ok=True)
# Write the report table (metrics plus n_train) as a Markdown file.
report.to_markdown("log/intent_report.md")

在解读分类报告中的各项指标之前，我们需要先了解四个基本概念（混淆矩阵的四个元素）。下面以一个示例意图 ask_time 来说明：

- True Positive (TP): 真阳性。真实意图是 ask_time，模型也正确预测为 ask_time。 (做对了)
- False Positive (FP): 假阳性。真实意图不是 ask_time (比如是find_location)，但模型错误地预测为 ask_time。 (误报了)
- False Negative (FN): 假阴性。真实意图是 ask_time，但模型错误地预测为其他意图 (比如find_location)。 (漏报了)
- True Negative (TN): 真阴性。真实意图不是 ask_time，模型也正确地没有预测为 ask_time。 (做对了)

$$Precision=\frac{TP}{TP+FP}$$

$$Recall=\frac{TP}{TP+FN}$$

$$F1=2\times\frac{Precision\times Recall}{Precision+Recall}$$

#### Save the model

In [19]:
import joblib

In [20]:
# Create the target directory first so saving also works on a fresh checkout
# where data/trained_model/ does not exist yet.
os.makedirs("data/trained_model",exist_ok=True)
# Serialize the fitted pipeline (vectorizer + classifier) to a single file.
joblib.dump(intent_classifier, "data/trained_model/intent_classifier.joblib")

['data/trained_model/intent_classifier.joblib']