In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt

# tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# bag of words  
# https://www.mygreatlearning.com/blog/bag-of-words/

# ngrams
# https://pypi.org/project/ngram/

# naive bayes
from sklearn.naive_bayes import MultinomialNB

In [7]:
import pyarrow.parquet as pq

# 读取 Parquet 文件
parquet_file = './data/train_data.parquet'
table = pq.read_table(parquet_file)

# 将 Parquet 表转换为 Pandas DataFrame
df = table.to_pandas()

csv_file = './data/output_file.csv'
df.to_csv(csv_file, index=False)

In [8]:
parquet_file = './data/test_data.parquet'
table = pq.read_table(parquet_file)

# 将 Parquet 表转换为 Pandas DataFrame
df = table.to_pandas()

csv_file = './data/test_data.csv'
df.to_csv(csv_file, index=False)

In [9]:
train_data = pd.read_csv('./data/train_data.csv')
train_data.head()

Unnamed: 0,prompt,prompt_id,messages,category
0,Please summarize the goals for scientists in t...,627a77298cf96a309aa35a62207c4164e22a66f6db7911...,[{'content': 'Please summarize the goals for s...,Summarize
1,Help write a letter of 100 -200 words to my fu...,7d443ef2cc3e34d9dc6ffcdf748c1d2a9880cd48be9c98...,[{'content': 'Help write a letter of 100 -200 ...,Generation
2,"Write a news style post about a fake event, li...",3c975b349494dea76dbbb9c01a2bb925a248efb8ca0944...,[{'content': 'Write a news style post about a ...,Generation
3,"Write a funny, short story about someone who w...",16d804af359db7823c457b7d82809eddaad9a5ea3c91ef...,"[{'content': 'Write a funny, short story about...",Generation
4,Write a letter to the Editor responding to the...,e9da2fa3a6d496c5a5ee500e58e5477362698aaa08e74c...,[{'content': 'Write a letter to the Editor res...,Rewrite


In [10]:
test_data = pd.read_csv('./data/test_data.csv')
test_data.head()

Unnamed: 0,prompt,prompt_id,messages,category
0,Aster is a chatbot who answers questions with ...,d6c011ffb1ff8a9abe9bd24caf3f9817454a1f054d5d0e...,[{'content': 'Aster is a chatbot who answers q...,Chat
1,What are 5 things I can do when it's raining i...,7682c850c4e5979cef0ad966b8feb119de2bd323bb4081...,"[{'content': ""What are 5 things I can do when ...",Brainstorm
2,Write several rhyming sentences from the persp...,45971bbe382b81644487690d2018bc3aec8c807f1be52a...,"[{'content': ""Write several rhyming sentences ...",Generation
3,What other strange and obscure music genres ca...,ccdce774f24ede2e799686276e8b72e8b08a4aebf6ba75...,[{'content': 'What other strange and obscure m...,Brainstorm
4,Write one paragraph about the Tlachihualtepetl...,c1f40447167a99977d745672c0c19cb2739b6f5208647a...,[{'content': 'Write one paragraph about the Tl...,Generation


In [11]:
# we will use the 'message' column as the feature, and the 'category' column as the target
X = train_data['messages']
y = train_data['category']
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# use tfidf to convert the text into a matrix of token counts
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# use Naive Bayes to train the model
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)

# predict the category of the test data
y_pred_nb = nb.predict(X_test_tfidf)

# check the accuracy of the model
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred_nb)

0.45052631578947366

In [12]:
# use SVM to predict the category of a message using tfidf

# use svm to train the model
from sklearn.svm import SVC
svm = SVC() # default C=1.0, kernel='rbf'
svm.fit(X_train_tfidf, y_train)

# predict the category of the test data
y_pred_svm = svm.predict(X_test_tfidf)

# check the accuracy of the model
accuracy_score(y_test, y_pred_svm)

0.7363157894736843

In [13]:
# use different values of C to fit linear kernel svm
svm = SVC(C=0.1, kernel='linear')
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred_svm))

svm = SVC(C=1, kernel='linear')
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred_svm))

svm = SVC(C=10, kernel='linear')
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred_svm))

0.5489473684210526
0.7878947368421053
0.8105263157894737


In [14]:
# use different values of C to fit rbf kernel svm
svm = SVC(C=0.1, kernel='rbf')
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred_svm))

svm = SVC(C=1, kernel='rbf')
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred_svm))

svm = SVC(C=10, kernel='rbf')
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred_svm))

0.4636842105263158
0.7363157894736843
0.7578947368421053


In [15]:
# use different values of C to fit poly kernel svm
svm = SVC(C=0.1,kernel='poly')
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred_svm))

svm = SVC(C=1, kernel='poly')
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred_svm))

svm = SVC(C=10, kernel='poly')
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred_svm))

0.45
0.4968421052631579
0.5194736842105263


In [20]:
# use neural network MLP to train the model
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier()
nn.fit(X_train_tfidf, y_train)

# predict the category of the test data
y_pred_nn = nn.predict(X_test_tfidf)    

# check the accuracy of the model
accuracy_score(y_test, y_pred_nn)
# print(accuracy_score(y_test, y_pred_nn))

0.7684210526315789

In [62]:
from tensorflow.keras.utils import to_categorical
Y_train = y_train.tolist()

label_mapping = {
    'Generation': 0,
    'Open QA': 1,
    'Brainstorm': 2,
    'Chat': 3,
    'Rewrite': 4,
    'Summarize': 5,
    'Coding': 6,
    'Classify': 7,
    'Closed QA': 8,
    'Extract': 9
}

# transfer validation data into one-hot data
Y_train = [label_mapping[label] for label in Y_train]
Y_train_onehot = to_categorical(Y_train, num_classes=10)
print(Y_train_onehot)

[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [95]:
# use RNN to train data
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout
# from tensorflow.keras.optimizers.legacy import SGD
model = Sequential()
model.add(SimpleRNN(units=1000, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=10, activation='sigmoid'))
# sgd = SGD(learning_rate=0.1)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
X_train_tfidf_3d = X_train_tfidf.toarray().reshape((X_train_tfidf.shape[0], 1, X_train_tfidf.shape[1]))
model.fit(X_train_tfidf_3d, Y_train_onehot, epochs=5, validation_split=0.2, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x28a2916c0>

In [96]:
Y_test = [label_mapping[label] for label in y_test]
X_test_tfidf_3d = X_test_tfidf.toarray().reshape((X_test_tfidf.shape[0], 1, X_test_tfidf.shape[1]))
y_pred_rnn = model.predict(X_test_tfidf_3d)
# # print(y_pred_rnn)
predicted_classes = np.argmax(y_pred_rnn, axis=1)
# get the accuracy of prediction of RNN
print(accuracy_score(Y_test, predicted_classes))

0.8168421052631579


In [92]:
# use LSTM to train data
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, LSTM, Dropout
model = Sequential()
model.add(LSTM(units=1000, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=10, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
X_train_tfidf_3d = X_train_tfidf.toarray().reshape((X_train_tfidf.shape[0], 1, X_train_tfidf.shape[1]))
model.fit(X_train_tfidf_3d, Y_train_onehot, epochs=5, validation_split=0.2, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x2aedc0af0>

In [93]:
# use LSTM model to predict
y_pred_lstm = model.predict(X_test_tfidf_3d)
# # print(y_pred_rnn)
predicted_classes = np.argmax(y_pred_lstm, axis=1)
# get the accuracy of prediction of LSTM
print(accuracy_score(Y_test, predicted_classes))

0.8210526315789474
