In [202]:
import pandas as pd
train_data = pd.read_csv('train.csv', sep='\t', header=None)
test_data = pd.read_csv('test.csv', sep='\t', header=None)
train_data.columns = ["text","label"]
test_data.columns = ["text"]

In [203]:
cn_stopwords = ' '.join(pd.read_csv('https://mirror.coggle.club/stopwords/baidu_stopwords.txt', header=None)[0])

In [204]:
import jieba
def cut_by_jieba(sentence):
    segs = list(jieba.cut(sentence))
    filtered_segs = [x for x in segs if x not in cn_stopwords]
    seg_string = " ".join(filtered_segs)
    return seg_string

In [205]:
train_data["text_segs"] = train_data["text"].apply(cut_by_jieba)
test_data["text_segs"] = test_data["text"].apply(cut_by_jieba)

In [5]:
class MyDataset(object):
    def __init__(self,data):
        self.data = data
        
    def __iter__(self):
        for item in self.data["text_segs"]:
            yield item.split()

In [7]:
import gensim
sentences = MyDataset(train_data)
model_50 = gensim.models.Word2Vec(sentences=sentences,vector_size=50)

In [18]:
model_70 = gensim.models.Word2Vec(sentences=sentences,vector_size=70)

In [8]:
model_100 = gensim.models.Word2Vec(sentences=sentences,vector_size=100)

In [9]:
model_120 = gensim.models.Word2Vec(sentences=sentences,vector_size=120)

In [16]:
for index, word in enumerate(model_50.wv.index_to_key):
    if index == 10:
        break
    print(f"word #{index}/{len(model_50.wv.index_to_key)} is {word}")

word #0/1690 is 播放
word #1/1690 is 一个
word #2/1690 is 想
word #3/1690 is 月
word #4/1690 is ，
word #5/1690 is 听
word #6/1690 is 明天
word #7/1690 is 号
word #8/1690 is 找
word #9/1690 is 提醒


In [192]:
import numpy as np
#平均句向量
def sent_vec(sent,w2v=None):
    sent_segs = sent.split()
    #对付NaN
    mean_sent_vec = np.zeros((w2v.vector_size,),dtype="float32")+0.00001
    for seg in sent_segs:
        if seg in w2v:
            mean_sent_vec += w2v[seg]
    #对付inf,因为有的句子分词+去停词后为空字符串
    if len(sent_segs) == 0:
        mean_sent_vec = mean_sent_vec
    else:
        mean_sent_vec = mean_sent_vec/len(sent_segs)
    mean_sent_vec = mean_sent_vec.tolist()
    return mean_sent_vec

In [156]:
#步骤3
train_data["text_w2v_50"] = train_data["text_segs"].apply(sent_vec,w2v=model_50.wv)
train_data["text_w2v_70"] = train_data["text_segs"].apply(sent_vec,w2v=model_70.wv)
train_data["text_w2v_100"] = train_data["text_segs"].apply(sent_vec,w2v=model_100.wv)
train_data["text_w2v_120"] = train_data["text_segs"].apply(sent_vec,w2v=model_120.wv)

In [157]:
test_data["text_w2v_50"] = test_data["text_segs"].apply(sent_vec,w2v=model_50.wv)
test_data["text_w2v_70"] = test_data["text_segs"].apply(sent_vec,w2v=model_70.wv)
test_data["text_w2v_100"] = test_data["text_segs"].apply(sent_vec,w2v=model_100.wv)
test_data["text_w2v_120"] = test_data["text_segs"].apply(sent_vec,w2v=model_120.wv)

In [139]:
def remove_nan(X,Y,vector_size=50):
    removed_row = []
    row_count = 0
    for row in X:
        zero_count = 0
        for col in row:
            if col == 0.0:
                zero_count += 1
        if zero_count == vector_size:
            removed_row.append(row_count)
        row_count += 1
    start = 0
    for _id in removed_row:
        X = np.delete(X,_id-start,axis=0)
        Y = np.delete(Y,_id-start,axis=0)
        start += 1
    return X,Y

In [225]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import tree

In [134]:
lr_50 = LogisticRegression()
lr_70 = LogisticRegression()
lr_100 = LogisticRegression()
lr_120 = LogisticRegression()

In [163]:
X_50 = np.array(train_data["text_w2v_50"].tolist()).reshape(len(train_data["text_w2v_50"]),50)
Y_50 = train_data["label"].to_numpy()
print(X_50.shape,Y_50.shape)
# X,Y = remove_nan(X,Y,vector_size=50)
# print(X.shape,Y.shape)
lr_50.fit(X_50,Y_50)

(12100, 50) (12100,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [166]:
X_pred_50 = np.array(test_data["text_w2v_50"].tolist()).reshape(len(test_data["text_w2v_50"]),50)
Y_pred_50 = lr_50.predict(X_pred_50)

In [165]:
X_70 = np.array(train_data["text_w2v_70"].tolist()).reshape(len(train_data["text_w2v_70"]),70)
Y_70 = train_data["label"].to_numpy()
print(X_70.shape,Y_70.shape)
# X,Y = remove_nan(X,Y,vector_size=50)
# print(X.shape,Y.shape)
lr_70.fit(X_70,Y_70)

(12100, 70) (12100,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [167]:
X_pred_70 = np.array(test_data["text_w2v_70"].tolist()).reshape(len(test_data["text_w2v_70"]),70)
Y_pred_70 = lr_70.predict(X_pred_70)

In [168]:
X_100 = np.array(train_data["text_w2v_100"].tolist()).reshape(len(train_data["text_w2v_100"]),100)
Y_100 = train_data["label"].to_numpy()
print(X_100.shape,Y_100.shape)
# X,Y = remove_nan(X,Y,vector_size=50)
# print(X.shape,Y.shape)
lr_100.fit(X_100,Y_100)

(12100, 100) (12100,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [169]:
X_pred_100 = np.array(test_data["text_w2v_100"].tolist()).reshape(len(test_data["text_w2v_100"]),100)
Y_pred_100 = lr_100.predict(X_pred_100)

In [170]:
X_120 = np.array(train_data["text_w2v_120"].tolist()).reshape(len(train_data["text_w2v_120"]),120)
Y_120 = train_data["label"].to_numpy()
print(X_120.shape,Y_120.shape)
# X,Y = remove_nan(X,Y,vector_size=50)
# print(X.shape,Y.shape)
lr_120.fit(X_120,Y_120)

(12100, 120) (12100,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [171]:
X_pred_120 = np.array(test_data["text_w2v_120"].tolist()).reshape(len(test_data["text_w2v_120"]),120)
Y_pred_120 = lr_120.predict(X_pred_120)

In [177]:
test_data["Y_pred_50"] = Y_pred_50
test_data["Y_pred_70"] = Y_pred_70
test_data["Y_pred_100"] = Y_pred_100
test_data["Y_pred_120"] = Y_pred_120

In [178]:
test_data

Unnamed: 0,text,text_segs,text_w2v_50,text_w2v_70,text_w2v_100,text_w2v_120,Y_pred_50,Y_pred_70,Y_pred_100,Y_pred_120
0,回放CCTV2的消费主张,回放 CCTV2 消费,"[0.036894213408231735, -0.09735322743654251, 0...","[0.15702958405017853, -0.02909264713525772, -0...","[-0.037246935069561005, 0.11769197136163712, 0...","[0.009291189722716808, 0.10824751853942871, 0....",Video-Play,Video-Play,Video-Play,Video-Play
1,给我打开玩具房的灯,打开 玩具 房 灯,"[0.053586602210998535, -0.09181060642004013, 0...","[0.1797219067811966, -0.005513092502951622, -0...","[-0.0409448966383934, 0.1435110867023468, 0.02...","[0.013976670801639557, 0.14177896082401276, 0....",HomeAppliance-Control,HomeAppliance-Control,HomeAppliance-Control,HomeAppliance-Control
2,循环播放赵本山的小品相亲来听,循环 播放 赵本山 小品 相亲 听,"[0.06058083102107048, -0.027346991002559662, -...","[0.24930541217327118, 0.07945393770933151, -0....","[-0.030159713700413704, 0.1654077023267746, -0...","[-0.01383722573518753, 0.12671895325183868, 0....",Music-Play,Music-Play,Music-Play,Music-Play
3,15号上午10点带孩子去海洋馆的行程帮我制定下。,15 号 上午 10 点带 孩子 海洋馆 行程 制定 。,"[0.10507967323064804, -0.3613370656967163, 0.1...","[0.30579033493995667, -0.15630006790161133, 0....","[-0.11538441479206085, 0.2785932123661041, 0.1...","[0.05573185160756111, 0.32761403918266296, 0.0...",Alarm-Update,Alarm-Update,Alarm-Update,Alarm-Update
4,把智能扫地机器人关掉,智能 扫地 机器人 关掉,"[0.07013531774282455, -0.10136409103870392, 0....","[0.20975342392921448, -0.004705200903117657, -...","[-0.05659368261694908, 0.1639595478773117, 0.0...","[0.012684157118201256, 0.1588975340127945, 0.0...",HomeAppliance-Control,HomeAppliance-Control,HomeAppliance-Control,HomeAppliance-Control
...,...,...,...,...,...,...,...,...,...,...
2995,是否能找一首2019年的抖音歌曲播放下呢,找 一首 2019 抖音 歌曲 播放,"[0.048687893897295, -0.06284000724554062, -0.0...","[0.4206104576587677, 0.10067712515592575, -0.3...","[-0.0762188658118248, 0.25765904784202576, 0.0...","[0.00036368612200021744, 0.2149602621793747, 0...",Music-Play,Music-Play,Music-Play,Music-Play
2996,下午三点有哪个台放大话西游吗帮我看下,下午 三点 台放 大话西游,"[0.10081005096435547, -0.16935856640338898, 0....","[0.19554200768470764, -0.038734305649995804, -...","[-0.051506027579307556, 0.166267529129982, 0.0...","[0.01222950592637062, 0.16824771463871002, 0.0...",Alarm-Update,Alarm-Update,Alarm-Update,Alarm-Update
2997,随机播放一首古筝弹奏的曲子可以吗,随机 播放 一首 古筝 弹奏 曲子,"[0.04146405681967735, -0.005537357181310654, -...","[0.304779976606369, 0.108954519033432, -0.2450...","[-0.041685521602630615, 0.16562624275684357, 5...","[0.0006796050001867115, 0.1478983461856842, 0....",Music-Play,Music-Play,Music-Play,Music-Play
2998,美食纪录片螃蟹的征途给我找一下,美食 纪录片 螃蟹 征途 找,"[0.04323325678706169, -0.07966790348291397, 0....","[0.21265466511249542, 0.012709632515907288, -0...","[-0.047112684696912766, 0.15232017636299133, 0...","[0.007619432173669338, 0.12938520312309265, 0....",Video-Play,Video-Play,Video-Play,Video-Play


In [179]:
#LR输出结果文件
#在https://competition.coggle.club/上的结果是0.531000
with open("results\\w2v\\LR\\50.txt","w") as f:
    f.write("ID,Target\n")
    for i in range(len(test_data)):
        f.write(str(i+1)+","+test_data.iloc[i]["Y_pred_50"]+"\n")
#在https://competition.coggle.club/上的结果是0.533333
with open("results\\w2v\\LR\\70.txt","w") as f:
    f.write("ID,Target\n")
    for i in range(len(test_data)):
        f.write(str(i+1)+","+test_data.iloc[i]["Y_pred_70"]+"\n")
#在https://competition.coggle.club/上的结果是0.519000
with open("results\\w2v\\LR\\100.txt","w") as f:
    f.write("ID,Target\n")
    for i in range(len(test_data)):
        f.write(str(i+1)+","+test_data.iloc[i]["Y_pred_100"]+"\n")
#在https://competition.coggle.club/上的结果是0.524667
with open("results\\w2v\\LR\\120.txt","w") as f:
    f.write("ID,Target\n")
    for i in range(len(test_data)):
        f.write(str(i+1)+","+test_data.iloc[i]["Y_pred_120"]+"\n")

In [180]:
svm_50 = LinearSVC(C=1.0)
svm_70 = LinearSVC(C=1.0)
svm_100 = LinearSVC(C=1.0)
svm_120 = LinearSVC(C=1.0)

In [181]:
svm_50.fit(X_50,Y_50)

LinearSVC()

In [182]:
svm_70.fit(X_70,Y_70)

LinearSVC()

In [183]:
svm_100.fit(X_100,Y_100)

LinearSVC()

In [184]:
svm_120.fit(X_120,Y_120)

LinearSVC()

In [185]:
Y_pred_50_svm = svm_50.predict(X_pred_50)

In [187]:
Y_pred_70_svm = svm_70.predict(X_pred_70)

In [188]:
Y_pred_100_svm = svm_100.predict(X_pred_100)

In [189]:
Y_pred_120_svm = svm_120.predict(X_pred_120)

In [190]:
test_data["Y_pred_50_SVM"] = Y_pred_50_svm
test_data["Y_pred_70_SVM"] = Y_pred_70_svm
test_data["Y_pred_100_SVM"] = Y_pred_100_svm
test_data["Y_pred_120_SVM"] = Y_pred_120_svm

In [191]:
#SVM输出结果文件
#在https://competition.coggle.club/上的结果是0.564333
with open("results\\w2v\\SVM\\50.txt","w") as f:
    f.write("ID,Target\n")
    for i in range(len(test_data)):
        f.write(str(i+1)+","+test_data.iloc[i]["Y_pred_50_SVM"]+"\n")
#在https://competition.coggle.club/上的结果是0.564000
with open("results\\w2v\\SVM\\70.txt","w") as f:
    f.write("ID,Target\n")
    for i in range(len(test_data)):
        f.write(str(i+1)+","+test_data.iloc[i]["Y_pred_70_SVM"]+"\n")
#在https://competition.coggle.club/上的结果是0.563667
with open("results\\w2v\\SVM\\100.txt","w") as f:
    f.write("ID,Target\n")
    for i in range(len(test_data)):
        f.write(str(i+1)+","+test_data.iloc[i]["Y_pred_100_SVM"]+"\n")
#在https://competition.coggle.club/上的结果是0.568000
with open("results\\w2v\\SVM\\120.txt","w") as f:
    f.write("ID,Target\n")
    for i in range(len(test_data)):
        f.write(str(i+1)+","+test_data.iloc[i]["Y_pred_120_SVM"]+"\n")

In [193]:
dt_50 = tree.DecisionTreeClassifier(criterion='gini', 
                                         max_depth=None,
                                         min_samples_leaf=1,
                                         ccp_alpha=0.0)
dt_70 = tree.DecisionTreeClassifier(criterion='gini', 
                                         max_depth=None,
                                         min_samples_leaf=1,
                                         ccp_alpha=0.0)
dt_100 = tree.DecisionTreeClassifier(criterion='gini', 
                                         max_depth=None,
                                         min_samples_leaf=1,
                                         ccp_alpha=0.0)
dt_120 = tree.DecisionTreeClassifier(criterion='gini', 
                                         max_depth=None,
                                         min_samples_leaf=1,
                                         ccp_alpha=0.0)

In [194]:
dt_50.fit(X_50,Y_50)

DecisionTreeClassifier()

In [195]:
dt_70.fit(X_70,Y_70)

DecisionTreeClassifier()

In [196]:
dt_100.fit(X_100,Y_100)

DecisionTreeClassifier()

In [197]:
dt_120.fit(X_120,Y_120)

DecisionTreeClassifier()

In [198]:
Y_pred_50_dt = dt_50.predict(X_pred_50)
Y_pred_70_dt = dt_70.predict(X_pred_70)
Y_pred_100_dt = dt_100.predict(X_pred_100)
Y_pred_120_dt = dt_120.predict(X_pred_120)

In [199]:
test_data["Y_pred_50_DT"] = Y_pred_50_dt
test_data["Y_pred_70_DT"] = Y_pred_70_dt
test_data["Y_pred_100_DT"] = Y_pred_100_dt
test_data["Y_pred_120_DT"] = Y_pred_120_dt

In [200]:
#DT输出结果文件
#在https://competition.coggle.club/上的结果是0.564333
with open("results\\w2v\\DT\\50.txt","w") as f:
    f.write("ID,Target\n")
    for i in range(len(test_data)):
        f.write(str(i+1)+","+test_data.iloc[i]["Y_pred_50_DT"]+"\n")
#在https://competition.coggle.club/上的结果是0.565000
with open("results\\w2v\\DT\\70.txt","w") as f:
    f.write("ID,Target\n")
    for i in range(len(test_data)):
        f.write(str(i+1)+","+test_data.iloc[i]["Y_pred_70_DT"]+"\n")
#在https://competition.coggle.club/上的结果是0.557333
with open("results\\w2v\\DT\\100.txt","w") as f:
    f.write("ID,Target\n")
    for i in range(len(test_data)):
        f.write(str(i+1)+","+test_data.iloc[i]["Y_pred_100_DT"]+"\n")
#在https://competition.coggle.club/上的结果是0.576667
with open("results\\w2v\\DT\\120.txt","w") as f:
    f.write("ID,Target\n")
    for i in range(len(test_data)):
        f.write(str(i+1)+","+test_data.iloc[i]["Y_pred_120_DT"]+"\n")

In [None]:
'''
综上实验结果：
    1. 词向量的维度会影响到模型精度吗？会的，测试了50,70,100,120维度的词向量，会影响到结果。LR的结果显示70的维度最好，SVM和DT的结果是120的维度最好
    2. 词向量编码后使用树模型和LR，谁的精度高，为什么？在我的结果中，树模型的结果比LR好。其原因我认为是因为，逻辑回归更适合解决线性问题，但是文本的分类
    一般来说不是线性的，因此使用树模型或者SVM效果更佳。
'''


In [219]:
#使用开源的word2vec词向量：来源https://github.com/Flywolfs/Chinese-Word-Vectors
word2vec_path = "word2vec\\sgns.zhihu.word\\sgns.zhihu.word"
zhi_vec_depth = 300

In [210]:
import numpy as np
zhihu_w2v = {}
with open(word2vec_path,"r",encoding="utf-8") as f:
    for line in f.readlines()[1:]:
        line = line.rstrip()
        seps = line.split(" ")
        word = seps[0]
        vec = seps[1:]
        zhihu_w2v[seps[0]] = np.array(vec,dtype=float)

In [222]:
import numpy as np
#平均句向量
def sent_vec_opensource(sent,w2v=None):
    sent_segs = sent.split()
    #对付NaN
    mean_sent_vec = np.zeros((300,),dtype=float)+0.00001
    for seg in sent_segs:
        if seg in w2v:
            mean_sent_vec += w2v[seg]
    #对付inf,因为有的句子分词+去停词后为空字符串
    if len(sent_segs) == 0:
        mean_sent_vec = mean_sent_vec
    else:
        mean_sent_vec = mean_sent_vec/len(sent_segs)
    mean_sent_vec = mean_sent_vec.tolist()
    return mean_sent_vec

In [223]:
train_data["text_w2v_zhihu"] = train_data["text_segs"].apply(sent_vec_opensource,w2v=zhihu_w2v)

In [224]:
test_data["text_w2v_zhihu"] = test_data["text_segs"].apply(sent_vec_opensource,w2v=zhihu_w2v)

In [226]:
svm_zhihu = LinearSVC(C=1.0)

In [227]:
X_300 = np.array(train_data["text_w2v_zhihu"].tolist()).reshape(len(train_data["text_w2v_zhihu"]),300)
Y_300 = train_data["label"].to_numpy()
print(X_300.shape,Y_300.shape)
# X,Y = remove_nan(X,Y,vector_size=50)
# print(X.shape,Y.shape)
svm_zhihu.fit(X_300,Y_300)
X_pred_300 = np.array(test_data["text_w2v_zhihu"].tolist()).reshape(len(test_data["text_w2v_zhihu"]),300)
Y_pred_300_svm_zhihu = svm_zhihu.predict(X_pred_300)

(12100, 300) (12100,)


In [228]:
test_data["Y_pred_300_SVM_ZHIHU"] = Y_pred_300_svm_zhihu

In [229]:
#在https://competition.coggle.club/上的结果是0.747000
with open("results\\w2v\\SVM\\300.txt","w") as f:
    f.write("ID,Target\n")
    for i in range(len(test_data)):
        f.write(str(i+1)+","+test_data.iloc[i]["Y_pred_300_SVM_ZHIHU"]+"\n")