In [0]:
## Action 1 - Xlearn - FM
# !pip install xlearn
import pandas as pd
import numpy as np
import xlearn as xl
from sklearn.model_selection import train_test_split
from sklearn.datasets import dump_svmlight_file, load_svmlight_file

#Add this to avoid std::logic_error
import os
os.environ['USER'] = 'test' 

def split_data(data,train_size):
  n = int(train_size * len(data))
  return data[:n],data[n:]

def convert_df_to_libsvm(df,f):
  X = df.drop(columns=['rating'])
  y = df.rating
  dump_svmlight_file(X, y, f)

## Step 1. 准备数据
df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/datasets/ratings_small.txt",usecols=['userId','movieId','rating'])
train_data,test_data = split_data(df,0.7)
convert_df_to_libsvm(train_data,'small_train.txt')
convert_df_to_libsvm(test_data,'small_test.txt')


## Step 2. Train model
model = xl.create_fm()
model.setTrain('./small_train.txt')

param = {'task':'reg', 
         'lr':0.001, 
         'lambda':0.0002,
         'epoch':5000,
         'metric':'rmse',
         'fold':10}

model.setTXTModel("./model.txt")
model.fit(param, "./model.out")

model.setTest("./small_test.txt")
model.predict("./model.out", "./output.txt")

## Step 3. Evaluate model
y_pred = pd.read_csv("output.txt",header=None)
y_pred = np.array(y_pred).ravel()

y_val = pd.read_csv("small_test.txt",sep=' ',header=None)
y_val = y_val.drop(y_val.columns[1:],axis=1)
y_val = np.array(y_val).ravel()

from sklearn.metrics import mean_squared_error,r2_score
for i in range(10):
  print(y_pred[i]," : ",y_val[i])
print('MSE is : %lf' % mean_squared_error(y_pred,y_val))
print('R^2 is : %lf' % r2_score(y_pred,y_val))

3.5661300000000002  :  3.0
3.56057  :  5.0
3.5447900000000003  :  4.0
3.53675  :  4.0
3.53407  :  3.5
3.5312699999999997  :  4.0
3.5099400000000003  :  5.0
3.5069  :  5.0
3.5016  :  5.0
3.49044  :  3.5
MSE is : 1.086394
R^2 is : -39.164734


In [0]:
## Action 2 - DeepFM
#!pip install deepctr
import pandas as pd
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from deepctr.models import DeepFM
from deepctr.inputs import SparseFeat,get_feature_names

#数据加载
data = pd.read_csv("/content/drive/My Drive/Colab Notebooks/datasets/movielens_sample.txt")
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
target = ['rating']

# 对特征标签进行编码
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])

# # 计算每个特征中的 不同特征值的个数
fixlen_feature_columns = [SparseFeat(feature, data[feature].nunique(),embedding_dim=10) for feature in sparse_features]
# print(fixlen_feature_columns)
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
# print(feature_names)

## 将数据集切分成训练集和测试集
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

## 使用DeepFM进行训练
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression', dnn_dropout=0.1)
model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=50, verbose=True, validation_split=0.2)
## 使用DeepFM进行预测
pred_ans = model.predict(test_model_input, batch_size=256)

## 输出RMSE或MSE
mse = round(mean_squared_error(test[target].values, pred_ans), 4)
rmse = mse ** 0.5
print("test RMSE", rmse)
r_score = round(r2_score(test[target].values, pred_ans), 4)
print("test R^2", r_score)

# for i in range(30):
#   print(test[target].values[i],pred_ans[i])

Epoch 1/50


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
test RMSE 1.3991425945914162
test R^2 -0.4039


In [234]:
## Action 3 - Word2Vec
# -*-coding: utf-8 -*-
# 对txt文件进行中文分词
import jieba.analyse
from jieba import posseg as psg
import os
from gensim import models
from gensim.test.utils import datapath
from gensim import utils
import smart_open
from time import time

## Step1. Prepare text data
def cut_words(file,out_name,stopwords,noun_only=False):
  with open(file, 'rb') as f:
      document = f.read()
      print(document)
      t1 = time()
      
      if noun_only:
      #Option 2 只读取人名词汇，用来做下面的fitler
        allow_pos = ('nr','PER') 
        document_cut = jieba.analyse.extract_tags(document, topK=100000, withWeight=False, allowPOS=allow_pos)
      else:
      #Option 1 读取所有词汇
        document_cut = jieba.cut(document)
      
      sentence_segment=[]
      for word in document_cut:
          if word not in stopwords:
              sentence_segment.append(word)
      
      #Write files  
      result = ' '.join(sentence_segment)
      result = result.encode('utf-8')
      with open(out_name, 'wb') as f2:
          f2.write(result)
      f.close()
      print(time()-t1)

stopwords = ['却说','不可','如此','不能','如何','于是','今日','次日','何不','何故','不如','正是','可以']
file = '/content/drive/My Drive/Colab Notebooks/datasets/three_kingdoms.txt'
# cut_words(file,'./three_kingdoms_segment.txt',stopwords,noun_only=False)  #This is the full text for training
# cut_words(file,'./three_kingdoms_segment_noun.txt',stopwords,noun_only=True)  #This is the nouns used for filtering


# ## Step 2. Build Model
## A memory-friendly way to read large corpus
class MyCorpus(object):
    """An interator that yields sentences (lists of str)."""

    def __iter__(self):
        corpus_path = datapath('/content/three_kingdoms_segment.txt')
        for line in smart_open.open(corpus_path):
            # assume there's one document per line, tokens separated by whitespace
            yield utils.simple_preprocess(line)

# sentences = MyCorpus()
# model = models.Word2Vec(
#     sentences=sentences,
#     size=200,  #Embeddding Dimension
#     window = 20, 
#     min_count=2,
#     compute_loss=True,
#     sg=0, #0 = CBOW, 1 = Skip-Gram
#     hs=0, 
#     negative=5,
#     workers=4,
#     seed=42)
# model.save('three-kingdoms')

## Load Model
### 张飞+曹操-刘备 != 曹营猛将，感觉不太好###
model = models.Word2Vec.load('three-kingdoms')
print('model train loss: ',model.get_latest_training_loss())
print('出现最多的词汇：',model.wv.index2entity[:100]) #print the most frequent 100 words
print(model.wv.similarity('曹操','玄德')) #print similarity between words
print('与袁绍最近似10个词：',model.wv.most_similar(positive=['袁绍'],topn=10)) #袁绍？？
print('\n','#'*25,'全部词汇的相似词汇 - 看起来很奇怪，逻辑不清晰','#'*25)
print('曹操+刘备-张飞:',model.wv.most_similar(positive=['曹操','刘备'],negative=['张飞'])) #曹操+刘备-张飞
print('张飞+曹操-刘备:',model.wv.most_similar(positive=['张飞','曹操'],negative=['刘备'])) #张飞+曹操-刘备 != 曹营猛将?
print('孔明+孙权-曹操:',model.wv.most_similar(positive=['孔明','孙权'],negative=['曹操'])) #孔明+孙权-曹操 != 东吴谋士？


### 如果我们仅看人名，是不是能得到张飞-刘备+曹操=魏将？ 试试看 ###
print('\n','#'*25,'人名only的相似词汇-看起来清楚很多','#'*25)

##人词fitler
def name_filter(words):
  file = smart_open.open('/content/three_kingdoms_segment_noun.txt')
  nouns_filter = next(file).split(' ')
  words_filtered = [[word,weight] for word,weight in words if word in nouns_filter]
  return words_filtered

words = model.wv.most_similar(positive=['张飞','曹操'],negative=['刘备'],topn=100)
words_filtered = name_filter(words)
print('张飞+曹操-刘备: - 魏延，庞德，黄忠，夏侯都是还可以的答案')
print(words_filtered)

words = model.wv.most_similar(positive=['孔明','孙权'],negative=['曹操'],topn=100)
words_filtered = name_filter(words)
print('孔明+孙权-曹操: - 这个结果还挺好，东吴/其他势力军师类人物排名靠前')
print(words_filtered)




  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


model train loss:  648674.3125
出现最多的词汇： ['曹操', '孔明', '将军', '玄德', '关公', '丞相', '二人', '荆州', '孔明曰', '玄德曰', '张飞', '商议', '主公', '军士', '吕布', '左右', '军马', '引兵', '刘备', '大喜', '孙权', '云长', '天下', '赵云', '东吴', '不敢', '魏兵', '陛下', '司马懿', '人马', '都督', '周瑜', '一人', '不知', '汉中', '众将', '只见', '后主', '袁绍', '蜀兵', '马超', '大叫', '上马', '魏延', '此人', '先主', '太守', '天子', '后人', '背后', '黄忠', '一面', '城中', '忽报', '大军', '先生', '然后', '先锋', '夫人', '诸葛亮', '姜维', '赶来', '原来', '令人', '江东', '徐州', '忽然', '下马', '喊声', '因此', '成都', '百姓', '未知', '大败', '大事', '之后', '一军', '不见', '起兵', '马岱', '接应', '引军', '军中', '进兵', '庞德', '孟获', '大怒', '心中', '正文', '以为', '分节', '阅读', '大惊', '不得', '刘表', '下文', '追赶', '粮草', '一声', '分解']
0.9999775
与袁绍最近似10个词： [('曹操', 0.9999960660934448), ('大喜', 0.9999946355819702), ('不肯', 0.9999943971633911), ('然后', 0.9999943971633911), ('诸葛亮', 0.9999942779541016), ('商议', 0.9999942183494568), ('朝廷', 0.9999938607215881), ('天子', 0.9999935626983643), ('之计', 0.9999934434890747), ('以为', 0.9999934434890747)]

 ######################### 全部词汇的相似词汇 - 看起来很奇怪，逻辑不清

  if np.issubdtype(vec.dtype, np.int):
