In [2]:
## import pandas as pd
import csv
import os
import re
import pandas as pd
import warnings
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time 
%matplotlib inline

from sklearn import ensemble
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import lightgbm as lgb
import random
import zipfile

# STEP1 ：数据预处理

In [3]:
#处理文件
def preprocessing(pathIN,pathOUT,head):
    print("===========处理开始===========")
    lines=pathIN.readlines()
    writerfile=csv.writer(pathOUT)
    writerfile.writerow(head)
    for line in lines:
        writerfile.writerow(line.strip("\n").split("\x01"))
    print("===========处理结束============")


    
#计算微博传播规模与传播深度
def caculate_width(data):
    width_list=data.groupby(["wbID"])["posted"].nunique()
    return width_list


def create_graph(data1,wb_list):
    graph_list1={}
    #num=1
    for i in wb_list:
        data=data1[data1["wbID"]==i]
        posted_list=list(data.posted.unique())
        graph_list2={}
        for posted in posted_list:
            post=list(data[data["posted"]==posted]["post"].unique())
            graph_list2[posted]=post
        graph_list1[i]=graph_list2
        #print(num)
       # num+=1
    return graph_list1

class Vertex:
    def __init__(self,num):
        self.id=num
        self.connectedTo=[]
        self.dist=0
        
    def addNeighbor(self,nbr):
        self.connectedTo.append(nbr)
        
    def setDistance(self,d):
        self.dist=d
    
    def getDistance(self):
        return self.dist
    
    def __str__(self):
        return str(self.id)+" color: " +self.color+"dist:"+str(
        self.dist)+" connectedTo:"+str([x.id for x in self.connectedTo])
    
    
class Graph:
    
    def __init__(self):
        self.vertices={}
        
    def addVertex(self,key):
        newVertex=Vertex(key)
        self.vertices[key]=newVertex
        return newVertex
    
    def getVertex(self,n):
        vertex=self.vertices.get(n)
        return vertex
    
    def __iter__(self):
        return iter(self.vertices.values())
    
def bfs(start):
    start.setDistance(1)
    distance=1
    vertlist=[]
    vertlist.append(start)
    
    while (len(vertlist)>0):
        currentVert=vertlist.pop()
        for nbr in currentVert.connectedTo:
            nbr.setDistance(currentVert.getDistance()+1)
            if nbr not in vertlist:
                vertlist.insert(0,nbr)
        if distance <currentVert.getDistance():
            distance=currentVert.getDistance()
    return distance

def caculate_depth(a,graph_list1,wb_list):
    depth_list=[]
    #num=1
    for i in wb_list:
        posted=a[a["wbID"]==i]["posted"].values[0]
        graph=Graph()
       # print(num)
       # num+=1
        if graph_list1[i]=={}:
            depth_list.append(0)
            continue  
        
        if posted not in list(graph_list1[i].keys()):
            depth_list.append(1)
        else:
            for key,value in graph_list1[i].items():
                vertex=graph.addVertex(key)
            key_list=[]
            for key,values in graph_list1[i].items():
                vertex=graph.getVertex(key)
                key_list.append(key)
                for value in values:
                    ner_vertex=graph.getVertex(value)
                    if (ner_vertex != None)&(value not in key_list):
                        vertex.addNeighbor(ner_vertex)
            depth_list.append(bfs(graph.vertices[posted]))

    return depth_list

# STEP2 ：特征工程

In [4]:
#构建用户粉丝表
azip = zipfile.ZipFile('./data/userRelations.zip')
f1 = azip.open('weibo_dc_parse2015_link_filter')
path = pd.read_csv(f1, nrows=500000)
relation_list={}
for line in path:
    fan=line.strip("\n").split("\t")[0]
    star=line.strip("\n").split("\t")[1].split("\x01")
    for i in star:
        if i not in relation_list.keys():
            relation_list[i]=[]
            relation_list[i].append(fan)
        else:
            relation_list[i].append(fan)

stars=list(relation_list.keys())
fans_number=[]
for star in stars:
    fans_number.append(len(relation_list[star]))
fans_number_list=pd.DataFrame({"user":stars,"fans_number":fans_number})

#print(type(fans_number_list))

fans_number_list.to_csv('./data/fans_number_list.csv')
fans_number_list = pd.read_csv('./data/fans_number_list.csv')

In [5]:
#fp=open("context.txt","r",encoding="utf-8")
def Features1(context,fp,fans_number_list):              
    #加入各个用户的粉丝数量作为特征
    features=context[["wbID","posted"]]
    features=features.merge(fans_number_list,left_on="posted",right_on="user",how="left")
    
    #time
    features["hour"]=context.time.apply(lambda x:int(x.split(":")[0]))
    
    #发布时间距离零点的分钟数
    time=pd.DataFrame({"wbID":context["wbID"],"time":pd.to_datetime(context["time"])})
    a=pd.to_datetime("2019-05-27 00:00:00")  #时间根据运行当天的日期改一下
    b=pd.to_datetime("2019-05-28 00:00:00")
    distance_0=[]
    for i in list(time["time"]):
        distance_0.append(min(abs((i-a).seconds/60),abs((b-i).seconds/60)))
    time["distance_0"]=distance_0
    features=features.merge(time[["wbID","distance_0"]],on="wbID",how="left")
    
    #源微博文本中包含一些特殊符号或链接的数量
    topic_num=[]
    link_num=[]
    name_num=[]
    book_num=[]
    emoji_num=[]
    at_num=[]

    topic=re.compile("#([^@]+?)#")
    link=re.compile("([hH][tT][tT][pP][sS]?:\\/\\/[^ ,‘\">\\]\\)]*[^\\. ,‘\">\\]\\)])")
    emoji=re.compile("\\[([^@]+?)\\]")
    book=re.compile("《([^@]+?)》")
    at=re.compile("@[0-9a-zA-Z\\u4e00-\\u9fa5]+")
    name=re.compile("【([^@]+?)】")
    for line in fp.readlines():
        topic_num.append(len(topic.findall(line)))
        link_num.append(len(link.findall(line)))
        emoji_num.append(len(emoji.findall(line)))
        name_num.append(len(name.findall(line)))
        book_num.append(len(book.findall(line)))
        at_num.append(len(at.findall(line)))
    features["topic_num"]=topic_num
    features["link_num"]=link_num
    features["emoji_num"]=emoji_num
    features["name_num"]=name_num
    features["book_num"]=book_num
    features["at_num"]=at_num
    return features

def features2(repost_new,fans_number_list,features):
    #统计15、30、45、60分钟这几个时刻的发布者粉丝数量
    for time in [15,30,45,60]:
        data=repost_new[repost_new["time"]<=time*60]  
        data = data.merge(fans_number_list,left_on='post',right_on='user')
        fans_sum=pd.DataFrame(data.groupby(["wbID"])["fans_number"].sum()).reset_index()
        fans_mean=pd.DataFrame(data.groupby(["wbID"])["fans_number"].mean()).reset_index()
        fans_max=pd.DataFrame(data.groupby(["wbID"])["fans_number"].max()).reset_index()
        fans_sum.columns=["wbID","fans_sum_%d"%(time)] 
        fans_mean.columns=["wbID","fans_mean_%d"%(time)]
        fans_max.columns=["wbID","fans_max_%d"%(time)]
        features=features.merge(fans_sum,on="wbID",how="left").merge(fans_mean,on="wbID",how="left").merge(fans_max,on="wbID",how="left")
    
    
    #print(features)
    #统计截至一个小时源微博发布者与转发者粉丝之间的差异
    features["fans_mean_diff"]=features["fans_number"]-features["fans_mean_60_y"]
    features["fans_max_diff"]=features["fans_number"]-features["fans_max_60"]
    features["fans_sum_diff"]=features["fans_number"]-features["fans_sum_60"]
    return features

def features3(wb_list,repost_new,a,features):
    depth=pd.DataFrame({"wbID":wb_list})
    width=pd.DataFrame({"wbID":wb_list})
    for i in range(15,75,15):
        data=repost_new[repost_new["time"]<=i*60]
        graph_list1=create_graph(data,wb_list)
        depth_list1=caculate_depth(a,graph_list1,wb_list)
        depth["depth_%d_min"%(i)]=depth_list1
        width_list1=caculate_width(a)
        width=width.merge(width_list1,on="wbID",how="left")
    width.columns=["wbID","width_15_min","width_30_min","width_45_min","width_60_min"]
    features=features.merge(depth,on="wbID",how="left").merge(width,on="wbID",how="left")
    return features

# STEP3 :模型优化

In [6]:
def ModelOptimization(model, params, x_train, y_train):
    """
        模型优化：用梯度搜索方法调参，找到模型的最优参数
        参数：model代表模型，本案例中使用XGBoost
    
    """
    
    print("Model Optimizatioin Start")
    x_train = x_train.fillna(-1)
    best_params = []
    for param in params:
        print("Optimize param", param, "...")
        cv = GridSearchCV(estimator = model, param_grid = param, scoring = "r2", cv = 3, n_jobs = -1)
        cv.fit(x_train, y_train)
        best_params.append(cv.best_params_)
    print("Model Optimizatioin Done")
    return best_params



# STEP4 :模型评估

In [7]:
def ModelEvaluation(model_width,model_depth, x_train, y_train):
    """
        模型评估：评估模型效果，
        参数：model_width代表微博规模预测模型，model_depth代表微博深度预测模型，x_train、y_train分别代表训练集特征和含标签的数据集
        注意：在评估lgb时，将该函数加入一个parameters参数
    """
    print("Model Evaluation Start")
    weiboID=list(x_train["wbID"].unique())
    testID=random.sample(weiboID,int(len(weiboID)*0.3))
    trainID=[i for i in weiboID if i not in testID ]
    
    # 计算MPAE
    print("Compute MAPE_width...")
    x_tr, x_te = x_train[x_train["wbID"].isin(trainID)],x_train[x_train["wbID"].isin(testID)]
    y_tr, y_te = y_train[y_train["wbID"].isin(trainID)],y_train[y_train["wbID"].isin(testID)]
    #dtrain=model_width.Dataset(data=x_tr.drop("wbID",axis=1),label=y_tr["y_width"])  #当评估lgb时使用这一段代码
    #dvalid=model_width.Dataset(data=x_te.drop("wbID",axis=1),label=y_te["y_width"])  #当评估lgb时使用这一段代码
    #clf=model_width.train(parameters,dtrain,num_boost_round=2000,verbose_eval=50)    #当评估lgb时使用这一段代码
    #y_pred_width = clf.predict(x_te.drop("wbID",axis=1))                           #当评估lgb时使用这一段代码
    model_width.fit(x_tr.drop("wbID",axis=1),y_tr["y_width"])           #当评估lgb时不使用这段代码
    y_pred_width = model_width.predict(x_te.drop("wbID",axis=1))      #当评估lgb时不使用这段代码
    y_te["y_pred_width"]=y_pred_width
    y_te["mae_width"]=abs(y_te["y_pred_width"]-y_te["y_width"])/y_te["y_pred_width"]
    mape_width=np.mean(y_te.groupby(["time"])["mae_width"].mean())
    
    print("Compute MAPE_depth...")
    model_depth.fit(x_tr.drop("wbID",axis=1), y_tr["y_depth"])       #当评估lgb时不使用这段代码
    y_pred_depth = model_depth.predict(x_te.drop("wbID",axis=1))      #当评估lgb时不使用这段代码
    #clf2=model_depth.train(parameters,dtrain,num_boost_round=2000,verbose_eval=50)  #当评估lgb时使用这段代码
    #y_pred_depth = clf2.predict(x_te.drop("wbID",axis=1))                           #当评估lgb时使用这段代码
    y_te["y_pred_depth"]=y_pred_depth
    y_te["mae_depth"]=abs(y_te["y_pred_depth"]-y_te["y_depth"])/y_te["y_pred_depth"]
    mape_depth=np.mean(y_te.groupby(["time"])["mae_depth"].mean())
    
    mape=0.7*mape_width+0.3*mape_depth
    print("Model Evaluation Done")
    return mape,y_pred_width,y_te["y_width"],y_pred_depth,y_te["y_depth"]

# 数据预处理

In [None]:
#将trainRepost.txt转化为数据框格式
pathIN=open("./data/trainRepost.txt","r",encoding="utf-8")
pathOUT=open("./data/trainRepost_new.csv","w",encoding="utf-8")
head=["wbID","posted","post","time","content"]
preprocessing(pathIN,pathOUT,head)

#将weibofrofile.train转化为数据框格式
pathIN_1=open("./data/WeiboProfile.train","r",encoding="utf-8")
pathOUT_1=open("./data/trainProfile_new.csv","w",encoding="utf-8")
head_1=["wbID","posted","time","context"]
preprocessing(pathIN_1,pathOUT_1,head_1)

#将内容一行一行的提取出来形成txt文档
csv_data = pd.read_csv('./data/trainProfile_new.csv')

column = csv_data['context']
f = open('./data/context.txt', 'w',encoding='utf-8')

for i in column:
    f.writelines(i)
    f.writelines('\n')

f.close()



In [10]:
#read data
path=open("./data/trainRepost_new.csv","r",encoding="utf-8")
repost=pd.read_csv(path, nrows=500000)
repost_new=repost.sort_values(["wbID","time"],ascending=True)
a=pd.read_csv("./data/trainProfile_new.csv",encoding="utf-8")
wb_list=list(set(list(repost_new.wbID.unique())).intersection(set(list(a.wbID.unique()))))


#calulate depth & width
label_depth=pd.DataFrame({"wbID":wb_list})
label_width=pd.DataFrame({"wbID":wb_list})

#缩减时间
for i in range(75, 90, 15):
#for i in range(75,4395,15):
    data=repost_new[repost_new["time"]<=i*60]
    graph_list1=create_graph(data,wb_list)
    depth_list1=caculate_depth(a,graph_list1,wb_list) #caculate_depth函数详见上述代码
    width_list1=caculate_width(data)#caculate_width函数详见上述代码
    label_depth[i]=depth_list1
    label_width=label_width.merge(width_list1,on="wbID",how="left")
    print("已经处理的时刻：",i)

# write data
label_depth.to_csv("./data/label_depth.csv","r",encoding="utf-8")
label_width.to_csv("./data/label_width.csv","r",encoding="utf-8")

已经处理的时刻： 75


# 特征工程&模型评估

In [22]:
label_depth=label_depth.set_index("wbID").stack().reset_index()
label_width=label_width.set_index("wbID").stack().reset_index()
label_depth.columns=["wbID","time","y_depth"]
label_width.columns=["wbID","time","y_width"]

#print(label_depth[0:100000])
#print(label_width[0:100000])

#label_depth.to_csv('./data/1.csv')
#label_depth = pd.read_csv('./data/1.csv')

#label_width.to_csv('./data/2.csv')
#label_width = pd.read_csv('./data/2.csv')

label_depth['wbID'] = pd.to_numeric(label_depth['wbID'], errors='coerce')
label_depth['time'] = pd.to_numeric(label_depth['time'], errors='coerce')
label_width['wbID'] = pd.to_numeric(label_width['wbID'], errors='coerce')
label_width['time'] = pd.to_numeric(label_width['time'], errors='coerce')

#print(type(label_depth))
#print(label_depth['wbID'])


y_train=pd.merge(label_depth,label_width,on=["wbID","time"],how="outer")

context=pd.read_csv("./data/trainProfile_new.csv",encoding="utf-8")
fp=open("./data/context.txt","r",encoding="utf-8")


features=Features1(context,fp,fans_number_list)

#features.to_csv('./data/2.csv')
#features = pd.read_csv('./data/2.csv')

train_data=y_train.merge(features,on="wbID",how="left")
train_data=train_data[train_data["y_depth"]!=0]

x_train1=train_data[['wbID','hour', 'fans_number',  'distance_0', 'topic_num', 'link_num','name_num',  'emoji_num', "time"]]
#print(x_train1)
#x_train1=train_data[['wbID','hour', 'fans_number',  'distance_0', 'topic_num', 'link_num','name_num', 'other_num', 'emoji_num', 'other1_num',"time"]]
y_train1=train_data[['wbID',"y_width","y_depth","time"]]
x_train1["fans_number"]=x_train1["fans_number"].fillna(x_train1["fans_number"].mean())

x_train1["time"]=pd.to_numeric(x_train1["time"])
x_train1 = x_train1.fillna(0)
y_train1 = y_train1.fillna(1)

rfr1= ensemble.RandomForestRegressor()
rfr2= ensemble.RandomForestRegressor()


mape_rf1,y_pred_width_rf1,y_width_rf1,y_pred_depth_rf1,y_depth_rf1=ModelEvaluation(rfr1,rfr2,x_train1, y_train1)
print(mape_rf1)

xgb1=XGBRegressor()
xgb2=XGBRegressor()
mape_xgb1,y_pred_width_xgb1,y_width1,y_pred_depth_xgb1,y_depth1=ModelEvaluation(xgb1,xgb2,x_train1, y_train1)
print(mape_xgb1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Model Evaluation Start
Compute MAPE_width...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Compute MAPE_depth...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Model Evaluation Done
1.2384863905701613
Model Evaluation Start
Compute MAPE_width...


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Compute MAPE_depth...
Model Evaluation Done
0.8135881583395294


In [24]:
'''
parameters = {
    'application': 'regression',
    'objective': "regression",
    'metric': 'mse',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0
}
model_width=lgb
model_depth=lgb

mape_lgb,y_pred_width_lgb,y_te_width_lgb,y_pred_depth_lgb,y_te_depth_lgb=ModelEvaluation(model_width,model_depth, x_train1, y_train1,parameters)
print("score:",mape_lgb)

'''

'\nparameters = {\n    \'application\': \'regression\',\n    \'objective\': "regression",\n    \'metric\': \'mse\',\n    \'is_unbalance\': \'true\',\n    \'boosting\': \'gbdt\',\n    \'num_leaves\': 31,\n    \'feature_fraction\': 0.5,\n    \'bagging_fraction\': 0.5,\n    \'bagging_freq\': 20,\n    \'learning_rate\': 0.05,\n    \'verbose\': 0\n}\nmodel_width=lgb\nmodel_depth=lgb\n\nmape_lgb,y_pred_width_lgb,y_te_width_lgb,y_pred_depth_lgb,y_te_depth_lgb=ModelEvaluation(model_width,model_depth, x_train1, y_train1,parameters)\nprint("score:",mape_lgb)\n\n'

In [60]:
repost_new=pd.read_csv("./data/trainRepost_new.csv", nrows=500000,encoding="utf-8")

#print(repost_new)
#print(fans_number_list)

features=features2(repost_new,fans_number_list,features)
train_data=y_train.merge(features,on="wbID",how="left")
train_data=train_data[train_data["y_depth"]!=0]

#print(train_data)


x_train2=train_data[['wbID','hour', 'fans_number', 'fans_sum_30',
       'fans_mean_30', 'fans_max_30', 'fans_sum_15', 'fans_mean_15',
      'fans_max_15', 'fans_sum_45', 'fans_mean_45', 'fans_max_45',
       'fans_sum_60', 'fans_mean_60', 'fans_max_60', 'fans_mean_diff',
       'fans_max_diff', 'fans_sum_diff', 'distance_0', 'topic_num', 'link_num',
       'name_num', 'emoji_num',"time"]]

y_train2=train_data[['wbID',"y_width","y_depth","time"]]
x_train2[['fans_sum_30', 'fans_mean_30', 'fans_max_30', 'fans_sum_15', 'fans_mean_15',
      'fans_max_15', 'fans_sum_45', 'fans_mean_45', 'fans_max_45',
       'fans_sum_60', 'fans_mean_60', 'fans_max_60', 'fans_mean_diff',
       'fans_max_diff', 'fans_sum_diff',"fans_number"]]=x_train2[['fans_sum_30', 'fans_mean_30', 'fans_max_30', 'fans_sum_15', 'fans_mean_15',
      'fans_max_15', 'fans_sum_45', 'fans_mean_45', 'fans_max_45',
       'fans_sum_60', 'fans_mean_60', 'fans_max_60', 'fans_mean_diff',
       'fans_max_diff', 'fans_sum_diff',"fans_number"]].fillna(x_train2[['fans_sum_30', 'fans_mean_30', 'fans_max_30', 'fans_sum_15', 'fans_mean_15',
      'fans_max_15', 'fans_sum_45', 'fans_mean_45', 'fans_max_45',
       'fans_sum_60', 'fans_mean_60', 'fans_max_60', 'fans_mean_diff',
       'fans_max_diff', 'fans_sum_diff',"fans_number"]].mean())
x_train2["time"]=pd.to_numeric(x_train2["time"])
x_train2 = x_train2.fillna(0)
y_train2 = y_train2.fillna(1)

rfr1= ensemble.RandomForestRegressor()
rfr2= ensemble.RandomForestRegressor()
mape_rf2,y_pred_width_rf2,y_width_rf2,y_pred_depth_rf2,y_depth_rf2=ModelEvaluation(rfr1,rfr2,x_train2, y_train2)
print(mape_rf2)

xgb1=XGBRegressor()
xgb2=XGBRegressor()
mape_xgb2,y_pred_width_xgb2,y_width2,y_pred_depth_xgb2,y_depth2=ModelEvaluation(xgb1,xgb2,x_train2, y_train2)
print(mape_xgb2)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Model Evaluation Start
Compute MAPE_width...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Compute MAPE_depth...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Model Evaluation Done
1.2319708161220575
Model Evaluation Start
Compute MAPE_width...


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Compute MAPE_depth...
Model Evaluation Done
0.8190815210092893


In [None]:
'''
mape_lgb2,y_pred_width_lgb2,y_te_width_lgb2,y_pred_depth_lgb2,y_te_depth_lgb2=ModelEvaluation(model_width,model_depth, x_train2, y_train2,parameters)
print("score:",mape_lgb2)

'''

In [99]:
repost_new=pd.read_csv("./data/trainRepost_new.csv", nrows=500000,encoding="utf-8")
wb_list=list(set(context["wbID"]).intersection(repost_new["wbID"]))
features=features3(wb_list,repost_new,context,features)
train_data=y_train.merge(features,on="wbID",how="left")
train_data=train_data[train_data["y_depth"]!=0]

#print(train_data)


x_train3=train_data[['wbID','hour', 'y_width', 'y_depth',
       'fans_number', 'width_15_min_y', 'depth_15_min_y', 'width_30_min_y',
       'depth_30_min_y', 'width_45_min_y', 'depth_45_min_y', 'fans_sum_30',
       'fans_mean_30', 'fans_max_30', 'fans_sum_15', 'fans_mean_15',
      'fans_max_15', 'fans_sum_45', 'fans_mean_45', 'fans_max_45',
       'fans_sum_60', 'fans_mean_60', 'fans_max_60', 'fans_mean_diff',
       'fans_max_diff', 'fans_sum_diff', 'distance_0', 'topic_num', 'link_num',
       'name_num', 'emoji_num',"time"]]
y_train3=train_data[['wbID',"y_width","y_depth","time"]]
x_train3[["depth_15_min","depth_30_min","depth_45_min","depth"]]=x_train3[["depth_15_min_y","depth_30_min_y","depth_45_min_y","y_depth"]].T.drop_duplicates().T.fillna(1)
x_train3[['fans_sum_30', 'fans_mean_30', 'fans_max_30', 'fans_sum_15', 'fans_mean_15',
      'fans_max_15', 'fans_sum_45', 'fans_mean_45', 'fans_max_45',
       'fans_sum_60', 'fans_mean_60', 'fans_max_60', 'fans_mean_diff',
       'fans_max_diff', 'fans_sum_diff',"fans_number"]]=x_train3[['fans_sum_30', 'fans_mean_30', 'fans_max_30', 'fans_sum_15', 'fans_mean_15',
      'fans_max_15', 'fans_sum_45', 'fans_mean_45', 'fans_max_45',
       'fans_sum_60', 'fans_mean_60', 'fans_max_60', 'fans_mean_diff',
       'fans_max_diff', 'fans_sum_diff',"fans_number"]].fillna(x_train3[['fans_sum_30', 'fans_mean_30', 'fans_max_30', 'fans_sum_15', 'fans_mean_15',
      'fans_max_15', 'fans_sum_45', 'fans_mean_45', 'fans_max_45',
       'fans_sum_60', 'fans_mean_60', 'fans_max_60', 'fans_mean_diff',
       'fans_max_diff', 'fans_sum_diff',"fans_number"]].mean())

x_train3["time"]=pd.to_numeric(x_train3["time"])
x_train3 = x_train3.fillna(0)
y_train3=y_train3.fillna(1)

rfr1= ensemble.RandomForestRegressor()
rfr2= ensemble.RandomForestRegressor()
x_train3 = x_train3.T.drop_duplicates().T
mape_rf3,y_pred_width_rf3,y_width_rf3,y_pred_depth_rf3,y_depth_rf3=ModelEvaluation(rfr1,rfr2,x_train3, y_train3)
print(mape_rf3)

xgb1=XGBRegressor()
xgb2=XGBRegressor()
mape_xgb3,y_pred_width_xgb3,y_width3,y_pred_depth_xgb3,y_depth3=ModelEvaluation(xgb1,xgb2,x_train3, y_train3)
print(mape_xgb3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Model Evaluation Start
Compute MAPE_width...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Compute MAPE_depth...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Model Evaluation Done
2.8645107905374595e-05
Model Evaluation Start
Compute MAPE_width...


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Compute MAPE_depth...
Model Evaluation Done
0.00033576642992309364


In [None]:
mape_lgb3,y_pred_width_lgb3,y_te_width_lgb3,y_pred_depth_lgb3,y_te_depth_lgb3=ModelEvaluation(model_width,model_depth, x_train3, y_train3,parameters)
print("score:",mape_lgb3)

In [None]:
# xgb_params = [{"learning_rate": [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1]}, 
#               {"n_estimators": [100, 300, 500, 1000]}, 
#               {"max_depth": range(3,10,2)}, 
#               {"min_child_weight": range(1,6,2)}, 
#               {"gamma": [i/10.0 for i in range(0,5)]}, 
#               {"subsample": [i/10.0 for i in range(6,10)]},
#               {"colsample_bytree": [i/10.0 for i in range(6,10)]}, 
#               {"reg_alpha": [1e-5, 1e-2, 0.1, 1, 100]}]
# xgb_best_params = ModelOptimization(XGBRegressor(), xgb_params, x_train.drop("wbID",axis=1), y_train["y_width"])

               wbID  hour  y_width  y_depth  fans_number  depth_15_min_y  \
0      3.953269e+15  12.0      1.0     75.0          1.0             0.0   
1      3.953269e+15  12.0      1.0      1.0          1.0             0.0   
2      3.970825e+15  23.0      1.0     75.0          1.0             0.0   
4      3.918646e+15  23.0      1.0     75.0          1.0             0.0   
6      3.967101e+15  16.0      6.0     75.0          1.0             1.0   
7      3.967101e+15  16.0      6.0      1.0          1.0             1.0   
8      3.921389e+15  13.0      1.0     75.0          1.0             1.0   
9      3.921389e+15  13.0      1.0      1.0          1.0             1.0   
10     3.968646e+15  22.0      7.0     75.0          1.0             1.0   
11     3.968646e+15  22.0      7.0      1.0          1.0             1.0   
12     3.923172e+15  11.0      1.0     75.0          1.0             0.0   
14     3.972130e+15  13.0      1.0     75.0          1.0             0.0   
15     3.972