In [4]:
import pandas as pd
import multiprocessing
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from collections import Counter
import math

In [5]:
# rg

def radius_of_gyration(positions):
    """
    position : tuple
        A tuple (lat, lon) with the latitude and longitude of the antenna,
        encoded as floating point numbers.

    Returns the radius of gyration, the *equivalent distance* of the mass from
    the center of gravity, for all visited places. [GON2008]_

    References
    ----------
    .. [GON2008] Gonzalez, M. C., Hidalgo, C. A., & Barabasi, A. L. (2008).
        Understanding individual human mobility patterns. Nature, 453(7196),
        779-782.
    """
    d = Counter(positions)
    sum_weights = sum(d.values())
    positions = list(d.keys())  # Unique positions

    if len(positions) == 0:
        return None

    barycenter = [0, 0]
    for pos, t in d.items():
        barycenter[0] += pos[0] * t
        barycenter[1] += pos[1] * t

    barycenter[0] /= sum_weights
    barycenter[1] /= sum_weights

    r = 0.
    for pos, t in d.items():
        r += float(t) / sum_weights * \
            great_circle_distance(barycenter, pos) ** 2
    return math.sqrt(r)

def great_circle_distance(pt1, pt2):
    """
    Return the great-circle distance in kilometers between two points,
    defined by a tuple (lat, lon).
    Examples
    --------
    >>> brussels = (50.8503, 4.3517)
    >>> paris = (48.8566, 2.3522)
    >>> great_circle_distance(brussels, paris)
    263.9754164080347
    """
    r = 6371.

    delta_latitude = math.radians(pt1[0] - pt2[0])
    delta_longitude = math.radians(pt1[1] - pt2[1])
    latitude1 = math.radians(pt1[0])
    latitude2 = math.radians(pt2[0])

    a = math.sin(delta_latitude / 2) ** 2 + math.cos(latitude1) * math.cos(latitude2) * math.sin(delta_longitude / 2) ** 2
    return r * 2. * math.asin(math.sqrt(a))


In [9]:
corpus=pd.read_csv('./category/node.csv',index_col=0,names=['user_id','app'])

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
corpus=corpus['app'].values

In [14]:
corpus

array(['微信', '手机腾讯网', 'QQ', ..., '搜搜地图', 'QQ欢乐斗地主', 'QQ音乐'], dtype=object)

In [13]:
# 300 dimensions
model = Word2Vec(corpus, size=300, window=10, min_count=5,
                workers=multiprocessing.cpu_count(), hs=1, negative=0) # negative sampling, hs: hierarchical softmax
word_vectors = model.wv
word_vectors.save_word2vec_format('./book_vec_300_dims.txt')
word_vectors = KeyedVectors.load_word2vec_format('./book_vec_200_dims.txt')

vector = model.wv['computer']

TypeError: 'float' object is not iterable

In [None]:
# rg

for vector in word_vectors.vocab.values():
    # reduce the dimensions from 300 to 2
    # or use 2-dimensions when training
    radius_of_gyration(vector)

In [15]:
# coding: utf-8

# In[1]:

import multiprocessing
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

from sklearn import ensemble
from sklearn import datasets
from sklearn import metrics
from sklearn import linear_model
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error


# In[2]:

import numpy as np
import pandas as pd
import statsmodels.api as sm
from collections import Counter
import scipy
import time
import powerlaw

import matplotlib
import matplotlib.font_manager as font_manager
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import matplotlib.cm as cm
get_ipython().magic('matplotlib inline')
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model
from sklearn import ensemble
from sklearn.metrics import mean_squared_error

path = '/System/Library/Fonts/Hiragino Sans GB W3.ttc'
fontprop = font_manager.FontProperties(fname=path)

matplotlib.style.use('_classic_test')
matplotlib.rc("figure", facecolor="white")
# matplotlib.style.use('seaborn')


# In[ ]:

df = pd.read_csv('./localdata/all_attenion_clean.csv')


# In[ ]:

pd.Series(df.groupby('uid')['book_id'])[:10]


# In[ ]:

corpus=[]
groupbySeries = pd.Series(df.groupby('uid')['book_id'])
for i in range(len(groupbySeries)):
    corpus.append(list(groupbySeries[i][1].values))


# In[ ]:

corpus[0][:10]


# In[ ]:

model = Word2Vec(corpus, size=100, window=5, min_count=1,
                workers=multiprocessing.cpu_count(), hs=1, negative=0) # negative sampling, hs: hierarchical softmax
word_vectors = model.wv
word_vectors.save_word2vec_format('./book_vec.txt')


# In[ ]:

word_vectors = KeyedVectors.load_word2vec_format('./book_vec.txt')


# In[15]:

vectors = [word_vectors.wv[book] for book in word_vectors.vocab]


# In[16]:

len(vectors)


# In[ ]:

vectors = [tuple(i) for i in vectors]


# In[17]:

vectors[:5]


# In[ ]:

from collections import Counter
import math

dimensions = 300
def distance(a, b):
    return math.sqrt(a-b)

def radius_of_gyration(positions):
    d = Counter(positions)
    sum_weights = sum(d.values())
    positions = list(d.keys())  # Unique positions

    if len(positions) == 0:
        return None

    barycenter = [0] * dimensions
    for pos, t in d.items():
        for i in dimensions:
            barycenter[i] += pos[i] * t

    for i in dimensions:
        barycenter[i] /= sum_weights

    r = 0.
    for pos, t in d.items():
        r += float(t) / sum_weights *                 np.linalg.norm(np.array(barycenter)-np.array(pos)) ** 2
    return math.sqrt(r)


# In[ ]:




# In[ ]:




# In[191]:



# In[192]:

rg = []
for name, group in df.groupby('uid')['book_id']:
    userBookLocations=[]
#     print(name, list(group.values))
#     print(groupbySeries[i][0], list(groupbySeries[i][1].values))
    for book in list(group.values):
        userBookLocations.append(tuple(word_vectors.wv[book]))
    rg.append(radius_of_gyration(userBookLocations))


# In[193]:

plt.hist(rg)


# In[194]:

len(rg)


# In[195]:



# In[196]:

mu = np.mean(rg)
sigma = np.std(rg)
count, bins, ignored = plt.hist(rg, 40, normed=True)
plt.xlabel('$Radius\ of\ Gyration(log)$', size=20)
plt.ylabel('$P(R_g)$', size=20)
plt.plot(bins, 1/(sigma * np.sqrt(2 * np.pi)) * 
         np.exp( - (bins - mu)**2 / (2 * sigma**2) ),
         linewidth=2, color='r')
plt.show()


# In[132]:

rg_log = []
for i in rg:
    if i >= 1:
        rg_log.append(np.log(i))


# In[133]:

mu = np.mean(rg_log)
sigma = np.std(rg_log)
count, bins, ignored = plt.hist(rg_log, 40, normed=True)
plt.xlabel('$Radius\ of\ Gyration(log)$', size=20)
plt.ylabel('$P(R_g)$', size=20)
plt.plot(bins, 1/(sigma * np.sqrt(2 * np.pi)) * 
         np.exp( - (bins - mu)**2 / (2 * sigma**2) ),
         linewidth=2, color='r')
plt.show()


# In[104]:

a = pd.read_csv('./users1000.csv', index_col=0)
a['RadiusofGyration'] = rg
a.to_csv('./users1000.csv', index=True)


# In[103]:

a


# In[ ]:


from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import multiprocessing


def train_word2vec(corpus, dimensions=200, filename=None):
    model = Word2Vec(corpus, size=200, window=10, min_count=5,
                     workers=multiprocessing.cpu_count(), hs=1,
                     negative=0)  # negative sampling, hs: hierarchical softmax
    word_vectors = model.wv

    if filename != None:
        # Save the model
        word_vectors.save_word2vec_format(filename)


import matplotlib
import matplotlib.font_manager as font_manager
import matplotlib.pyplot as plt

path = '/System/Library/Fonts/Hiragino Sans GB W3.ttc'
fontprop = font_manager.FontProperties(fname=path)
from sklearn.manifold import TSNE


def visualize_word_vectors(word_vectors):
    vec_dict = dict()
    for book_id in vocab_list[:60]:
        vec = word_vectors.word_vec(book_id)
        vec_dict[book_id] = vec

    color_dict = dict(matplotlib.colors.BASE_COLORS, **matplotlib.colors.CSS4_COLORS)
    color_list = list(color_dict.keys())[15:]
    print(color_list[:20])

    X_tsne = TSNE(learning_rate=10).fit_transform(list(vec_dict.values()))

    X_tsne_dict = dict()
    for book in vec_dict.keys():
        X_tsne_dict[book] = X_tsne[list(vec_dict.keys()).index(book)]

    # 可以按照类别标出颜色

    plt.figure(figsize=(30, 20))

    for category in CateId2Cate.keys():
        for book_id in X_tsne_dict.keys():
            book_category = bookId2Cate[book_id]
            if book_category == category:
                color = color_list[CateId2Cate[category]]
                plt.scatter(X_tsne_dict[book_id][0], X_tsne_dict[book_id][1], c=color, label=category, s=250, alpha=0.4)
                plt.text(X_tsne_dict[book_id][0], X_tsne_dict[book_id][1], bookId2Name[book_id],
                         fontproperties=fontprop, fontsize=20)

    plt.legend(loc=3, prop=fontprop)
    # plt.savefig('./200-dim-2.png', dpi=200)


import numpy as np
import pandas as pd

import matplotlib.font_manager as font_manager
import matplotlib.pyplot as plt

import pickle
import datetime
import statsmodels.api as sm

import multiprocessing
from gensim.models import Word2Vec

path = '/System/Library/Fonts/Hiragino Sans GB W3.ttc'
fontprop = font_manager.FontProperties(fname=path)

# data_dir = '/Users/zhicongchen/Desktop/datasets/baidureading/'

data_dir = './baidureading/'

book_df = pd.read_csv(data_dir + 'book_info.csv')

bookId = book_df['book_id'].values

# bookName = book_df['book_name'].values
# bookCate = book_df['category'].values
# bookPrice = book_df['price'].values
# bookCateId = book_df['categoryid'].values
# CateId2Cate = dict(book_df[['category', 'categoryid']].drop_duplicates().values)
# bookId2Name = dict(book_df[['book_id', 'book_name']].drop_duplicates().values)
# bookId2Price = dict(book_df[['book_id', 'price']].drop_duplicates().values)
# bookId2Cate = dict(book_df[['book_id', 'category']].drop_duplicates().values)

bookId2Sale = dict(book_df[['book_id', 'salecount']].drop_duplicates().values)


# all_attention = pd.read_csv(data_dir + 'attentionflow/all_attenion.csv')

def attentionflow2corpus(all_attention):
    reading_flow = all_attention.groupby('uid').groups
    print("Num of users: ", len(reading_flow))
    corpus = dict()
    for user in reading_flow.keys():
        corpus[user] = list(all_attention.iloc[reading_flow[user]]['book_id'].drop_duplicates().values)

    # Dump corpus into file
    f = open('corpus_dict.txt', 'wb')
    pickle.dump(corpus, f)
    f.close()
    return corpus


if __name__ == '__main__':
    # Read corpus from file
    f = open('corpus_list.txt', 'rb')
    corpus = pickle.load(f)
    f.close()

    # Load the model
    # word_vectors = KeyedVectors.load_word2vec_format('./book_vec_200_dims.txt')

    scores = []
    dimensions = np.arange(1600, 5000, 50)
    for s in dimensions:
        print("Dimensions: %d" % s)
        start_time = datetime.datetime.now()
        model = Word2Vec(corpus, size=s, window=10, min_count=5,
                         workers=multiprocessing.cpu_count(), hs=1,
                         negative=0)  # negative sampling, hs: hierarchical softmax
        word_vectors = model.wv
        vocab_list = list((set(bookId) & set(word_vectors.vocab.keys())))

        # Load data
        y = np.array([bookId2Sale[book_id] for book_id in vocab_list])
        x = np.array([word_vectors.word_vec(word) for word in vocab_list])

        r2 = sm.OLS(y, x).fit().rsquared
        print("R2: ", r2)
        scores.append(r2)

        end_time = datetime.datetime.now()
        print("Time Consumed: ", end_time - start_time)

    print(scores)
    plt.figure(figsize=(6, 4))
    plt.plot(scores, 'o-')
    plt.xticks(range(len(dimensions)), dimensions)
    plt.xlabel('$Dimensions$', fontsize=15)
    plt.ylabel('$R^2$', fontsize=15)
    plt.title('OLS Regression', fontsize=18)
    plt.savefig('./OLS_regression.png', dpi=200)

  from pandas.core import datetools


FileNotFoundError: File b'./localdata/all_attenion_clean.csv' does not exist

In [1]:
import pandas as pd

  return f(*args, **kwds)
  return f(*args, **kwds)


In [17]:
path = '/Volumes/My Book/data/phone/流量数据/gprs_bh_'
for i in range(20131201, 20131232):
    print(i)
    filename=str(i)
    data=pd.read_csv(path+filename+'.del.tar.gz',usecols=[0,5,6,9,10,11])
    with open('/Users/xuhuimin/Documents/lab/labdata/beijingmobile/node.csv','a+') as f:
        data.to_csv(f,header=False)

20131201
20131202
20131203
20131204
20131205
20131206
20131207
20131208
20131209
20131210
20131211
20131212
20131213
20131214


KeyboardInterrupt: 

In [2]:
path = '/Volumes/My Book/data/phone/流量数据/gprs_bh_'
for i in range(20131214, 20131232):
    print(i)
    filename=str(i)
    data=pd.read_csv(path+filename+'.del.tar.gz',usecols=[0,5,6,9,10,11])
    with open('/Users/xuhuimin/Documents/lab/labdata/beijingmobile/node.csv','a+') as f:
        data.to_csv(f,header=False)

20131214
20131215
20131216
20131217
20131218
20131219
20131220
20131221
20131222
20131223
20131224
20131225
20131226
20131227
20131228
20131229
20131230
20131231
