[Dataset](https://www.aminer.org/citation)

In [None]:
import os

def preprocessing(filename = "dblpv13.json"):
    '''
    Rewrite the raw dump in place so that every NumberInt(N) becomes the bare
    integer N, which turns the MongoDB-style export into plain JSON.

    The cleaned text is first written to DIR_PATH + "output". Only if that
    file loads successfully is the original replaced by it; on any failure
    the error is printed, the original is left untouched and the function
    returns None.

    Changes from the earlier version:
      * every NumberInt(...) on a line is rewritten, not just the first one;
      * a NumberInt( without a closing parenthesis leaves the line unchanged
        instead of duplicating parts of it;
      * files are processed as bytes, so the result does not depend on the
        platform's default text encoding;
      * the validation handle is closed before the files are swapped.

    filename: name of the dump inside DIR_PATH.

    TODO: 111/9/6 replace bigjson with ijson
    '''
    import re

    source_path = DIR_PATH + filename
    output_path = DIR_PATH + "output"
    # Capture whatever sits between the parentheses and keep only that.
    number_int = re.compile(rb"NumberInt\(([^)]*)\)")

    with open(source_path, "rb") as f:
        with open(output_path, "wb") as outputFile:
            for line in f:
                outputFile.write(number_int.sub(rb"\1", line))

    try:
        # NOTE(review): bigjson is not imported anywhere in the visible
        # notebook; without an import this raises NameError, which the
        # handler below prints before returning.
        with open(output_path, "rb") as f:
            jsonFormat = bigjson.load(f)
            # Presumably forces bigjson to read the whole array so that a
            # malformed file is detected here -- confirm.
            len(jsonFormat)
    except Exception as e:
        print(str(e))
        return
    else:
        # os.replace overwrites the destination in one step on every platform.
        os.replace(output_path, source_path)


In [None]:
import ijson
from os.path import exists

def captureNodes():
    '''
    Collect the distinct author ids found in dblp.json and write them to the
    "nodes" file, one double-quoted id per line.

    Authors are read from the "authors" array of every top-level record
    (ijson prefix "item.authors.item"). Entries without an "_id" are skipped.
    The earlier version iterated the top-level records themselves, so it
    collected record ids rather than author ids.

    Parsing is best effort: if the stream fails part-way, the error and the
    last entry read are printed and the ids gathered so far are still
    written out.
    '''
    nodes = set()
    with open(DIR_PATH + "dblp.json", "rb") as f:
        author = ""
        try:
            for author in ijson.items(f, "item.authors.item"):
                if isinstance(author, dict) and "_id" in author:
                    nodes.add(author["_id"])
        except Exception as e:
            # Report the cause as well as the entry being processed.
            print("captureNodes stopped early:", repr(e))
            print(author)

    with open(DIR_PATH + "nodes", "w") as output:
        output.writelines("\"" + node + "\"\n" for node in nodes)

def captureEdges():
    '''
    Write the co-authorship edges to the "edges" file.

    Two authors are connected as soon as they share at least one paper. For
    every paper, each unordered pair of its distinct author ids is written
    as one "id_a,id_b" line. A pair that co-authored several papers appears
    once per paper.

    Changes from the earlier version:
      * pairs are emitted for every paper, not only for the last one parsed;
      * combinations is imported here, so the function no longer depends on
        a name that nothing in the notebook defines;
      * the output file is closed when the function returns;
      * only the end of the "authors" array itself closes a paper's author
        list, so an array nested inside an author entry cannot cut it short.
    '''
    from itertools import combinations

    with open(DIR_PATH + "dblp.json", "rb") as f:
        with open(DIR_PATH + "edges", "w") as output:
            authorsInSamePaper = set()
            for prefix, event, value in ijson.parse(f):
                if prefix == "item.authors":
                    if event == "start_array":
                        # A new paper starts: forget the previous authors.
                        authorsInSamePaper = set()
                    elif event == "end_array":
                        for first, second in combinations(authorsInSamePaper, 2):
                            output.write(first + "," + second + "\n")
                elif prefix == "item.authors.item._id" and isinstance(value, str):
                    authorsInSamePaper.add(value)



In [1]:
import sys

# True when the notebook is running inside Google Colab.
IN_COLAB = 'google.colab' in sys.modules
# Fallback value; replaced below once the environment is known.
DIR_PATH = ""

if not IN_COLAB:
    # Local Windows checkout: expose the project package and the
    # virtual-env site-packages, then point at the local dataset folder.
    sys.path.append('D:\\論文實驗\\package')
    sys.path.append("D:\\論文實驗\\env\\Lib\\site-packages")
    DIR_PATH = r"D:\\論文實驗\\data\\dblp\\"
else:
    # Colab: mount Google Drive first. The mount point is spelled "dirve"
    # and the same spelling is used in every path below.
    from google.colab import drive
    drive.mount("/content/dirve")
    sys.path.append('/content/dirve/MyDrive/Colab Notebooks/package')
    DIR_PATH = r"/content/dirve/MyDrive/研究所/Data/dblp/"

In [1]:
import importlib
import networkx as nx 
import unittest
from coupon import Coupon
from social_graph import SN_Graph
from model import DiffusionModel
import logging

if __name__ == "__main__":
    
    # Run every test discovered under ./test/ before the demo below.
    testRunner = unittest.TextTestRunner()
    suite = unittest.defaultTestLoader.discover("./test/")
    testRunner.run(suite)
    # NOTE(review): logging is configured only after the tests have run, so
    # this format/level is not yet in effect during the test run -- confirm
    # that is intended.
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG)
    
    # Disabled: build the full graph from the edge/topic files and save a
    # sampled subgraph as GML.
    # graph = SN_Graph()
    # graph.construct(DIR_PATH + "edges", DIR_PATH + "topic_nodes.csv")
    # subgraph = graph.sampling_subgraph(10)
    # nx.write_gml(subgraph, DIR_PATH + "sample10_graph.gml")
    
    
    # Hand-built toy graph: edges 0-1 (weight 0.01) and 0-2 (weight 1),
    # both flagged is_tested=False.
    graph = SN_Graph()
    graph.add_edge(0, 1, weight=0.01, is_tested=False)
    graph.add_edge(0, 2, weight=1, is_tested=False)
    # Every node starts with empty (None) desired/adopted sets.
    for node in graph:
        graph.nodes[node]['desired_set'] = None
        graph.nodes[node]['adopted_set'] = None
            
    
    # Two-component vector per key; presumably a topic distribution per
    # item id -- confirm against model.DiffusionModel.
    # NOTE(review): the entry for '0' sums to 1.01 while the others sum to
    # 1.0 -- confirm whether 0.19 should be 0.18.
    topic = {
            '0': [0.82, 0.19],
            '1': [0.63, 0.37],
            '2': [0.5, 0.5]
        }
    # Three values, presumably one price per key of `topic` -- confirm.
    price = [60,260,70]
    
    # The meaning of the Coupon arguments is defined in coupon.Coupon,
    # which is not visible in this notebook.
    model = DiffusionModel(
        "test",
        graph, 
        {"price": price, "topic": topic},
        [Coupon(180, [0], 20, [0,1]),]
    )

    
    model.diffusion()


ModuleNotFoundError: No module named 'coupon'

In [3]:
# Show the TOPIC table stored on the model's item set.
print(model._itemset.TOPIC)


{'0': [0.82, 0.19], '1': [0.63, 0.37], '2': [0.5, 0.5]}


In [3]:
# Call model.load on the sample10 GML file, then spot-check two node
# attributes: the "topic" value of node 1 and the type stored as
# "desired_set" on node 2.
model.load(DIR_PATH + "sample10_graph.gml")
print(model._graph.nodes[1]["topic"])
print(type(model._graph.nodes[2]["desired_set"]))


[0.8398245136169739, 0.16017548638302617]
<class 'itemset.Itemset'>


In [4]:
# Prints True only while the "is_tested" attribute of edge (0, 2) still equals False.
print( model._graph.edges[0,2]["is_tested"] == False)

False


![image](https://cdn.discordapp.com/attachments/498518865802952706/1019575970103050240/unknown.png)

In [None]:
# Peek at the first 200 lines of the dataset. The encoding is given
# explicitly so the result does not depend on the platform default, and
# the loop stops early if the file has fewer than 200 lines.
with open(DIR_PATH + "dblp.json", "r", encoding="utf-8") as f:
    for _, line in zip(range(200), f):
        print(line, end="")

# Extract topics from DBLP dataset
### Note
- ~~未清除keywords和fos都沒有值的Paper~~

In [None]:
import ijson
import re
import csv
from gensim.utils import simple_preprocess

def extractUsersCorpos(sample=0):
    '''
    Build a per-author topic corpus from dblp.json and write it as CSV.

    Every keyword ("item.keywords.item") and field of study
    ("item.fos.item") of a paper is attributed to each of that paper's
    authors. Text is normalised by replacing ASCII punctuation and newlines
    with spaces and lower-casing. Each author keeps every distinct topic
    once, in the order first seen.

    sample: 0 processes the whole file; a positive value stops after that
            many top-level records.

    Output rows have variable length:

        id, string, string, ...

    and go to DIR_PATH + "topic_nodes.csv", or to
    DIR_PATH + "sample<sample>topic_nodes.csv" when sampling.

    Changes from the earlier version:
      * a repeated topic no longer resets the author's list to that single
        topic;
      * topics are attached when the record ends, so they are kept even if
        they appear before the "authors" array inside a record;
      * non-string values (e.g. null) are skipped instead of raising inside
        re.sub;
      * the punctuation pattern is compiled once.
    '''
    punctuation = re.compile('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]')

    # author id -> {topic: None}; a dict serves as an insertion-ordered set.
    content = dict()
    with open(DIR_PATH + "dblp.json", "rb") as f:
        authors = []
        topics = []
        count = 0
        for prefix, event, value in ijson.parse(f):
            if prefix == "item":
                if event == "start_map":
                    if sample != 0:
                        count += 1
                        if count > sample:
                            break
                    authors = []
                    topics = []
                elif event == "end_map" and topics:
                    # The record is complete: credit its topics to each author.
                    for author in authors:
                        known = content.setdefault(author, {})
                        for text in topics:
                            known.setdefault(text)
            elif prefix == "item.authors.item._id":
                authors.append(value)
            elif prefix == "item.keywords.item" or prefix == "item.fos.item":
                if isinstance(value, str):
                    text = punctuation.sub(' ', value).lower()
                    topics.append(text.replace('\n', ' '))

    filename = DIR_PATH + "topic_nodes.csv" if sample == 0 else DIR_PATH + "sample" + str(sample) + "topic_nodes.csv"
    with open(filename, "w", encoding="utf-8", newline='') as outputFile:
        writer = csv.writer(outputFile)
        for author_id, topics in content.items():
            writer.writerow([author_id, *topics])
        
# def extractItemsCorpos(sample=0):
#     content = dict()
#     with open(DIR_PATH + "dblp.json", "rb") as f:
#         parser = ijson.parse(f)
#         '''
#             將論文的fos跟keywords對應到該篇的_id
#         '''
#         count = 0
#         for prefix, event, value in parser:
            
#             if sample != 0 and count > sample - 1:
#                 break
                
#             if prefix == "item" and event == "end_map" and paper_id in content:
#                 count += 1
                
#             if prefix == "item._id":
#                 paper_id = value
#                 print(value)
                
#             if prefix ==  "item.keywords.item" or prefix == "item.fos.item":
#                 text = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', ' ', value).lower()
#                 text = text.replace('\n', ' ')
#                 if paper_id in content and text not in content[paper_id]:
#                     content[paper_id].append(text)
#                 else:
#                     content[paper_id] = [text]
                    
            
                

#     '''
#         Length of each row is not fixed.

#         id, string, string, ...
#     '''
#     filename = DIR_PATH + "topic_items.csv" if sample == 0 else DIR_PATH + "sample" + str(sample) + "topic_items.csv"
#     with open(filename, "w", encoding="utf-8", newline='') as outputFile:
#         writer = csv.writer(outputFile)
#         for paper_id, topics in content.items():
#             topics.insert(0, paper_id)
#             writer.writerow(topics)

# Sample run: sample=10 stops after the first 10 top-level records.
extractUsersCorpos(10)

In [15]:
# Count the top-level records: each one opens with a "start_map" event
# at prefix "item".
with open(DIR_PATH + "dblp.json", "rb") as f:
    parser = ijson.parse(f)
    count = sum(
        1
        for prefix, event, value in parser
        if event == "start_map" and prefix == "item"
    )
    print(count)

834826


In [None]:
import ijson
import re

# Dump the leading parser events (prefix, event, value) to inspect the
# structure of the file; the loop ends right after the event printed
# while count is 101.
with open(DIR_PATH + "dblp.json", "rb") as f:
    count = 0
    for prefix, event, value in ijson.parse(f):
        print(prefix,event,value)
        if count <= 100:
            count += 1
            continue
        break
            

In [5]:
from nltk.corpus import stopwords
from os import listdir
import nltk

# Fetch the NLTK stop-word corpus into a local ./nltk_data folder.
# NOTE(review): confirm that ./nltk_data is on nltk.data.path, otherwise
# the `stopwords` loader may not find the files downloaded here.
nltk.download('stopwords', download_dir="./nltk_data")

[nltk_data] Downloading package stopwords to ./nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [7]:
import math

# Three 3-component vectors, each summing to 1; presumably example topic
# distributions -- confirm. They are not used elsewhere in this cell.
x_1 = [0.6,0.1,0.3]
x_2 = [0.6,0.3,0.1]
x_3 = [0.5,0,0.5]

# 3x3 toy matrix with a zero diagonal; it is fed to transform() below.
R_prime = [[ 0.0, 0.8, 1.0],
           [-0.7, 0.0, 0.8],
           [ 0.3, 0.2, 0.0],]

# Sizes of the toy example.
NUM_ITEMS = 3
NUM_TOPICS = 3

def transform(matrix):
    '''
    Map every off-diagonal entry x of matrix to 1 + 1/(1 + exp(-x)), i.e. a
    logistic curve shifted into the interval [1, 2], leaving the diagonal
    untouched.

    The matrix (a list of row lists) is modified in place and also
    returned. The loop bounds come from the matrix itself, so any size
    works, including an empty matrix.
    '''
    def sigmoid(num):
        try:
            return 1+(1/(1+math.exp(-num)))
        except OverflowError:
            # exp(-num) overflows for very negative num; the limit there is 1.
            return 1.0

    for i, row in enumerate(matrix):
        for j in range(len(row)):
            if i != j:
                row[j] = sigmoid(row[j])
    return matrix

def get_weight(number_item, weights):
    '''
    Return the sum of all entries of weights except the one at index
    number_item.

    An index outside the sequence excludes nothing, and an empty sequence
    gives 0. The earlier version called range() on the list itself and
    raised TypeError on every call.
    '''
    return sum(w for i, w in enumerate(weights) if i != number_item)

# Transform the toy matrix (transform also mutates it in place) and show the result.
R_prime = transform(R_prime)
print(R_prime)


[[0.0, 1.6899744811276125, 1.7310585786300048], [1.3318122278318338, 0.0, 1.6899744811276125], [1.5744425168116591, 1.549833997312478, 0.0]]


In [1]:
# Show the raw form (repr) of the first 100 lines of the sampled topic CSV.
# The reprs are printed back to back, without a separator between them.
with open("./data/dblp/smple10_topic_nodes.csv", "r", encoding="utf8") as f:
    sample = []
    for i in range(100):
        raw_line = f.readline()
        print(repr(raw_line), end="")
        

'53f45728dabfaec09f209538,moisture,hydrology,environmental science,dry weight,water content,stomatal conductance,transpiration,irrigation,soil water,canopy\n''5601754345cedb3395e59457,moisture,hydrology,environmental science,dry weight,water content,stomatal conductance,transpiration,irrigation,soil water,canopy\n''53f38438dabfae4b34a08928,moisture,hydrology,environmental science,dry weight,water content,stomatal conductance,transpiration,irrigation,soil water,canopy\n''5601754345cedb3395e5945a,moisture,hydrology,environmental science,dry weight,water content,stomatal conductance,transpiration,irrigation,soil water,canopy\n''53f43d25dabfaeecd6995149,moisture,hydrology,environmental science,dry weight,water content,stomatal conductance,transpiration,irrigation,soil water,canopy\n''53f46797dabfaeb22f542630,pattern recognition,computer science,feature  computer vision ,document processing,handwriting recognition,optical character recognition,feature extraction,feature  machine learning ,a