# Feature engineering process Waldor

## Part 1.

### Import libraries that we will use

In [6]:
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from gensim.models.word2vec import Word2Vec
import nltk
import csv
import igraph
import math

from read_data import *
from graph_creation import *

### Initialization like in public_baseline

In [7]:
# ---First Initializations--- #
path_to_data = "../data/"
nltk.download('punkt')  # for tokenization
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

# ---Read Data--- #
testing_set, training_set, node_info = read_data()
IDs = [element[0] for element in node_info]

# ---Compute TFIDF vector of each paper--- #
corpus = [element[5] for element in node_info]
vectorizer = TfidfVectorizer(stop_words="english")
# each row is a node in the order of node_info
features_TFIDF = vectorizer.fit_transform(corpus)
pairwise_similarity = features_TFIDF * features_TFIDF.T
#print pairwise_similarity.shape
# ---Create graph--- #
g = create_graph(training_set, IDs)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dinar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dinar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
creating graph
graph created


## Part 2.

## Additional to public_baseline graph features 

### 1) Word2Vec

In [8]:
def build_w2v(node_info, stemmer, stpwds):
    try:
        model = Word2Vec.load("w2v_model")
        print ("Word2Vec model loaded")
    except:
        path_to_google_news = '../data'
        my_q = 300 # to match dim of GNews word vectors
        mcount = 5
        model = Word2Vec(size=my_q, min_count=mcount)
        cleaned_abstracts = [clean(element[5], stemmer, stpwds) for element in node_info]
        print ("Building Word2Vec vocab...")
        model.build_vocab(cleaned_abstracts)
        print ("Loading intersect vectors...")
        model.intersect_word2vec_format(path_to_google_news + 'GoogleNews-vectors-negative300.bin.gz', binary=True)
        model.save("w2v_model")
        print ("Model saved to disk")
    return model

### 2) avg_number_citations_of_authors

The function to create a dictionary where we put the number of references from author1 to author2.

Example if three references (three articles):

    (author1, author2) : 3

In [9]:
# Only one time
def count_authLinksStoT (information_set, node_info):
    authLinks = {}
    counter = 0
    for i in range(len(information_set)):
        source = information_set[i][0]
        target = information_set[i][1]

        source_info = [element for element in node_info if element[0] == source][0]
        target_info = [element for element in node_info if element[0] == target][0]

        source_auth = source_info[3].split(",")
        target_auth = target_info[3].split(",")
        
        for s in source_auth:
            s.replace(' ', '')
        for t in target_auth:
            t.replace(' ', '')
        
        for s in source_auth:
            for t in target_auth:
                key = (s,t)
                if key in authLinks:
                    authLinks[key] += 1
                else:
                    authLinks[key] = 1
        counter += 1
        if counter % 10000 == 0:
            print (counter, "examples processed")
    return authLinks

Count the average number of citations the authors of target have received from authors of source FOR ONE RECORD.

In [10]:
def avg_number_citations(avg_number_citations_of_authors, source_auth, target_auth, authLinks):
    summ = 0
    count = 0
    for s in source_auth:
        for t in target_auth:
            key = (s,t)
            if key in authLinks:
                summ += authLinks[key]
                count += 1
    if count == 0:
        avg_number_citations_of_authors.append(0)
    else:
        avg_number_citations_of_authors.append(summ/count)
    return

### 3) shortest_path and edge_connectivity

We create the full NON directed graph of relations between the abstracts to calculate the shortest path and the edge sonnectivity using igraph.

In [11]:
# Only one time
def undirected_graph(information_set, IDs):
    edges = [(element[0], element[1]) for element in information_set if element[2]=="1"]
    nodes = IDs

    print ("Edges and nodes for undirected graph prepared.")
    graph = igraph.Graph(directed=False)
    graph.add_vertices(nodes)
    graph.add_edges(edges)

    return graph

The function to calculate a shortest edge FOR ONE RECORD.

In [12]:
def shortest_path_edge_connectivity(shortest_path, edge_connectivity, edge, graph, source, target):
    index_source = IDs.index(source)
    index_target = IDs.index(target)
    if (edge == "1"):
        graph.delete_edges([(source,target)])
        val = graph.shortest_paths_dijkstra(source=index_source, target=index_target)[0][0]
        edge_connectivity.append(graph.edge_disjoint_paths(source=index_source, target=index_target))
        shortest_path.append(val)
        graph.add_edges([(source,target)])
    else:
        val = graph.shortest_paths_dijkstra(source=index_source, target=index_target)[0][0]
        edge_connectivity.append(graph.edge_disjoint_paths(source=index_source, target=index_target))
        shortest_path.append(val)

## Part 3.

## Feature engineering

Clean function from feture_engineering.py

In [13]:
def clean(s, stemmer, stpwds):
    s = s.lower().split(" ")
    s = [token for token in s if token not in stpwds]
    s = [stemmer.stem(token) for token in s]
    s = [''.join([elt for elt in token if not elt.isdigit()]) for token in s] # remove digits
    s = [token for token in s if len(token)>2] # remove tokens shorter than 3 characters in size
    s = [token for token in s if len(token)<=25] # remove tokens exceeding 25 characters in size
    return s

Feature_engineering function.

In [14]:
def first_step (information_set, node_info, IDs) :
    # Undirected graph to calculate shortest_path and edge_connectivity
    graph = undirected_graph(information_set, IDs)
    print ("Undirected graph created.")
    print ("First step finished.\n")
    return graph

In [15]:
def second_step (information_set, IDs, node_info, stemmer, stpwds, g): 
    # the average number of citations the authors of target have received from authors of source
    avg_number_citations_of_authors = [] 
    
    # More useful variables
    counter = 0
    degrees = g.degree(IDs)
    neighbors_list = []
    for id in IDs:
        neighbors_list.append(set(g.neighbors(id)))
        
    print ("Second step - start to calculate features.")
    
    #### ---- Calculating features ---- ####
    for i in range(len(information_set)):
        source = information_set[i][0]
        target = information_set[i][1]

        source_info = [element for element in node_info if element[0] == source][0]
        target_info = [element for element in node_info if element[0] == target][0]

        source_auth = source_info[3].split(",")
        target_auth = target_info[3].split(",")

        # Generated features
        avg_number_citations(avg_number_citations_of_authors, source_auth, target_auth, authLinks)

        counter += 1
        if counter % 10000 == 0:
            print (counter, "examples processed")
            
    print ("2 step - All features calculated")

    #### ---- Final features array ---- ####
    list_of_features = []
    list_of_features.append(avg_number_citations_of_authors)
    
    return list_of_features

In [23]:
def third_step (information_set, IDs, node_info, stemmer, stpwds, g, pairwise_similarity, w2v):    
    ### ---- Baseline features arrays ---- ####
    # number of overlapping words in title
    overlap_title = []
    # temporal distance between the papers
    temp_diff = []
    # number of common authors
    comm_auth = []
    
    ### ---- Generated features arrays ---- ####
    # WMD
    wmd = []
    # number of references for the source or the target
    num_references_source = []
    num_references_target = []
    # number of common neighbors
    num_common_neighbors = []
    # number of keywords: graph of words
    # num_keywords_graph_of_words = []
    # TF_IDF
    pairwise_similarity_number = []
    
    # More useful variables
    counter = 0
    degrees = g.degree(IDs)
    neighbors_list = []
    for id in IDs:
        neighbors_list.append(set(g.neighbors(id)))
        
    print ("Third step - start to calculate features.")
    
    #### ---- Calculating features ---- ####
    for i in range(len(information_set)):
        source = information_set[i][0]
        target = information_set[i][1]
        edge = information_set[i][2]

        index_source = IDs.index(source)
        index_target = IDs.index(target)

        source_info = [element for element in node_info if element[0] == source][0]
        target_info = [element for element in node_info if element[0] == target][0]

        source_title = clean(source_info[2], stemmer, stpwds)
        target_title = clean(target_info[2], stemmer, stpwds)

        source_auth = source_info[3].split(",")
        target_auth = target_info[3].split(",")

        source_abstract = clean(source_info[5], stemmer, stpwds)
        target_abstract = clean(target_info[5], stemmer, stpwds)

        # Baseline features
        overlap_title.append(len(set(source_title).intersection(set(target_title))))
        temp_diff.append(int(source_info[1]) - int(target_info[1]))
        comm_auth.append(len(set(source_auth).intersection(set(target_auth))))
        # Generated features
        wmd.append(w2v.wv.wmdistance(source_abstract, target_abstract))
        num_references_source.append(degrees[index_source])
        num_references_target.append(degrees[index_target])
        num_common_neighbors.append(len(neighbors_list[index_source].intersection(neighbors_list[index_target])))
        # num_keywords_graph_of_words.append(len(set(keywords_graph_of_words(source_abstract)).intersection(set(keywords_graph_of_words(target_abstract)))))
        pairwise_similarity_number.append(pairwise_similarity[index_source, index_target])

        counter += 1
        if counter % 1000 == 0:
            print (counter, "examples processed")
            
    print ("3 step - All features calculated")

    #### ---- Final features array ---- ####
    list_of_features = []
    list_of_features.append(overlap_title)
    list_of_features.append(temp_diff)
    list_of_features.append(comm_auth)
    list_of_features.append(wmd)
    list_of_features.append(num_references_source)
    list_of_features.append(num_references_target)
    list_of_features.append(num_common_neighbors)
    # list_of_features.append(num_keywords_graph_of_words)
    list_of_features.append(pairwise_similarity_number)
    
    return list_of_features

In [17]:
def forth_step(information_set, IDs, node_info, stemmer, stpwds, g, pairwise_similarity, w2v, graph, TEST):
    # shortest path between the nodes of abstract graph
    shortest_path = []
    edge_connectivity = []
    
    # More useful variables
    counter = 0
    degrees = g.degree(IDs)
    neighbors_list = []
    for id in IDs:
        neighbors_list.append(set(g.neighbors(id)))
        
    print ("Forth step - start to calculate features.")
        
    #### ---- Calculating features ---- ####
    for i in range(len(information_set)):
        source = information_set[i][0]
        target = information_set[i][1]
        if TEST :
            edge = 0
        else:
            edge = information_set[i][2]

        shortest_path_edge_connectivity(shortest_path, edge_connectivity, edge, graph, source, target)

        counter += 1
        if counter % 100 == 0:
            print (counter, "examples processed")
    
    print ("4 step - All features calculated")

    #### ---- Final features array ---- ####
    list_of_features.append(shortest_path)
    list_of_features.append(edge_connectivity)
        
    return list_of_features

## Part 4.

Calculate w2v one time for training and testing set.

In [18]:
# W2V
w2v = build_w2v(node_info, stemmer, stpwds)

Word2Vec model loaded


Calculate features and save it in files.

### Commun part for trainig and testing set

In [35]:
#### ---- Preparation functions ---- ####

## First step ##
print ("Length of set : ", len(training_set))
authLinks = {}
# Authors link dictionary
authLinks = count_authLinksStoT(training_set, node_info)
print ("Dictonary authLinks created.")

Length of set :  615512
10000 examples processed
20000 examples processed
30000 examples processed
40000 examples processed
50000 examples processed
60000 examples processed
70000 examples processed
80000 examples processed
90000 examples processed
100000 examples processed
110000 examples processed
120000 examples processed
130000 examples processed
140000 examples processed
150000 examples processed
160000 examples processed
170000 examples processed
180000 examples processed
190000 examples processed
200000 examples processed
210000 examples processed
220000 examples processed
230000 examples processed
240000 examples processed
250000 examples processed
260000 examples processed
270000 examples processed
280000 examples processed
290000 examples processed
300000 examples processed
310000 examples processed
320000 examples processed
330000 examples processed
340000 examples processed
350000 examples processed
360000 examples processed
370000 examples processed
380000 examples process

In [19]:
graph = first_step (training_set, node_info, IDs)

Edges and nodes for undirected graph prepared.
Undirected graph created.
First step finished.



### 1. Training set

In [20]:
print("Train features calculating.")

Train features calculating.


In [38]:
## Second step ##
avg_auth_train = second_step (training_set, IDs, node_info, stemmer, stpwds, g)
print("Average links calculated.")

Second step - start to calculate features.
10000 examples processed
20000 examples processed
30000 examples processed
40000 examples processed
50000 examples processed
60000 examples processed
70000 examples processed
80000 examples processed
90000 examples processed
100000 examples processed
110000 examples processed
120000 examples processed
130000 examples processed
140000 examples processed
150000 examples processed
160000 examples processed
170000 examples processed
180000 examples processed
190000 examples processed
200000 examples processed
210000 examples processed
220000 examples processed
230000 examples processed
240000 examples processed
250000 examples processed
260000 examples processed
270000 examples processed
280000 examples processed
290000 examples processed
300000 examples processed
310000 examples processed
320000 examples processed
330000 examples processed
340000 examples processed
350000 examples processed
360000 examples processed
370000 examples processed
3800

In [21]:
features_train= []
#features_train.append(avg_auth_train)

NameError: name 'avg_auth_train' is not defined

In [None]:
## Third step ##
features_train = third_step (training_set, IDs, node_info, stemmer, stpwds, g, pairwise_similarity, w2v)

Third step - start to calculate features.
1000 examples processed
2000 examples processed
3000 examples processed
4000 examples processed
5000 examples processed
6000 examples processed
7000 examples processed
8000 examples processed
9000 examples processed
10000 examples processed
11000 examples processed
12000 examples processed
13000 examples processed
14000 examples processed
15000 examples processed
16000 examples processed
17000 examples processed
18000 examples processed
19000 examples processed
20000 examples processed
21000 examples processed
22000 examples processed
23000 examples processed
24000 examples processed
25000 examples processed
26000 examples processed
27000 examples processed
28000 examples processed
29000 examples processed
30000 examples processed
31000 examples processed
32000 examples processed
33000 examples processed
34000 examples processed
35000 examples processed
36000 examples processed
37000 examples processed
38000 examples processed
39000 examples pr

In [57]:
## Forth step ##
features_4step = forth_step(training_set, IDs, node_info, stemmer, stpwds, g, pairwise_similarity, w2v, graph, 0)

Forth step - start to calculate features.
100 examples processed
200 examples processed
300 examples processed
400 examples processed
500 examples processed
600 examples processed


ValueError: no edge from vertex #21937 to #17814

In [None]:
## Put all features together ##
print("Putting all features together...")
features_train.append(features_4step)

In [62]:
# Convert list of lists into array
# Documents as rows, unique words as columns (i.e., example as rows, features as columns)
training_features = np.array(features_train).T

In [67]:
# Scale
#training_features = preprocessing.scale(training_features)
np.save(path_to_data + 'avg_auth_train.npy', training_features)

### 2. Testing set 

In [None]:
print("\n\n Test features calculating.")

In [73]:
## Second step ##
avg_auth_test = second_step (testing_set, IDs, node_info, stemmer, stpwds, g)
print("Average links for test calculated.")

Second step - start to calculate features.
10000 examples processed
20000 examples processed
30000 examples processed
2 step - All features calculated
Average links for test calculated.


In [75]:
features_test= []
features_test.append(avg_auth_test)

In [None]:
## Third step ##
features_test = third_step (testing_set, IDs, node_info, stemmer, stpwds, g, pairwise_similarity, w2v)

In [None]:
## Forth step ##
features_4step_test = forth_step(testing_set, IDs, node_info, stemmer, stpwds, g, pairwise_similarity, w2v, graph, 1)

In [None]:
## Put all features together ##
print("Putting all features together...")
features_test.append(features_4step_test)

In [76]:
# Convert list of lists into array
# Documents as rows, unique words as columns (i.e., example as rows, features as columns)
testing_features = np.array(features_test).T
# Scale
#testing_features = preprocessing.scale(testing_features)

In [77]:
np.save(path_to_data + 'avg_auth_test.npy', testing_features)