# Feature engineering process Waldor

## Part 1.

### Import libraries that we will use

In [1]:
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from gensim.models.word2vec import Word2Vec
import nltk
import csv
import igraph
import math

from read_data import *
from graph_creation import *



### Initialization like in public_baseline

In [2]:
# ---First Initializations--- #
# Directory (relative to the notebook) where input data and output feature files live.
path_to_data = "../data/"
nltk.download('punkt')  # for tokenization
nltk.download('stopwords')  # must be downloaded before the stop-word list is read below
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

# ---Read Data--- #
# read_data comes from the project-local read_data module (star import).
testing_set, training_set, node_info = read_data()
# First field of every node_info row, kept in node_info order; used as the node identifiers.
IDs = [element[0] for element in node_info]

# ---Compute TFIDF vector of each paper--- #
# Field 5 of each node_info row is the text the TF-IDF model is fitted on
# (later cells treat this field as the paper abstract).
corpus = [element[5] for element in node_info]
vectorizer = TfidfVectorizer(stop_words="english")
# each row is a node in the order of node_info
features_TFIDF = vectorizer.fit_transform(corpus)
# Product of the TF-IDF matrix with its transpose: entry [i, j] is the dot product of
# the TF-IDF rows of nodes i and j (presumably a cosine similarity, given the
# vectorizer's default row normalisation - confirm).
pairwise_similarity = features_TFIDF * features_TFIDF.T
# The result is indexed later as pairwise_similarity[index_source, index_target].
# ---Create graph--- #
# create_graph comes from the project-local graph_creation module (star import).
g = create_graph(training_set, IDs)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dinar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dinar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
creating graph
graph created


## Part 2.

## Graph features added on top of public_baseline

### 1) Word2Vec

In [4]:
def build_w2v(node_info, stemmer, stpwds):
    """Load the cached Word2Vec model, or build it from the abstracts and cache it.

    The model is first looked up on disk under the name "w2v_model". If loading
    fails for any reason, a new model is created: its vocabulary is built from the
    cleaned text in field 5 of every ``node_info`` row, vectors of words that also
    appear in the GoogleNews file are loaded on top of it, and the model is saved
    to "w2v_model" for the next run.

    Parameters
    ----------
    node_info : sequence of rows; field 5 of each row is the text to clean.
    stemmer : object passed through to ``clean``.
    stpwds : stop-word collection passed through to ``clean``.

    Returns
    -------
    The loaded or newly built Word2Vec model.
    """
    import os  # function-scope import: only needed to build the vectors path

    try:
        model = Word2Vec.load("w2v_model")
        print ("Word2Vec model loaded")
    except Exception:
        # Best-effort cache: any load failure triggers a rebuild, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        path_to_google_news = '../data'
        my_q = 300 # to match dim of GNews word vectors
        mcount = 5
        model = Word2Vec(size=my_q, min_count=mcount)
        cleaned_abstracts = [clean(element[5], stemmer, stpwds) for element in node_info]
        print ("Building Word2Vec vocab...")
        model.build_vocab(cleaned_abstracts)
        print ("Loading intersect vectors...")
        # Join with a separator: plain concatenation produced
        # '../dataGoogleNews-vectors-negative300.bin.gz', which does not exist.
        vectors_file = os.path.join(path_to_google_news, 'GoogleNews-vectors-negative300.bin.gz')
        model.intersect_word2vec_format(vectors_file, binary=True)
        model.save("w2v_model")
        print ("Model saved to disk")
    return model

### 2) avg_number_citations_of_authors

The function below builds a dictionary that maps each (author1, author2) pair to the number of references from papers by author1 to papers by author2.

For example, if author1 references author2 in three articles:

    (author1, author2) : 3

In [19]:
# Only one time
def count_authLinksStoT (information_set, node_info):
    """Count, for every (source author, target author) pair, how many records link them.

    For each record of ``information_set`` the author field (field 3) of the source
    node (record[0]) and of the target node (record[1]) is split on "," and every
    (source author, target author) combination is counted once. Every record is
    counted, whatever other fields it carries.

    Parameters
    ----------
    information_set : sequence of records whose first two fields are node IDs.
    node_info : sequence of rows; field 0 is the node ID, field 3 the
        comma-separated author string.

    Returns
    -------
    dict mapping (source_author, target_author) tuples to an int count.

    Raises
    ------
    KeyError if a record references an ID that is absent from ``node_info``.
    """
    # One pass over node_info instead of a full scan per record. setdefault keeps
    # the first row for a given ID, like a "first match" linear search would.
    info_by_id = {}
    for element in node_info:
        info_by_id.setdefault(element[0], element)

    authLinks = {}
    for record in information_set:
        # NOTE(review): author names are used exactly as produced by split(","),
        # i.e. surrounding spaces are kept. The previous `s.replace(' ', '')` loops
        # discarded their result and had no effect. Callers look pairs up with the
        # same raw split, so normalising here alone would break those look-ups.
        source_auth = info_by_id[record[0]][3].split(",")
        target_auth = info_by_id[record[1]][3].split(",")

        for s in source_auth:
            for t in target_auth:
                key = (s, t)
                authLinks[key] = authLinks.get(key, 0) + 1
    return authLinks

Compute, FOR ONE RECORD, the average number of citations that the authors of the target have received from the authors of the source.

In [6]:
def avg_number_citations(avg_number_citations_of_authors, source_auth, target_auth, authLinks):
    """Append the mean link count between source and target authors for one record.

    Every (source author, target author) combination that exists as a key in
    ``authLinks`` contributes its count; the mean of those counts is appended to
    ``avg_number_citations_of_authors``. When no combination is present, 0 is
    appended instead. The output list is modified in place; nothing is returned.
    """
    found_counts = [
        authLinks[(src, tgt)]
        for src in source_auth
        for tgt in target_auth
        if (src, tgt) in authLinks
    ]
    if found_counts:
        mean_count = sum(found_counts) / len(found_counts)
    else:
        mean_count = 0
    avg_number_citations_of_authors.append(mean_count)
    return

### 3) shortest_path and edge_connectivity

We create the full undirected graph of relations between the abstracts and use igraph to calculate the shortest path and the edge connectivity.

In [8]:
# Only one time
def undirected_graph(information_set, IDs):
    """Build an undirected igraph graph from the records flagged as linked.

    All entries of ``IDs`` are added as vertices. A record contributes an edge
    between its first two fields only when its third field equals the string "1".

    Returns the populated ``igraph.Graph``.
    """
    graph = igraph.Graph(directed=False)
    graph.add_vertices(IDs)

    linked_pairs = []
    for record in information_set:
        if record[2] == "1":
            linked_pairs.append((record[0], record[1]))
    graph.add_edges(linked_pairs)

    return graph

The function below calculates the shortest path and the edge connectivity FOR ONE RECORD.

In [10]:
def shortest_path_edge_connectivity(shortest_path, edge_connectivity, edge, graph, source, target):
    """Append the shortest-path length and edge connectivity of one (source, target) pair.

    When the record is a known link (``int(edge) == 1``), the direct edge is removed
    before the two measures are taken and added back afterwards, so that the
    features do not simply encode the label.

    Parameters
    ----------
    shortest_path : list, receives ``graph.shortest_paths_dijkstra(...)[0][0]``.
    edge_connectivity : list, receives ``graph.edge_disjoint_paths(...)``.
    edge : label of the record; anything ``int()`` accepts.
    graph : graph object (built by ``undirected_graph``) whose vertices can be
        addressed by the ``source`` / ``target`` identifiers.
    source, target : vertex identifiers of the pair.

    Both output lists are modified in place; nothing is returned.
    """
    is_known_link = int(edge) == 1
    if is_known_link:
        graph.delete_edges([(source, target)])
    try:
        # The vertices are addressed through `source` / `target`, the same
        # identifiers already used for delete_edges / add_edges. The previous code
        # used `index_source` / `index_target`, which are not defined in this
        # function and raised NameError.
        val = graph.shortest_paths_dijkstra(source=source, target=target)[0][0]
        connectivity = graph.edge_disjoint_paths(source=source, target=target)
    finally:
        # Put the edge back even if a query fails, so the shared graph is not
        # left modified for the following records.
        if is_known_link:
            graph.add_edges([(source, target)])
    edge_connectivity.append(connectivity)
    shortest_path.append(val)

## Part 3.

## Feature engineering

The `clean` function, taken from feature_engineering.py.

In [11]:
def clean(s, stemmer, stpwds):
    """Turn a raw string into a list of normalised tokens.

    The string is lower-cased and split on single spaces. Tokens found in
    ``stpwds`` are dropped; the rest are stemmed with ``stemmer.stem``, stripped
    of digit characters, and kept only if the result is 3 to 25 characters long.
    """
    kept = []
    for raw_token in s.lower().split(" "):
        if raw_token in stpwds:
            continue
        stemmed = stemmer.stem(raw_token)
        # remove digits
        letters_only = ''.join(ch for ch in stemmed if not ch.isdigit())
        # keep tokens of at least 3 and at most 25 characters
        if 2 < len(letters_only) <= 25:
            kept.append(letters_only)
    return kept

Feature_engineering function.

In [16]:
def feature_engineering1step(information_set, IDs, node_info, stemmer, stpwds, g, pairwise_similarity, w2v):
    """Compute the full feature matrix for every record of ``information_set``.

    Columns of the returned array, in order:
        0  overlap_title                  - number of cleaned title tokens in common
        1  temp_diff                      - int(source field 1) - int(target field 1)
        2  comm_auth                      - number of common author strings
        3  wmd                            - w2v.wv.wmdistance between cleaned abstracts
        4  num_references_source          - degree of the source node in ``g``
        5  num_references_target          - degree of the target node in ``g``
        6  num_common_neighbors           - shared neighbours of the two nodes in ``g``
        7  avg_number_citations_of_authors
        8  pairwise_similarity_number     - pairwise_similarity[source idx, target idx]
        9  shortest_path
        10 edge_connectivity

    Parameters
    ----------
    information_set : sequence of records (source ID, target ID, edge label).
    IDs : list of node IDs; positions index ``pairwise_similarity``, ``degrees``
        and the neighbour list.
    node_info : sequence of rows; fields 0, 1, 2, 3 and 5 are read (ID, a value
        convertible to int, title, comma-separated authors, abstract).
    stemmer, stpwds : passed through to ``clean``.
    g : graph object providing ``degree`` and ``neighbors`` for the IDs.
    pairwise_similarity : 2-D structure indexable as ``[i, j]``.
    w2v : model exposing ``wv.wmdistance``.

    Returns
    -------
    Array with one row per record and one column per feature, standardised with
    ``preprocessing.scale``.

    Raises
    ------
    KeyError if a record references an ID missing from ``IDs`` or ``node_info``.
    """
    ### ---- Baseline features arrays ---- ####
    # number of overlapping words in title
    overlap_title = []
    # temporal distance between the papers
    temp_diff = []
    # number of common authors
    comm_auth = []

    ### ---- Generated features arrays ---- ####
    # WMD
    wmd = []
    # number of references for the source or the target
    num_references_source = []
    num_references_target = []
    # number of common neighbors
    num_common_neighbors = []
    # TF_IDF
    pairwise_similarity_number = []
    # the average number of citations the authors of target have received from authors of source
    avg_number_citations_of_authors = []
    # shortest path between the nodes of abstract graph
    shortest_path = []
    # edge connectivity between the nodes of abstract graph
    # (was never initialised before, which raised NameError on first use)
    edge_connectivity = []

    #### ---- Preparation functions ---- ####
    # Authors link dictionary
    authLinks = count_authLinksStoT(information_set, node_info)
    # Undirected graph to calculate shortest_path and edge_connectivity
    graph = undirected_graph(information_set, IDs)
    # More useful variables
    degrees = g.degree(IDs)
    neighbors_list = [set(g.neighbors(node_id)) for node_id in IDs]

    # Look-up tables built once, replacing an IDs.index() call and a full scan of
    # node_info per record. setdefault keeps the first occurrence, which is what
    # list.index and a "first match" scan return.
    index_by_id = {}
    for position, node_id in enumerate(IDs):
        index_by_id.setdefault(node_id, position)
    info_by_id = {}
    for element in node_info:
        info_by_id.setdefault(element[0], element)

    #### ---- Calculating features ---- ####
    for counter, record in enumerate(information_set, start=1):
        source = record[0]
        target = record[1]
        edge = record[2]

        index_source = index_by_id[source]
        index_target = index_by_id[target]

        source_info = info_by_id[source]
        target_info = info_by_id[target]

        source_title = clean(source_info[2], stemmer, stpwds)
        target_title = clean(target_info[2], stemmer, stpwds)

        source_auth = source_info[3].split(",")
        target_auth = target_info[3].split(",")

        source_abstract = clean(source_info[5], stemmer, stpwds)
        target_abstract = clean(target_info[5], stemmer, stpwds)

        # Baseline features
        overlap_title.append(len(set(source_title).intersection(set(target_title))))
        temp_diff.append(int(source_info[1]) - int(target_info[1]))
        comm_auth.append(len(set(source_auth).intersection(set(target_auth))))
        # Generated features
        wmd.append(w2v.wv.wmdistance(source_abstract, target_abstract))
        num_references_source.append(degrees[index_source])
        num_references_target.append(degrees[index_target])
        num_common_neighbors.append(len(neighbors_list[index_source].intersection(neighbors_list[index_target])))
        avg_number_citations(avg_number_citations_of_authors, source_auth, target_auth, authLinks)
        pairwise_similarity_number.append(pairwise_similarity[index_source, index_target])
        shortest_path_edge_connectivity(shortest_path, edge_connectivity, edge, graph, source, target)

        if counter % 1000 == 0:
            print (counter, "examples processed")

    print ("All features calculated")

    #### ---- Final features array ---- ####
    # The order below defines the column order of the returned array.
    list_of_features = [
        overlap_title,
        temp_diff,
        comm_auth,
        wmd,
        num_references_source,
        num_references_target,
        num_common_neighbors,
        avg_number_citations_of_authors,
        pairwise_similarity_number,
        shortest_path,
        edge_connectivity,
    ]

    # Convert list of lists into array (examples as rows, features as columns)
    features = np.array(list_of_features).T
    # Scale
    features = preprocessing.scale(features)

    return features

In [None]:
def feature_engineering2step(information_set, IDs, node_info, stemmer, stpwds, g, pairwise_similarity, w2v):
    """Compute only the two graph-distance features for every record.

    Columns of the returned array, in order:
        0  shortest_path      - shortest-path length between source and target
        1  edge_connectivity  - number of edge-disjoint paths between them

    NOTE(review): the previous body assembled its result from nine lists
    (overlap_title, temp_diff, comm_auth, wmd, ...) that were never defined in
    this function, so it always raised NameError. It now returns only the two
    features it actually computes - confirm that this is the intended output of
    the "second step".

    Parameters
    ----------
    information_set : sequence of records (source ID, target ID, edge label).
    IDs : list of node IDs used as vertices of the undirected graph.
    node_info, stemmer, stpwds, g, pairwise_similarity, w2v : unused; kept so the
        signature stays parallel to ``feature_engineering1step``.

    Returns
    -------
    Array with one row per record and two columns, standardised with
    ``preprocessing.scale``.
    """
    # shortest path between the nodes of abstract graph
    shortest_path = []
    edge_connectivity = []

    #### ---- Preparation functions ---- ####
    # Undirected graph to calculate shortest_path and edge_connectivity
    graph = undirected_graph(information_set, IDs)

    #### ---- Calculating features ---- ####
    for counter, record in enumerate(information_set, start=1):
        source = record[0]
        target = record[1]
        edge = record[2]

        shortest_path_edge_connectivity(shortest_path, edge_connectivity, edge, graph, source, target)

        if counter % 1000 == 0:
            print (counter, "examples processed")

    print ("All features calculated")

    #### ---- Final features array ---- ####
    list_of_features = [shortest_path, edge_connectivity]

    # Convert list of lists into array (examples as rows, features as columns)
    features = np.array(list_of_features).T
    # Scale
    features = preprocessing.scale(features)

    return features

## Part 4.

Calculate w2v one time for training and testing set.

In [17]:
# W2V
# Built once and shared by the training and testing feature computations.
# build_w2v first tries to load "w2v_model" from disk and only rebuilds (and saves)
# the model when loading fails.
w2v = build_w2v(node_info, stemmer, stpwds)

Word2Vec model loaded


Calculate the features and save them to files.

In [None]:
# Compute the feature matrix of the training pairs and store it as a .npy file
# under path_to_data.
# NOTE(review): no cell of this notebook defines `feature_engineering`; only
# feature_engineering1step and feature_engineering2step are defined here. Unless one
# of the star imports provides that name, this line raises NameError - confirm which
# function is meant to be called.
training_features = feature_engineering(training_set, IDs, node_info, stemmer, stpwds, g, pairwise_similarity, w2v)
np.save(path_to_data + 'training_features_test_parted.npy', training_features)

In [None]:
# Compute the feature matrix of the testing pairs and store it as a .npy file
# under path_to_data.
# NOTE(review): no cell of this notebook defines `feature_engineering`; only
# feature_engineering1step and feature_engineering2step are defined here. Unless one
# of the star imports provides that name, this line raises NameError - confirm which
# function is meant to be called.
testing_features = feature_engineering(testing_set, IDs, node_info, stemmer, stpwds, g, pairwise_similarity, w2v)
np.save(path_to_data + 'testing_features_test_parted.npy', testing_features)