# Feature engineering process Waldor

## Part 1.

### Import libraries that we will use

In [1]:
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from gensim.models.word2vec import Word2Vec
import nltk
import csv
import igraph
import math

from read_data import *
from graph_creation import *



### Initialization like in public_baseline

In [3]:
# ---First Initializations--- #
# Directory prefix used later in this notebook when saving the feature arrays.
path_to_data = "../data/"
nltk.download('punkt')  # for tokenization
nltk.download('stopwords')  # corpus read on the next line
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

# ---Read Data--- #
# read_data is not defined in this notebook; presumably it comes from the
# `read_data` star import -- TODO confirm.
testing_set, training_set, node_info = read_data()
# First field of every node_info record; used below as the list of node IDs.
IDs = [element[0] for element in node_info]

# ---Create graph--- #
# create_graph is presumably provided by the `graph_creation` star import -- TODO confirm.
g = create_graph(training_set, IDs)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dinar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dinar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
creating graph
graph created


## Part 2.

## Additional to public_baseline graph features 

### 3) shortest_path and edge_connectivity

We create the full undirected graph of relations between the abstracts in order to calculate the shortest path and the edge connectivity using igraph.

In [17]:
# Only one time
def undirected_graph(information_set, IDs):
    """Build an undirected igraph graph from the positively labelled pairs.

    Parameters
    ----------
    information_set : iterable of sequences
        Records whose items 0 and 1 are the two endpoint IDs and whose item 2
        is the label. Only records whose label equals the string "1" become
        edges.
    IDs : list
        Vertex names, passed unchanged to ``Graph.add_vertices``.

    Returns
    -------
    igraph.Graph
        Undirected graph with the vertices from ``IDs`` and one edge per
        positive record.
    """
    edges = [(element[0], element[1]) for element in information_set if element[2] == "1"]
    # Spot-check a few edges. Positions 0, 100 and 100000 used to be indexed
    # unconditionally, which raised IndexError on any smaller input; only the
    # positions that actually exist are printed now.
    sample = [edges[i] for i in (0, 100, 100000) if i < len(edges)]
    print(*sample)
    nodes = IDs

    print ("Edges and nodes for undirected graph prepared.")
    graph = igraph.Graph(directed=False)
    graph.add_vertices(nodes)
    graph.add_edges(edges)

    return graph

The function to calculate the shortest path and the edge connectivity FOR ONE RECORD.

In [5]:
def shortest_path_edge_connectivity(shortest_path, edge_connectivity, edge, graph, source, target):
    """Append the shortest-path length and edge connectivity of one record.

    Parameters
    ----------
    shortest_path : list
        Output list; the shortest-path value for this record is appended.
    edge_connectivity : list
        Output list; the number of edge-disjoint paths is appended.
    edge : object
        Label of the record. When it equals the string "1" the
        (source, target) edge is removed from ``graph`` while measuring and
        re-added afterwards.
    graph : igraph.Graph
        Graph that is queried (and temporarily modified).
    source, target
        Node IDs; they must be present in the module-level list ``IDs``.
        Their positions in ``IDs`` are used as vertex indices, which assumes
        the graph's vertices were added in ``IDs`` order -- TODO confirm.

    Raises
    ------
    ValueError
        If ``source`` or ``target`` is not in ``IDs``.
    """
    index_source = IDs.index(source)
    index_target = IDs.index(target)

    # A known link is hidden so the measurement does not see the link itself.
    hide_edge = (edge == "1")
    if hide_edge:
        graph.delete_edges([(source, target)])
    try:
        distance = graph.shortest_paths_dijkstra(source=index_source, target=index_target)[0][0]
        connectivity = graph.edge_disjoint_paths(source=index_source, target=index_target)
    finally:
        # Always put the edge back, even on an error or a keyboard interrupt,
        # otherwise the shared graph silently loses an edge.
        if hide_edge:
            graph.add_edges([(source, target)])

    # Append only once both values are known so the two lists stay aligned.
    edge_connectivity.append(connectivity)
    shortest_path.append(distance)

## Part 3.

## Feature engineering

Clean function from feature_engineering.py

In [6]:
def clean(s, stemmer, stpwds):
    """Turn a raw string into a list of normalised tokens.

    The string is lower-cased and split on single spaces. Tokens found in
    ``stpwds`` are dropped, the rest are passed through ``stemmer.stem``,
    stripped of digit characters, and kept only when the result is between
    3 and 25 characters long (inclusive).
    """
    kept = []
    for word in s.lower().split(" "):
        if word in stpwds:
            continue
        stemmed = stemmer.stem(word)
        # Digits are removed after stemming, before the length checks.
        letters = ''.join(ch for ch in stemmed if not ch.isdigit())
        if 2 < len(letters) <= 25:
            kept.append(letters)
    return kept

Feature_engineering function.

In [7]:
#def first_step (information_set, node_info, IDs) :
    # Undirected graph to calculate shortest_path and edge_connectivity
 #   graph = undirected_graph(information_set, IDs)
  #  print ("Undirected graph created.")
   # print ("First step finished.\n")
    #return graph

In [87]:
# Vertices: every ID. Edges: the training pairs whose label is the string "1".
nodes = IDs
edges = [(row[0], row[1]) for row in training_set if row[2] == "1"]
# Spot-check three edges (needs more than 100000 positive pairs).
print(edges[0], edges[100], edges[100000])

print ("Edges and nodes for undirected graph prepared.")
graph = igraph.Graph(directed=False)
graph.add_vertices(nodes)
graph.add_edges(edges)

# Vertex and edge counts as a sanity check.
print(len(graph.vs), len(graph.es))

('9510123', '9502114') ('205265', '105095') ('101119', '1082')
Edges and nodes for undirected graph prepared.
27770 335130


In [84]:
len(edges)

335130

In [88]:
def forth_step(information_set, IDs, node_info, stemmer, stpwds, g, TEST):
    """Compute shortest-path length and edge connectivity for every record.

    Each (source, target) pair is measured on the module-level undirected
    igraph object ``graph`` (built in an earlier cell). When a record is
    labelled "1", its own edge is removed from ``graph`` for the duration of
    the measurement and re-added afterwards.

    Parameters
    ----------
    information_set : sequence of sequences
        Items 0 and 1 of each record are the source and target IDs. Item 2 is
        the label and is only read when ``TEST`` is falsy.
    IDs : list
        Node IDs. The position of an ID in this list is used as its vertex
        index in ``graph``.
    node_info, stemmer, stpwds, g
        Not used by this step; kept so existing calls keep working.
    TEST : bool or int
        Truthy for unlabelled data: no edge is ever removed.

    Returns
    -------
    list
        ``[shortest_path, edge_connectivity]``: two lists holding one value
        per record, in record order.

    Raises
    ------
    ValueError
        If a source or target ID is not present in ``IDs``.
    """
    shortest_path = []
    edge_connectivity = []

    # Position of the first occurrence of each ID: same answer as IDs.index(),
    # without a linear scan of IDs for every record.
    position = {}
    for idx, node_id in enumerate(IDs):
        position.setdefault(node_id, idx)

    print ("Forth step - start to calculate features.")

    #### ---- Calculating features ---- ####
    for counter, record in enumerate(information_set, start=1):
        source = record[0]
        target = record[1]
        # Test records carry no label, so they never match "1" below.
        edge = 0 if TEST else record[2]

        try:
            index_source = position[source]
            index_target = position[target]
        except KeyError as missing:
            # Keep the exception type that list.index() raised.
            raise ValueError("%r is not in list" % (missing.args[0],)) from None

        # A known link is hidden so the measurement does not see the link itself.
        hide_edge = (edge == "1")
        if hide_edge:
            graph.delete_edges([(index_source, index_target)])
        try:
            distance = graph.shortest_paths_dijkstra(source=index_source, target=index_target)[0][0]
            connectivity = graph.edge_disjoint_paths(source=index_source, target=index_target)
        finally:
            # Restore the edge even if the computation fails or is interrupted,
            # otherwise the shared graph permanently loses it.
            if hide_edge:
                graph.add_edges([(index_source, index_target)])

        # Append only once both values are known so the two lists stay aligned.
        shortest_path.append(distance)
        edge_connectivity.append(connectivity)

        if counter % 500 == 0:
            print ("\t\t\t", counter, "examples processed")

    print ("4 step - All features calculated")

    #### ---- Final features array ---- ####
    # Built locally: the original appended to an undefined `list_of_features`.
    list_of_features = [shortest_path, edge_connectivity]

    return list_of_features

## Part 4.

Calculate the features and save them to files.

### Common part for training and testing sets

In [18]:
#graph = first_step (training_set, node_info, IDs)

('9510123', '9502114') ('205265', '105095') ('101119', '1082')
Edges and nodes for undirected graph prepared.
Undirected graph created.
First step finished.



### 1. Training set

In [19]:
print("Train features calculating.")

Train features calculating.


In [20]:
features_train= []

In [89]:
## Forth step ##
# Last argument is TEST=0, so forth_step reads the label of each training record.
# forth_step also uses the undirected `graph` built in an earlier cell.
features = forth_step(training_set, IDs, node_info, stemmer, stpwds, g, 0)

Forth step - start to calculate features.
9510123 9502114 1
9707075 9604178 1
9312155 9506142 0
9911255 302165 0
9701033 209076 0
9710020 9709228 1
9901042 9510135 1
209146 9502077 0
9705079 9702201 1


KeyboardInterrupt: 

In [None]:
# Convert list of lists into array
# `features` is what forth_step returned (its shortest_path and edge_connectivity lists);
# transposing puts examples in rows and features in columns.
training_features = np.array(features).T

In [67]:
# Scaling is disabled (the call below is commented out), so the features are saved unscaled.
#training_features = preprocessing.scale(training_features)
np.save(path_to_data + 'shortest_path_edge_connectivity_train.npy', training_features)

### 2. Testing set 

In [None]:
print("Test features calculating.")

In [75]:
features_test= []

In [None]:
## Forth step ##
# forth_step takes exactly seven positional arguments; the undirected `graph`
# is read from the notebook namespace inside the function, so it must not be
# passed here (the extra argument raised TypeError). Last argument is TEST=1.
features_test = forth_step(testing_set, IDs, node_info, stemmer, stpwds, g, 1)

In [76]:
# Convert list of lists into array
# `features_test` is expected to hold one list per feature (forth_step's return value);
# transposing puts examples in rows and features in columns.
testing_features = np.array(features_test).T
# Scaling is disabled (the call below is commented out).
#testing_features = preprocessing.scale(testing_features)

In [77]:
# Save the (unscaled) test feature array under path_to_data.
np.save(path_to_data + 'shortest_path_edge_connectivity_test.npy', testing_features)