# Predicting links in Social Networks

In [18]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.utils import resample
import pickle

## Load data

### Like relation

In [2]:
# Per-day "like" feature tables for days 1-5, read in day order and bound to
# like_1 ... like_5 respectively.
like_1, like_2, like_3, like_4, like_5 = map(
    pd.read_csv,
    (
        "data/engineered_features/Like_Features_day_1.csv",
        "data/engineered_features/Like_Features_day_2.csv",
        "data/engineered_features/Like_Features_day_3.csv",
        "data/engineered_features/Like_Features_day_4.csv",
        "data/engineered_features/Like_Features_day_5.csv",
    ),
)

In [3]:
# Per-day "common interest" feature tables, plus the combined days 1-3 table.
common_1 = pd.read_csv("data/engineered_features/common_interest_day_1.csv")
common_2 = pd.read_csv("data/engineered_features/common_interest_day_2.csv")
common_3 = pd.read_csv("data/engineered_features/common_interest_day_3.csv")
# Day 4 lives under its own name so it does not overwrite the day-3 table.
common_4 = pd.read_csv("data/engineered_features/common_interest_day_4.csv")
common_1_3 = pd.read_csv("data/engineered_features/common_interest_combined_day_1_3.csv")

### Group Relation

In [11]:
# Unpickle the object stored in G_features_postings_days_1-3.pkl into G_1_3.
# Presumably the group-relation features for days 1-3 (going by the file name
# and the section heading) - TODO confirm the object's type and schema.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load files that come from a trusted source.
with open("data/engineered_features/G_features_postings_days_1-3.pkl", "rb") as input_file:
    G_1_3 = pickle.load(input_file)

### Posting TFIDF

In [7]:
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``.

    The file is opened in binary mode and closed again before returning.

    NOTE(review): ``pickle.load`` can execute arbitrary code while
    deserializing, so only point this at files from a trusted source.
    """
    with open(path, "rb") as input_file:
        return pickle.load(input_file)


# Posting TF-IDF / graph feature sets. Going by the file names these are the
# positive-only and negative-only examples, each for days 1-3 and for the
# day-4 test split - TODO confirm the schema of the loaded objects.
tfidf_pos_1_3 = _load_pickle(
    "data/engineered_features/postings_tfidf_and_graph_features_only_positive_days_1to3.pkl"
)

tfidf_pos_4_test = _load_pickle(
    "data/engineered_features/postings_tfidf_and_graph_features_only_positive_day_4_test.pkl"
)

tfidf_neg_1_3 = _load_pickle(
    "data/engineered_features/postings_tfidf_and_graph_features_only_negative_days_1to3.pkl"
)

tfidf_neg_4_test = _load_pickle(
    "data/engineered_features/postings_tfidf_and_graph_features_only_negative_day_4_test.pkl"
)

## Process data for ML

### Downsample data

Because of the huge number of nodes, even basic data handling takes a long time on commodity hardware. Therefore, the data sets need to be reduced in size while keeping the characteristics of the original data set.

In [62]:
# `com` is a second name for `common_1_3` (no copy is made), so the label
# column added below is visible through both names.
com = common_1_3
# Binary target: 1 where weight > 0, otherwise 0. The comparison is vectorized
# instead of looping over every row in Python; a NaN weight compares False and
# therefore gets label 0, exactly as a per-row `1 if c > 0 else 0` would.
com["label"] = (com["weight"] > 0).astype("int64")

In [63]:
# 10% sample of the full labelled frame. Drawn without replacement so no row
# appears twice, and seeded so the same rows are selected on every run.
# Used further down as the evaluation set.
com_all_downsample = resample(
    com,
    replace=False,
    n_samples=int(com.shape[0] * 0.1),
    random_state=0,
)

com_neg = com[com.label == 0]
com_pos = com[com.label == 1]

# Balance the classes: keep every positive row and draw an equally sized set
# of distinct negatives. The min() guard keeps the draw valid if negatives are
# ever the smaller class, because sampling without replacement cannot ask for
# more rows than the pool contains.
com_neg_downsample = resample(
    com_neg,
    replace=False,
    n_samples=min(com_pos.shape[0], com_neg.shape[0]),
    random_state=0,
)

com_resampled = pd.concat([com_pos, com_neg_downsample])

# NOTE(review): com_all_downsample and com_resampled are both drawn from
# `com`, so the evaluation sample can contain rows the model is trained on.

In [64]:
# Row count of the balanced frame (all positive rows plus the sampled negatives).
com_resampled.shape[0]

201398

The resampled data set still contains over 200k links, but the label imbalance is now corrected.

## Define columns for training

In [65]:
# Column holding the binary label the model is trained to predict.
target = "label"
# Feature columns passed to the model, in this order.
columns = ["jaccard_coef", "adamic_adar_index",
           "preferential_attachment_index", "clustering_coefficient_score"]

# Training inputs and labels are taken from the class-balanced frame.
X_train, y_train = com_resampled[columns], com_resampled[target]

In [69]:
# Random forest with 30 trees, depth capped at 10, and a fixed seed.
classifier = RandomForestClassifier(
    random_state=0,
    max_depth=10,
    n_estimators=30,
)
# `rfc` is a second name bound to the very same estimator object.
rfc = classifier

## Define evaluation function

In [75]:
def evaluate_model(predictions, actual):
    """Score predicted labels against the true labels.

    Args:
        predictions: Predicted labels.
        actual: Ground-truth labels for the same samples.

    Returns:
        A DataFrame with one row per metric: the metric name in ``Measure``
        and its value in ``Score``. Rows are accuracy, precision, recall and
        F1, in that order.
    """
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
    measures = [label for label, _ in scorers]
    # Every scorer is called with the true labels first, predictions second.
    scores = [scorer(actual, predictions) for _, scorer in scorers]
    return pd.DataFrame({"Measure": measures, "Score": scores})

## Fit classifiers

In [76]:
# Train the random forest on the class-balanced training data.
classifier.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, n_estimators=30, random_state=0)

## Evaluate prediction

In [77]:
# Evaluation inputs and labels come from the 10% sample of the full data set.
# NOTE(review): that sample is drawn from the same frame as the training rows,
# so the two sets can overlap.
X_test, y_test = com_all_downsample[columns], com_all_downsample[target]

In [78]:
# Predict labels for the evaluation sample, then score them against the true
# labels; the bare last expression renders the metric table as the cell output.
predictions = classifier.predict(X_test)
evaluate_model(predictions, y_test)

Unnamed: 0,Measure,Score
0,Accuracy,0.966436
1,Precision,0.128755
2,Recall,0.986828
3,F1,0.227789


# TODO

- Add engineered features
- Split according to days (one day for testing, one for verification?)
- More models
- Grid search for params
- Cross validation
- Explainability?


See this [repo](https://github.com/neo4j-examples/link-prediction) for reference