# Predicting links in Social Networks

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import *
from sklearn.utils import resample
import pickle

## Load data

### Like relation

In [2]:
# Engineered features for the "like" relation, one DataFrame per day (days 1-5).
like_1, like_2, like_3, like_4, like_5 = map(
    pd.read_csv,
    [
        "data/engineered_features/Like_Features_day_1.csv",
        "data/engineered_features/Like_Features_day_2.csv",
        "data/engineered_features/Like_Features_day_3.csv",
        "data/engineered_features/Like_Features_day_4.csv",
        "data/engineered_features/Like_Features_day_5.csv",
    ],
)

In [3]:
# Engineered features for the common-interest relation: one DataFrame per day
# plus one frame that combines days 1-3.
common_1 = pd.read_csv("data/engineered_features/common_interest_day_1.csv")
common_2 = pd.read_csv("data/engineered_features/common_interest_day_2.csv")
common_3 = pd.read_csv("data/engineered_features/common_interest_day_3.csv")
# The day-4 file gets its own name. It used to be assigned to `common_3`,
# which silently discarded the day-3 frame loaded on the line above.
common_4 = pd.read_csv("data/engineered_features/common_interest_day_4.csv")
common_1_3 = pd.read_csv("data/engineered_features/common_interest_combined_day_1_3.csv")

### Group Relation

In [4]:
# Group-relation features for postings of days 1-3.
# NOTE(review): pickle can execute code embedded in the file while loading;
# only load artifacts from a trusted source.
group_features_path = "data/engineered_features/G_features_postings_days_1-3.pkl"
with open(group_features_path, mode="rb") as input_file:
    G_1_3 = pickle.load(input_file)

### Posting TFIDF + graph features

In [5]:
def _load_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object.

    NOTE(review): pickle can execute code embedded in the file while loading;
    only use this on trusted artifacts.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Positive and negative posting samples (TFIDF + graph features):
# days 1-3 and the separate day-4 test files.
tfidf_pos_1_3 = _load_pickle(
    "data/engineered_features/postings_tfidf_and_graph_features_only_positive_days_1to3.pkl"
)
tfidf_pos_4_test = _load_pickle(
    "data/engineered_features/postings_tfidf_and_graph_features_only_positive_day_4_test.pkl"
)
tfidf_neg_1_3 = _load_pickle(
    "data/engineered_features/postings_tfidf_and_graph_features_only_negative_days_1to3.pkl"
)
tfidf_neg_4_test = _load_pickle(
    "data/engineered_features/postings_tfidf_and_graph_features_only_negative_day_4_test.pkl"
)

## Process data for ML

### Downsample data

In [6]:
# Fraction of a full data set's row count that is drawn when downsampling it
# with `resample`.
TEST_SET_DOWN_FACTOR = 0.3
# Multiplied with twice the positive-row count to size the final resampled
# training frames.
TRAIN_SET_DOWN_FACTOR = 0.1
# Number of negative rows drawn per positive row before the final training
# resample.
TRAIN_SET_NEG_OVERBALANCE = 2

Because of the huge number of nodes, even basic data handling takes a long time on commodity hardware. Therefore the data sets need to be reduced in size while keeping the characteristics of the original data set.

#### For the common interest relationship

In [7]:
# Build the labelled frame as a new object: plain assignment would only alias
# `common_1_3`, so adding the column would also modify the loaded data.
# Binary target: 1 where the link weight is positive, 0 otherwise (computed
# with a vectorised comparison instead of a Python-level loop).
com = common_1_3.assign(label=common_1_3["weight"].gt(0).astype("int64"))

In [8]:
# All `resample` calls are seeded (same seed as the classifiers) so that a
# fresh Restart & Run All reproduces the same samples and scores.
# `resample` draws with replacement unless `replace=False` is passed.

# Downsampled view of the full labelled data, later used for evaluation.
# NOTE(review): this is drawn from the same frame as the training sample
# below, so rows may appear in both - TODO confirm this is intended.
com_all_downsample = resample(
    com,
    n_samples=int(com.shape[0] * TEST_SET_DOWN_FACTOR),
    random_state=0,
)

com_neg = com[com.label == 0]
com_pos = com[com.label == 1]

# Draw TRAIN_SET_NEG_OVERBALANCE negatives per positive.
com_neg_downsample = resample(
    com_neg,
    n_samples=int(com_pos.shape[0] * TRAIN_SET_NEG_OVERBALANCE),
    random_state=0,
)

# Final training sample drawn from positives plus the reduced negatives.
com_resampled = resample(
    pd.concat([com_pos, com_neg_downsample]),
    n_samples=int(com_pos.shape[0] * 2 * TRAIN_SET_DOWN_FACTOR),
    random_state=0,
)

In [9]:
# Number of rows in the common-interest training sample.
len(com_resampled)

20139

The resampled data structure still contains over 200k links (not considering train set downscaling), but now the label imbalance is corrected.

#### For the like relationship

In [10]:
# Stack the five per-day frames into one data set.
like_all = pd.concat([like_1, like_2, like_3, like_4, like_5])
# Binary target: 1 where the current link weight is positive, 0 otherwise.
# Vectorised comparison instead of a Python-level loop over every row;
# `.to_numpy()` assigns by position because the concatenated index keeps the
# per-day labels and may contain duplicates.
like_all["label"] = like_all["current_weight"].gt(0).astype("int64").to_numpy()

In [11]:
# All `resample` calls are seeded (same seed as the classifiers) so that a
# fresh Restart & Run All reproduces the same samples and scores.
# `resample` draws with replacement unless `replace=False` is passed.

# Downsampled view of the full labelled data, later used for evaluation.
# NOTE(review): this is drawn from the same frame as the training sample
# below, so rows may appear in both - TODO confirm this is intended.
like_all_downsampled = resample(
    like_all,
    n_samples=int(like_all.shape[0] * TEST_SET_DOWN_FACTOR),
    random_state=0,
)

like_neg = like_all[like_all.label == 0]
like_pos = like_all[like_all.label == 1]

# Draw TRAIN_SET_NEG_OVERBALANCE negatives per positive.
like_neg_downsampled = resample(
    like_neg,
    n_samples=int(like_pos.shape[0] * TRAIN_SET_NEG_OVERBALANCE),
    random_state=0,
)

# Final training sample drawn from positives plus the reduced negatives.
like_resampled = resample(
    pd.concat([like_pos, like_neg_downsampled]),
    n_samples=int(like_pos.shape[0] * 2 * TRAIN_SET_DOWN_FACTOR),
    random_state=0,
)

In [12]:
# Number of rows in the like-relation training sample.
len(like_resampled)

41171

The resampled data structure still contains over 400k (not considering train set downscaling) links, but now the label imbalance is corrected.

#### For group relation using TFIDF


In [13]:
# Join negative and positive samples: days 1-3 form the training pool,
# the day-4 files form the test set.
train_parts = [tfidf_neg_1_3, tfidf_pos_1_3]
test_parts = [tfidf_neg_4_test, tfidf_pos_4_test]
tfidf_all_train = pd.concat(train_parts)
tfidf_all_test = pd.concat(test_parts)

In [14]:
# All `resample` calls are seeded (same seed as the classifiers) so that a
# fresh Restart & Run All reproduces the same samples and scores.
# `resample` draws with replacement unless `replace=False` is passed.

# NOTE(review): `tfidf_all_train_downsampled` is not referenced by any later
# cell visible in this notebook (evaluation uses `tfidf_all_test`); kept so
# the cell still defines the same names.
tfidf_all_train_downsampled = resample(
    tfidf_all_train,
    n_samples=int(tfidf_all_train.shape[0] * TEST_SET_DOWN_FACTOR),
    random_state=0,
)

# Draw TRAIN_SET_NEG_OVERBALANCE negatives per positive.
tfidf_train_neg_downsampled = resample(
    tfidf_neg_1_3,
    n_samples=int(tfidf_pos_1_3.shape[0] * TRAIN_SET_NEG_OVERBALANCE),
    random_state=0,
)

# Final training sample drawn from positives plus the reduced negatives.
tfidf_train_resampled = resample(
    pd.concat([tfidf_pos_1_3, tfidf_train_neg_downsampled]),
    n_samples=int(tfidf_pos_1_3.shape[0] * 2 * TRAIN_SET_DOWN_FACTOR),
    random_state=0,
)

In [15]:
# Number of rows in the TFIDF training sample.
len(tfidf_train_resampled)

9119

The resampled data structure still contains over 90k (not considering train set downscaling) links, but now the label imbalance is corrected.

## Define columns for training

In [16]:
# Name of the binary target column.
target = "label"

# Feature columns used for the common-interest and like models.
graph_features = [
    "jaccard_coef",
    "adamic_adar_index",
    "preferential_attachment_index",
    "clustering_coefficient_score",
]

# Feature columns used for the TFIDF model: per-user clustering and pagerank
# columns in place of the single clustering score.
larger_graph_features = [
    "jaccard_coef",
    "adamic_adar_index",
    "preferential_attachment_index",
    "clustering_coefficient_score_Source_User",
    "clustering_coefficient_score_Target_User",
    "pagerank_Source_User",
    "pagerank_Target_User",
]

# Column names "0_TFIDF" ... "499_TFIDF".
tfidf_features = [f'{i}_TFIDF' for i in range(500)]

# Training matrices and targets, one pair per relation.
X_train_com, y_train_com = com_resampled[graph_features], com_resampled[target]
X_train_like, y_train_like = like_resampled[graph_features], like_resampled[target]
X_train_tfidf, y_train_tfidf = (
    tfidf_train_resampled[larger_graph_features + tfidf_features],
    tfidf_train_resampled[target],
)

In [17]:
# One independent, identically configured estimator of each kind per
# relation (common interest, like, TFIDF).
rfc_com, rfc_like, rfc_tfidf = (
    RandomForestClassifier(n_estimators=50, max_depth=40, random_state=0)
    for _ in range(3)
)
svm_com, svm_like, svm_tfidf = (SVC() for _ in range(3))
knn_com, knn_like, knn_tfidf = (
    KNeighborsClassifier(n_neighbors=10) for _ in range(3)
)

## Define evaluation function

In [18]:
def evaluate_model(predictions, actual):
    """Tabulate accuracy, precision, recall and F1 for a set of predictions.

    Parameters
    ----------
    predictions : array-like
        Predicted labels.
    actual : array-like
        True labels; passed to each scorer as the first argument.

    Returns
    -------
    pandas.DataFrame
        Four rows with the columns ``Measure`` (metric name) and ``Score``
        (metric value), in the order Accuracy, Precision, Recall, F1.
    """
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
    measure_names = [name for name, _ in scorers]
    score_values = [scorer(actual, predictions) for _, scorer in scorers]
    return pd.DataFrame({"Measure": measure_names, "Score": score_values})

## Fit classifiers

### RF

In [19]:
# Train the random forest for the common-interest relation.
rfc_com.fit(X=X_train_com, y=y_train_com)

RandomForestClassifier(max_depth=40, n_estimators=50, random_state=0)

In [20]:
# Train the random forest for the like relation.
rfc_like.fit(X=X_train_like, y=y_train_like)

RandomForestClassifier(max_depth=40, n_estimators=50, random_state=0)

In [21]:
# Train the random forest on the TFIDF + graph features.
rfc_tfidf.fit(X=X_train_tfidf, y=y_train_tfidf)

RandomForestClassifier(max_depth=40, n_estimators=50, random_state=0)

### SVM

In [22]:
# Train the SVM for the common-interest relation.
svm_com.fit(X=X_train_com, y=y_train_com)

SVC()

In [23]:
# Train the SVM for the like relation.
svm_like.fit(X=X_train_like, y=y_train_like)

SVC()

In [24]:
# Train the SVM on the TFIDF + graph features.
svm_tfidf.fit(X=X_train_tfidf, y=y_train_tfidf)

SVC()

### KNN

In [25]:
# Train the k-nearest-neighbours model for the common-interest relation.
knn_com.fit(X=X_train_com, y=y_train_com)

KNeighborsClassifier(n_neighbors=10)

In [26]:
# Train the k-nearest-neighbours model for the like relation.
knn_like.fit(X=X_train_like, y=y_train_like)

KNeighborsClassifier(n_neighbors=10)

In [27]:
# Train the k-nearest-neighbours model on the TFIDF + graph features.
knn_tfidf.fit(X=X_train_tfidf, y=y_train_tfidf)

KNeighborsClassifier(n_neighbors=10)

## Evaluate predictions

In [28]:
# Test matrix and target for the common-interest relation, taken from the
# downsampled full data set.
X_test_com, y_test_com = com_all_downsample[graph_features], com_all_downsample[target]

In [29]:
# Test matrix and target for the like relation, taken from the downsampled
# full data set.
X_test_like, y_test_like = like_all_downsampled[graph_features], like_all_downsampled[target]

In [30]:
# Test matrix and target for the TFIDF model, taken from the day-4 test data.
X_test_tfidf, y_test_tfidf = (
    tfidf_all_test[larger_graph_features + tfidf_features],
    tfidf_all_test[target],
)

### RF evaluation

In [31]:
# Random forest, common-interest relation: predict on the test features and
# tabulate the scores.
predictions = rfc_com.predict(X=X_test_com)
evaluate_model(predictions=predictions, actual=y_test_com)

Unnamed: 0,Measure,Score
0,Accuracy,0.973799
1,Precision,0.157521
2,Recall,0.971064
3,F1,0.27107


In [32]:
# Random forest, like relation: predict on the test features and tabulate
# the scores.
predictions = rfc_like.predict(X=X_test_like)
evaluate_model(predictions=predictions, actual=y_test_like)

Unnamed: 0,Measure,Score
0,Accuracy,0.865941
1,Precision,0.033157
2,Recall,0.529607
3,F1,0.062407


In [33]:
# Random forest, TFIDF + graph features: predict on the day-4 test features
# and tabulate the scores.
predictions = rfc_tfidf.predict(X=X_test_tfidf)
evaluate_model(predictions=predictions, actual=y_test_tfidf)

Unnamed: 0,Measure,Score
0,Accuracy,0.713041
1,Precision,0.436803
2,Recall,0.50573
3,F1,0.468746


### SVM evaluation

In [34]:
# SVM, common-interest relation: predict on the test features and tabulate
# the scores.
predictions = svm_com.predict(X=X_test_com)
evaluate_model(predictions=predictions, actual=y_test_com)

Unnamed: 0,Measure,Score
0,Accuracy,0.960749
1,Precision,0.09588
2,Recall,0.809486
3,F1,0.171452


In [35]:
# SVM, like relation: predict on the test features and tabulate the scores.
predictions = svm_like.predict(X=X_test_like)
evaluate_model(predictions=predictions, actual=y_test_like)

Unnamed: 0,Measure,Score
0,Accuracy,0.92605
1,Precision,0.024219
2,Recall,0.197969
3,F1,0.043158


In [36]:
# SVM, TFIDF + graph features: predict on the day-4 test features and
# tabulate the scores.
predictions = svm_tfidf.predict(X=X_test_tfidf)
evaluate_model(predictions=predictions, actual=y_test_tfidf)

Unnamed: 0,Measure,Score
0,Accuracy,0.708759
1,Precision,0.43023
2,Recall,0.503934
3,F1,0.464175


### KNN evaluation

In [37]:
# KNN, common-interest relation: predict on the test features and tabulate
# the scores.
predictions = knn_com.predict(X=X_test_com)
evaluate_model(predictions=predictions, actual=y_test_com)

Unnamed: 0,Measure,Score
0,Accuracy,0.945334
1,Precision,0.074762
2,Recall,0.869953
3,F1,0.137691


In [38]:
# KNN, like relation: predict on the test features and tabulate the scores.
predictions = knn_like.predict(X=X_test_like)
evaluate_model(predictions=predictions, actual=y_test_like)

Unnamed: 0,Measure,Score
0,Accuracy,0.900739
1,Precision,0.023845
2,Recall,0.269986
3,F1,0.043819


In [39]:
# KNN, TFIDF + graph features: predict on the day-4 test features and
# tabulate the scores.
predictions = knn_tfidf.predict(X=X_test_tfidf)
evaluate_model(predictions=predictions, actual=y_test_tfidf)

Unnamed: 0,Measure,Score
0,Accuracy,0.695014
1,Precision,0.412754
2,Recall,0.516507
3,F1,0.458838


# TODO

- Split according to days (one day for test, one for verification?)
- Combine different features
- Reformulate the learning task to predict the next day
- Grid search for params
- Cross validation
- Explainability?


See this [repo](https://github.com/neo4j-examples/link-prediction) for reference