# Predicting links in Social Networks

In [1]:
import itertools

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import *
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
import pickle

## Load data

### Like relation

In [2]:
like_1 = pd.read_csv("data/engineered_features/Like_Features_day_1.csv")
like_2 = pd.read_csv("data/engineered_features/Like_Features_day_2.csv")
like_3 = pd.read_csv("data/engineered_features/Like_Features_day_3.csv")
like_4 = pd.read_csv("data/engineered_features/Like_Features_day_4.csv")
like_5 = pd.read_csv("data/engineered_features/Like_Features_day_5.csv")

In [3]:
common_1 = pd.read_csv("data/engineered_features/common_interest_day_1.csv")
common_2 = pd.read_csv("data/engineered_features/common_interest_day_2.csv")
common_3 = pd.read_csv("data/engineered_features/common_interest_day_3.csv")
common_4 = pd.read_csv("data/engineered_features/common_interest_day_4.csv")
common_1_3 = pd.read_csv("data/engineered_features/common_interest_combined_day_1_3.csv")

### Group Relation

In [4]:
with open("data/engineered_features/G_features_postings_days_1-3.pkl", "rb") as input_file:
    G_1_3 = pickle.load(input_file)

### Posting TFIDF + graph features

In [5]:
with open("data/engineered_features/postings_tfidf_and_graph_features_only_positive_days_1to3.pkl", "rb") as input_file:
    tfidf_pos_1_3 = pickle.load(input_file)

with open("data/engineered_features/postings_tfidf_and_graph_features_only_positive_day_4_test.pkl",
          "rb") as input_file:
    tfidf_pos_4_test = pickle.load(input_file)

with open("data/engineered_features/postings_tfidf_and_graph_features_only_negative_days_1to3.pkl", "rb") as input_file:
    tfidf_neg_1_3 = pickle.load(input_file)

with open("data/engineered_features/postings_tfidf_and_graph_features_only_negative_day_4_test.pkl",
          "rb") as input_file:
    tfidf_neg_4_test = pickle.load(input_file)

## Process data for ML

### Downsample data

Experiment paramters:

In [6]:
TRAIN_SET_SPLIT_SIZE = 0.8
TEST_SET_DOWN_FACTOR = 0.3
TRAIN_SET_DOWN_FACTOR = 0.7
TRAIN_SET_NEG_OVERBALANCE = 2
PREDICTION_GOAL = "DAY_4"
#PREDICTION_GOAL = "SAME_DAY"

#Exludes long-running classifiers
QUICK_RUN = True

In [7]:
target = "label"
graph_features = [
    "jaccard_coef",
    "adamic_adar_index",
    "preferential_attachment_index",
    "clustering_coefficient_score_Source_User",
    "clustering_coefficient_score_Target_User",
    "pagerank_Source_User",
    "pagerank_Target_User"
]
tfidf_features = [ f'{i}_TFIDF' for i in range(0,500)]

Because of the huge number of nodes execution of basic data handling takes long on commodity hardware. Therefore the data sets need to be reduced in size while keeping the characteristics of the orignal data set

#### For the common interest relationship

In [8]:
if PREDICTION_GOAL == "SAME_DAY":
    com = common_1_3
    com["label"] = [1 if c > 0 else 0 for c in common_1_3["weight"]]
elif PREDICTION_GOAL == "DAY_4":
    com_4 = common_4[["source","target","weight"]]
    merged_com = pd.merge(common_1_3, com_4, on=["source","target"], how="left")
    merged_com.rename(columns= {"weight_y": "day_4_weight"}, inplace=True)
    merged_com['day_4_weight'] = merged_com['day_4_weight'].fillna(0)
    com = merged_com
    com["label"] = [1 if c > 0 else 0 for c in merged_com["day_4_weight"]]
    com.rename(columns= {"source": "Source_User", "target": "Target_User"}, inplace=True)

In [9]:
com_train, com_test = train_test_split(com, train_size=TRAIN_SET_SPLIT_SIZE)

com_test_downsample = resample(
    com_test,
    n_samples=int(com_test.shape[0] * TEST_SET_DOWN_FACTOR)
)

com_train_neg = com_train[com_train.label == 0]
com_train_pos = com_train[com_train.label == 1]

com_train_neg_downsample = resample(com_train_neg,
                              n_samples=int(com_train_pos.shape[0] * TRAIN_SET_NEG_OVERBALANCE)
                              )

com_train_resampled = resample(
    pd.concat([com_train_pos, com_train_neg_downsample]),
    n_samples=int( (com_train_pos.shape[0] + com_train_neg_downsample.shape[0]) * TRAIN_SET_DOWN_FACTOR)
)

In [10]:
com_train_resampled.shape[0]

20796

The resampled data structure still contains over 200k links (not considering train set downscaling), but now the label imbalance is corrected.

#### For the like relationship

In [11]:
if PREDICTION_GOAL == "SAME_DAY":
    like_all = pd.concat([like_1, like_2, like_3, like_4, like_5])
    like_all["label"] = [1 if c > 0 else 0 for c in like_all["current_weight"]]
elif PREDICTION_GOAL == "DAY_4":
    like_1_3 = pd.concat([like_1, like_2, like_3])

    like_4 = like_4[["Source_User","Target_User","current_weight"]]
    merged_like = pd.merge(like_1_3, like_4, on=["Source_User","Target_User"], how="left")
    merged_like.rename(columns= {"current_weight_y": "day_4_weight"}, inplace=True)
    merged_like['day_4_weight'] = merged_like['day_4_weight'].fillna(0)
    like_all = merged_like
    like_all["label"] = [1 if c > 0 else 0 for c in like_all["day_4_weight"]]

In [12]:
like_train, like_test = train_test_split(like_all, train_size=TRAIN_SET_SPLIT_SIZE)


like_test_downsampled = resample(
    like_test,
    n_samples=int(like_test.shape[0] * TEST_SET_DOWN_FACTOR)
)

like_train_neg = like_train[like_train.label == 0]
like_train_pos = like_train[like_train.label == 1]

like_train_neg_downsampled = resample(like_train_neg,
                                n_samples=int(like_train_pos.shape[0] * TRAIN_SET_NEG_OVERBALANCE)
                                )
like_train_resampled = resample(
    pd.concat([like_train_pos, like_train_neg_downsampled]),
    n_samples=int( (like_train_pos.shape[0] + like_train_neg_downsampled.shape[0]) * TRAIN_SET_DOWN_FACTOR)
)

In [13]:
like_train_resampled.shape[0]

43541

The resampled data structure still contains over 400k (not considering train set downscaling) links, but now the label imbalance is corrected.

#### For group relation using TFIDF


In [14]:
if PREDICTION_GOAL == "SAME_DAY":
    tfidf_all_train = pd.concat([tfidf_neg_1_3, tfidf_pos_1_3])
    tfidf_all_test = pd.concat([tfidf_neg_4_test, tfidf_pos_4_test])
    tfidf_all = pd.concat([tfidf_all_train, tfidf_all_test])
elif PREDICTION_GOAL == "DAY_4":
    tfidf_1_3 = pd.concat([tfidf_neg_1_3, tfidf_pos_1_3])
    tfidf_4 = pd.concat([tfidf_neg_4_test, tfidf_pos_4_test])

    tfidf_4 = tfidf_4[["Source_User","Target_User","label"]]

    merged_tfidf = pd.merge(tfidf_1_3, tfidf_4, on=["Source_User","Target_User"], how="left")
    merged_tfidf.rename(columns= {"label_y": "label"}, inplace=True)
    merged_tfidf['label'] = merged_tfidf['label'].fillna(0)
    tfidf_all = merged_tfidf

In [15]:
tfidf_train, tfidf_test = train_test_split(tfidf_all, train_size=TRAIN_SET_SPLIT_SIZE)


tfidf_test_downsampled = resample(
    tfidf_test,
    n_samples=int(tfidf_test.shape[0] * TEST_SET_DOWN_FACTOR)
)

tfidf_train_neg = tfidf_train[tfidf_train.label == 0]
tfidf_train_pos = tfidf_train[tfidf_train.label == 1]

tfidf_train_neg_downsampled = resample(tfidf_train_neg,
                                n_samples=int(tfidf_train_pos.shape[0] * TRAIN_SET_NEG_OVERBALANCE)
                                )
tfidf_train_resampled = resample(
    pd.concat([tfidf_train_pos, tfidf_train_neg_downsampled]),
    n_samples=int( (tfidf_train_pos.shape[0] + tfidf_train_neg_downsampled.shape[0]) * TRAIN_SET_DOWN_FACTOR)
)

In [16]:
tfidf_train_resampled.shape[0]

5743

The resampled data structure still contains over 90k (not considering train set downscaling) links, but now the label imbalance is corrected.

## Combining the link types

Using the largest network (the **like** link) as a basis.

In [17]:
com.shape

(9718806, 13)

In [18]:
tfidf_all.shape

(184360, 513)

In [19]:
like_all.shape

(14669079, 14)

In [20]:
like_and_com = pd.merge(like_all, com, on=["Source_User","Target_User"], how="left", suffixes=("_like", "_com"))

In [21]:
selected_features = ["Source_User", "Target_User"] + graph_features + [target]
commented_all = tfidf_all[selected_features]
commented_all.rename(
    columns = dict(zip(selected_features[2:], [f + "_commented" for f in selected_features[2:]])),
    inplace=True
)
full_combined = pd.merge(like_and_com, commented_all, on=["Source_User","Target_User"], how="left")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


### Defining the new link

The **combined** link will be present if at least 1 out of the 3 links are present. If 2 out of 3 are chosen, less than 100 links will be present

In [22]:
full_combined["labels_sum"] = full_combined["label_com"] + full_combined['label_like'] + full_combined['label_commented']
full_combined["label"] = [1 if v >= 1 else 0 for v in full_combined["labels_sum"]]

In [23]:
full_combined["label"].sum()

1917

In [24]:
full_combined.fillna(0, inplace=True)
full_train, full_test = train_test_split(full_combined, train_size=TRAIN_SET_SPLIT_SIZE)


full_test_downsampled = resample(
    full_test,
    n_samples=int(full_test.shape[0] * TEST_SET_DOWN_FACTOR)
)

full_train_neg = full_train[full_train.label == 0]
full_train_pos = full_train[full_train.label == 1]

full_train_neg_downsampled = resample(full_train_neg,
                                n_samples=int(full_train_pos.shape[0] * TRAIN_SET_NEG_OVERBALANCE)
                                )
full_train_resampled = resample(
    pd.concat([full_train_pos, full_train_neg_downsampled]),
    n_samples=int( (full_train_pos.shape[0] + full_train_neg_downsampled.shape[0]) * TRAIN_SET_DOWN_FACTOR)
)

## Create training data

In [25]:
full_graph_features = list(itertools.chain.from_iterable( [[f + "_like", f + "_com", f + "_commented"] for f in graph_features]))

full_graph_features

['jaccard_coef_like',
 'jaccard_coef_com',
 'jaccard_coef_commented',
 'adamic_adar_index_like',
 'adamic_adar_index_com',
 'adamic_adar_index_commented',
 'preferential_attachment_index_like',
 'preferential_attachment_index_com',
 'preferential_attachment_index_commented',
 'clustering_coefficient_score_Source_User_like',
 'clustering_coefficient_score_Source_User_com',
 'clustering_coefficient_score_Source_User_commented',
 'clustering_coefficient_score_Target_User_like',
 'clustering_coefficient_score_Target_User_com',
 'clustering_coefficient_score_Target_User_commented',
 'pagerank_Source_User_like',
 'pagerank_Source_User_com',
 'pagerank_Source_User_commented',
 'pagerank_Target_User_like',
 'pagerank_Target_User_com',
 'pagerank_Target_User_commented']

In [26]:
X_train_com = com_train_resampled[graph_features]
y_train_com = com_train_resampled[target]

X_train_like = like_train_resampled[graph_features]
y_train_like = like_train_resampled[target]

X_train_tfidf = tfidf_train_resampled[graph_features + tfidf_features]
y_train_tfidf = tfidf_train_resampled[target]

X_train_full = full_train_resampled[full_graph_features]
y_train_full = full_train_resampled[target]

In [27]:
rfc_com = RandomForestClassifier(n_estimators=50, max_depth=40, random_state=0)
svm_com = SVC()
knn_com = KNeighborsClassifier(n_neighbors=10)

rfc_like = RandomForestClassifier(n_estimators=50, max_depth=40, random_state=0)
svm_like = SVC()
knn_like = KNeighborsClassifier(n_neighbors=10)

rfc_tfidf = RandomForestClassifier(n_estimators=50, max_depth=40, random_state=0)
svm_tfidf = SVC()
knn_tfidf = KNeighborsClassifier(n_neighbors=10)

rfc_full = RandomForestClassifier(n_estimators=50, max_depth=40, random_state=0)
svm_full = SVC()
knn_full = KNeighborsClassifier(n_neighbors=10)

## Define evaluation function

In [28]:
def evaluate_model(predictions, actual):
    return pd.DataFrame({
        "Measure": ["Accuracy", "Precision", "Recall", "F1"],
        "Score": [accuracy_score(actual, predictions),
                  precision_score(actual, predictions),
                  recall_score(actual, predictions),
                  f1_score(actual, predictions)]
    }).transpose()

## Fit classifiers

### RF

In [29]:
rfc_com.fit(X_train_com, y_train_com)

RandomForestClassifier(max_depth=40, n_estimators=50, random_state=0)

In [30]:
rfc_like.fit(X_train_like, y_train_like)

RandomForestClassifier(max_depth=40, n_estimators=50, random_state=0)

In [31]:
rfc_tfidf.fit(X_train_tfidf, y_train_tfidf)

RandomForestClassifier(max_depth=40, n_estimators=50, random_state=0)

In [32]:
rfc_full.fit(X_train_full, y_train_full)

RandomForestClassifier(max_depth=40, n_estimators=50, random_state=0)

### SVM

In [33]:
if not QUICK_RUN:
    svm_com.fit(X_train_com, y_train_com)

In [34]:
if not QUICK_RUN:
    svm_like.fit(X_train_like, y_train_like)

In [35]:
if not QUICK_RUN:
    svm_tfidf.fit(X_train_tfidf, y_train_tfidf)

In [36]:
if not QUICK_RUN:
    svm_full.fit(X_train_full, y_train_full)

### KNN

In [37]:
knn_com.fit(X_train_com, y_train_com)

KNeighborsClassifier(n_neighbors=10)

In [38]:
knn_like.fit(X_train_like, y_train_like)

KNeighborsClassifier(n_neighbors=10)

In [39]:
knn_tfidf.fit(X_train_tfidf, y_train_tfidf)

KNeighborsClassifier(n_neighbors=10)

In [40]:
knn_full.fit(X_train_full, y_train_full)

KNeighborsClassifier(n_neighbors=10)

## Evaluate predictions

In [41]:
X_test_com = com_test_downsample[graph_features]
y_test_com = com_test_downsample[target]

In [42]:
X_test_like = like_test_downsampled[graph_features]
y_test_like = like_test_downsampled[target]

In [43]:
X_test_tfidf = tfidf_test_downsampled[graph_features + tfidf_features]
y_test_tfidf = tfidf_test_downsampled[target]

In [44]:
X_test_full = full_test_downsampled[full_graph_features]
y_test_full = full_test_downsampled[target]

### RF evaluation

In [45]:
predictions = rfc_com.predict(X_test_com)
evaluate_model(predictions, y_test_com)

Unnamed: 0,0,1,2,3
Measure,Accuracy,Precision,Recall,F1
Score,0.895402,0.007613,0.64993,0.01505


In [46]:
predictions = rfc_like.predict(X_test_like)
evaluate_model(predictions, y_test_like)

Unnamed: 0,0,1,2,3
Measure,Accuracy,Precision,Recall,F1
Score,0.862671,0.005529,0.425383,0.010916


In [47]:
predictions = rfc_tfidf.predict(X_test_tfidf)
evaluate_model(predictions, y_test_tfidf)

Unnamed: 0,0,1,2,3
Measure,Accuracy,Precision,Recall,F1
Score,0.963385,0.326087,0.84507,0.470588


In [48]:
predictions = rfc_full.predict(X_test_full)
evaluate_model(predictions, y_test_full)

Unnamed: 0,0,1,2,3
Measure,Accuracy,Precision,Recall,F1
Score,0.997732,0.053529,1.0,0.101619


### SVM evaluation

In [49]:
if not QUICK_RUN:
    predictions = svm_com.predict(X_test_com)
    evaluate_model(predictions, y_test_com)

In [50]:
if not QUICK_RUN:
    predictions = svm_like.predict(X_test_like)
    evaluate_model(predictions, y_test_like)

In [51]:
if not QUICK_RUN:
    predictions = svm_tfidf.predict(X_test_tfidf)
    evaluate_model(predictions, y_test_tfidf)

In [52]:
if not QUICK_RUN:
    predictions = svm_full.predict(X_test_full)
    evaluate_model(predictions, y_test_tfidf)

### KNN evaluation

In [53]:
predictions = knn_com.predict(X_test_com)
evaluate_model(predictions, y_test_com)

Unnamed: 0,0,1,2,3
Measure,Accuracy,Precision,Recall,F1
Score,0.880409,0.005134,0.499303,0.010163


In [54]:
predictions = knn_like.predict(X_test_like)
evaluate_model(predictions, y_test_like)

Unnamed: 0,0,1,2,3
Measure,Accuracy,Precision,Recall,F1
Score,0.845504,0.00348,0.300383,0.00688


In [55]:
predictions = knn_tfidf.predict(X_test_tfidf)
evaluate_model(predictions, y_test_tfidf)

Unnamed: 0,0,1,2,3
Measure,Accuracy,Precision,Recall,F1
Score,0.924329,0.178351,0.812207,0.292477


In [56]:
predictions = knn_full.predict(X_test_full)
evaluate_model(predictions, y_test_full)

Unnamed: 0,0,1,2,3
Measure,Accuracy,Precision,Recall,F1
Score,0.986773,0.00902,0.938053,0.017868


## Check Train set imbalance

### COM

In [57]:
(y_train_com == 0).sum()/y_train_com.shape[0]

0.6620503943065974

In [58]:
(y_test_com == 0).sum()/y_test_com.shape[0]

0.9987704243322221

### LIKE

In [59]:
(y_train_like == 0).sum()/y_train_like.shape[0]

0.6660618727176684

In [60]:
(y_test_like == 0).sum()/y_test_like.shape[0]

0.9982184733407261

### TFID

In [61]:
(y_train_tfidf == 0).sum()/y_train_tfidf.shape[0]

0.6665505833188229

In [62]:
(y_test_tfidf == 0).sum()/y_test_tfidf.shape[0]

0.9807431516137781

### FULL


In [63]:
(y_train_full == 0).sum()/y_train_full.shape[0]

0.656

In [64]:
(y_test_full == 0).sum()/y_test_full.shape[0]

0.9998717321494854