In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from node2vec import Node2Vec

from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing, feature_extraction
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.manifold import TSNE


In [2]:
# Load the full graph from its GML file.
gml_path = 'datasets/verified.gml'
G = nx.read_gml(gml_path)

In [3]:
# (node count, edge count) of the full graph.
nx.number_of_nodes(G), nx.number_of_edges(G)

(2901, 148998)

In [4]:
# Per-account feature table; later cells read its `name` and `fake` columns.
features_csv = "datasets/verified_features_all_300k.csv"
df = pd.read_csv(features_csv)


In [5]:
# Preview the first five rows of the feature table.
df.head(n=5)

Unnamed: 0,uid,name,friends_count,followers_count,listed_count,statuses_count,fake,desc_subjectivity,desc_polarity,tweets_subjectivity,...,weapon_empath,children_empath,monster_empath,ocean_empath,giving_empath,contentment_empath,writing_empath,rural_empath,positive_emotion_empath,musical_empath
0,1172796973,PTorresTV,548,1117,25,12136,0,0.0,0.0,0.31253,...,0.002391,0.021099,0.001657,0.00192,0.003702,0.000585,0.002335,0.00246,0.007839,0.011049
1,1070586199,EOnlineUK,353,102733,316,60989,0,0.0,0.0,0.301738,...,0.002308,0.021205,0.002619,0.002023,0.005247,0.000557,0.002802,0.002584,0.009525,0.013514
2,25589776,people,1751,7565880,33588,267090,1,0.0,0.0,0.324708,...,0.002769,0.032744,0.000474,0.001301,0.002765,0.000771,0.004536,0.005398,0.009638,0.008561
3,1367531,FoxNews,398,18416786,65345,418052,0,0.0,0.0,0.214538,...,0.002891,0.004289,0.001988,0.001589,0.00192,0.000191,0.003285,0.004726,0.002932,0.018821
4,20012204,usweekly,1177,2181224,11975,188328,1,0.9,0.5,0.264836,...,0.00075,0.035524,0.000372,0.002707,0.002477,0.00055,0.005693,0.001944,0.007702,0.008623


In [6]:
# Account names present in the feature table, as a plain Python list.
names = df['name'].tolist()

In [7]:
# Restrict the graph to the nodes that also appear in the feature table.
sub_g = nx.subgraph(G, names)

In [8]:
# (node count, edge count) of the restricted graph.
nx.number_of_nodes(sub_g), nx.number_of_edges(sub_g)

(2900, 148998)

In [9]:
# Keep only the feature rows whose account name is a node of the subgraph,
# then summarise the filtered frame.
nodes = list(sub_g)
df = df.loc[df['name'].isin(nodes)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2900 entries, 0 to 2901
Columns: 205 entries, uid to musical_empath
dtypes: float64(198), int64(6), object(1)
memory usage: 4.6+ MB


In [None]:
# Configure node2vec on the filtered subgraph (128-dimensional embeddings).
node2vec = Node2Vec(
    sub_g,
    dimensions=128,
    walk_length=40,
    num_walks=200,
    workers=3,
    p=0.5,
    q=3,
)


Computing transition probabilities: 100%|██████████| 2900/2900 [01:44<00:00, 26.07it/s]


In [None]:
# Train the embedding model and pull out one vector per node.
vmodel = node2vec.fit()

# Go through the KeyedVectors object (`vmodel.wv`) instead of indexing the
# model directly: `vmodel[...]` is deprecated in gensim 3.x and gone in 4.x.
wv = vmodel.wv

# gensim >= 4 exposes the vocabulary as `index_to_key`; gensim 3.x only has
# the `vocab` dict. Support both so the cell runs on either version.
vocab = list(wv.index_to_key) if hasattr(wv, 'index_to_key') else list(wv.vocab)

# Row i of X_v is the embedding of vocab[i].
X_v = wv[vocab]

In [None]:
# Collect the `fake` label of every embedded node, in the same order as `vocab`.
#
# The name -> label lookup is built once, so `df` is not rescanned with a
# boolean mask for every node, and each lookup yields a scalar (calling int()
# on a one-row Series is deprecated in recent pandas).
fake_by_name = df.set_index('name')['fake']

# A duplicated name would make the label ambiguous; fail loudly instead.
if not fake_by_name.index.is_unique:
    raise ValueError("duplicate values in df['name']; node labels would be ambiguous")

# A node missing from `df` raises KeyError naming the offending node.
target = [int(fake_by_name.loc[x]) for x in vocab]

In [None]:
# Project the embeddings to 2-D for plotting. `random_state` pins t-SNE's own
# randomness so re-running this cell on the same X_v gives the same layout
# (42 is an arbitrary fixed seed).
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_v)

In [None]:
# Wrap the two t-SNE coordinates in a frame with named columns for plotting.
dfg = pd.DataFrame(data=X_tsne, columns=['x', 'y'])


In [None]:
# Scatter the 2-D t-SNE coordinates, coloured by each node's `fake` label.
fig, ax = plt.subplots(figsize=(10, 10))

# Red marks nodes whose label is 1; every other label is drawn in blue.
color = ['red' if l == 1 else 'blue' for l in target]
ax.scatter(dfg['x'], dfg['y'], color=color)

# Empty proxy series: they draw nothing but give the legend one entry per colour.
ax.scatter([], [], color='red', label='fake = 1')
ax.scatter([], [], color='blue', label='fake != 1')

# A figure should stand alone when the notebook is skimmed.
ax.set_title('t-SNE projection of node2vec embeddings')
ax.set_xlabel('t-SNE x')
ax.set_ylabel('t-SNE y')
ax.legend()

plt.show()