In [1]:
import math
import pandas as pd
import networkx as nx

from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the message edge list. Node ids are read as strings and the three
# topic columns as floats (the float dtype lets absent topics be NaN).
Messages = pd.read_csv(
    "Messages.csv",
    dtype={
        "source": str,
        "target": str,
        "Text topic 1": float,
        "Text topic 2": float,
        "Text topic 3": float,
    },
)
# Node table; later cells read its 'Node #' and 'Name' columns.
Names = pd.read_csv("Names.csv")

In [3]:
# Drop rows in which every column is missing.
Messages = Messages.dropna(axis=0, how='all')
# Cast only the node-id columns to int. The previous blanket
# `astype(int, errors='ignore')` relied on the topic columns (which contain
# NaN) silently failing to convert; naming the columns makes the intent
# explicit and raises if an id is not a valid integer instead of quietly
# leaving string ids that would not match the integer keys of the name lookup.
# NOTE(review): assumes Names['Node #'] is integer-typed -- confirm.
Messages = Messages.astype({"source": int, "target": int})

In [4]:
# Lookup from node number to display name, built from the Names table.
node_to_name = dict(zip(Names['Node #'], Names['Name']))
# Add 'source_name' and 'target_name' columns to Messages; ids that have no
# entry in the lookup fall back to the raw id value.
for endpoint in ('source', 'target'):
    Messages[endpoint + '_name'] = Messages[endpoint].map(node_to_name).fillna(Messages[endpoint])

In [5]:
def transform_values(row,
                     topic_columns=('Text topic 1', 'Text topic 2', 'Text topic 3'),
                     suspicious_topics=(7, 11, 13)):
    """Flag a message as suspicious when any of its topics is suspicious.

    The original seven-branch if/elif chain enumerated every combination in
    which at least one of the three topic columns held a suspicious id, which
    is exactly "any column matches".

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must support ``row[column]`` for every name in ``topic_columns``.
    topic_columns : sequence of str, optional
        Columns to inspect; defaults to the three text-topic columns.
    suspicious_topics : collection, optional
        Topic ids treated as suspicious; defaults to ``(7, 11, 13)``.

    Returns
    -------
    int
        1 if at least one inspected column holds a suspicious topic, else 0.
        NaN (missing topic) never matches, so it counts as not suspicious.
    """
    return int(any(row[column] in suspicious_topics for column in topic_columns))

In [6]:
# Label every message row: 1 when transform_values flags it, 0 otherwise.
Messages['Combined'] = Messages.apply(transform_values, axis='columns')

In [7]:
# Features: the three topic columns. Target: the 'Combined' flag, which was
# itself computed from these same three columns by transform_values.
X = Messages.loc[:, ['Text topic 1', 'Text topic 2', 'Text topic 3']]
Y = Messages.loc[:, 'Combined']

In [8]:
# Fill missing topics with 0 so the estimators receive no NaN; 0 is not one
# of the suspicious topic ids (7, 11, 13) used by transform_values.
X = X.fillna(value=0)

In [9]:
# Class balance of the 0/1 'Combined' label: summing counts the ones, and the
# remaining rows are zeros.
count_ones = Messages['Combined'].sum()
count_zeros = Messages.shape[0] - count_ones
print("Ones: ", count_ones)
print("Zeros: ", count_zeros)

Ones:  97
Zeros:  303


In [10]:
# Rebalance the classes by random oversampling (seeded for reproducibility)
# before fitting the classifiers below.
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X=X, y=Y)

## Logistic Regression

In [11]:
# Fit logistic regression on the oversampled data, then predict on X -- the
# original (un-resampled) rows the training set was drawn from, not a
# held-out split.
log_clf = LogisticRegression(random_state=0)
log_clf.fit(X_resampled, y_resampled)
pred = log_clf.predict(X)
prob = log_clf.predict_proba(X)

In [12]:
# Accuracy of the logistic predictions against the labels of the same rows
# the model was trained from (no held-out data).
accuracy = accuracy_score(y_true=Y, y_pred=pred)
print("Accuracy:", accuracy)

Accuracy: 0.675


In [13]:
# Display the fitted logistic-regression coefficients (one per topic column,
# in the column order of X) and the intercept.
print("Coefficients:", log_clf.coef_)
print("Intercept:", log_clf.intercept_)

Coefficients: [[0.21140468 0.0878154  0.12038467]]
Intercept: [-1.969912]


## Random Forest Classifier

In [14]:
# Fit a depth-limited random forest on the oversampled data and score it on
# X, the same rows the training set was drawn from. The label is a
# deterministic function of these features, so this accuracy is not an
# estimate of performance on unseen data.
r_clf = RandomForestClassifier(max_depth=7, random_state=42)
r_clf.fit(X_resampled, y_resampled)
pred_r = r_clf.predict(X)
prob_r = r_clf.predict_proba(X)
accuracy = accuracy_score(y_true=Y, y_pred=pred_r)
print("Accuracy:", accuracy)

Accuracy: 1.0


## Adding Predicted Probabilities as Edge Weights

In [15]:
# Second column of predict_proba: the random forest's probability for the
# second class (label 1 when both labels 0 and 1 are present in the training data).
# Note that despite the column name 'log_weights', these values come from the
# random forest (prob_r), not from the logistic model.
Probabilty_of_being_sus = prob_r[:, 1]
# Plain assignment creates the column or overwrites an existing one, so the
# cell is safe to re-run without the earlier drop-then-recreate branch.
Messages['log_weights'] = Probabilty_of_being_sus
# Replace exact zeros with a small positive value -- presumably so that no
# edge ends up with zero weight in the graph built later; confirm intent.
Messages['log_weights'] = Messages['log_weights'].replace(0, 0.01)

In [16]:
# Display the frame to inspect the added name, label and weight columns.
Messages

Unnamed: 0,source,target,Text topic 1,Text topic 2,Text topic 3,source_name,target_name,Combined,log_weights
0,19,24,4.0,,,Kristine,Franklin,0,0.010000
1,78,3,12.0,,,Este,Sherri,0,0.408692
2,43,6,10.0,,,Paul,Patrick,0,0.124054
3,45,29,15.0,,,Lois,Wayne,0,0.010000
4,26,8,6.0,,,Marian,Hazel,0,0.010000
...,...,...,...,...,...,...,...,...,...
395,42,23,15.0,,,Katherine,Wesley,0,0.010000
396,54,21,7.0,11.0,13.0,Ulf,Alex,1,0.994615
397,68,0,1.0,,,Ellin,Chris,0,0.010000
398,0,2,14.0,,,Chris,Paige,0,0.010000


In [17]:
# Persist the enriched message table, leaving out the DataFrame index.
Messages.to_csv(path_or_buf='output_file.csv', index=False)

In [18]:
# Directed multigraph: one edge per message row, so repeated messages between
# the same pair of people are kept as parallel edges.
G = nx.MultiDiGraph()
# Walk the three relevant columns in lockstep instead of materialising a
# Series per row; each edge carries the row's 'log_weights' value as 'weight'.
for source, target, weight in zip(Messages['source_name'],
                                  Messages['target_name'],
                                  Messages['log_weights']):
    G.add_edge(source, target, weight=weight)

In [19]:
# Write the weighted message graph to a GEXF file.
nx.write_gexf(G, "graph_file.gexf")

In [20]:
# Create a graph
#G = nx.DiGraph()

# Add nodes and edges from DataFrame
#for _, row in Messages.iterrows():
#    source = row['source']
#    target = row['target']
 #   G.add_edge(source, target, weight1=row['Text topic 1'], weight2=row['Text topic 2'], weight3=row['Text topic 3'])