In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [44]:
from tensorflow_hub import load as tensorflow_hub_load
from services.text_embedding_service import TextEmbeddingService
from os import getenv

text_embedding_service = TextEmbeddingService(
    text_embedding_model=tensorflow_hub_load(
        getenv("UNIVERSAL_SENTENCE_ENCODER_PATH")
    )
)

2024-06-11 08:02:44.431943: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-11 08:02:44.482073: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-11 08:02:44.638023: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [62]:
from umap import UMAP
from services.dimensionality_reduction_service import DimensionalityReductionService

n_dimensions = 20
dimensionality_reduction_service = DimensionalityReductionService(
    UMAP(min_dist=0.25, metric="cosine", n_components=n_dimensions)
)

In [63]:
from sklearn.discriminant_analysis import StandardScaler
scaler = StandardScaler()

In [64]:
from typing import Iterable
from numpy import float64
import pandas as pd
from numpy.typing import NDArray

def add_text_embeddings(df: pd.DataFrame, column_key: str):
    embeddings = dimensionality_reduction_service.reduce_dimensions(
        text_embedding_service.embed_sentences(df[column_key]),
    )

    df.drop(columns=[column_key], inplace=True)
    for i in range(n_dimensions):
        df[f"dimension_{i}"] = embeddings.T[i]

def add_user_interactions_into_df(df:pd.DataFrame, interactions: Iterable[bool], column_name: str):
    df[column_name] = interactions

def scale(df: pd.DataFrame) -> NDArray[float64]:
    return scaler.fit_transform(df)



In [65]:
import pandas as pd

df = pd.read_csv("data/posts.csv")
df.head()

Unnamed: 0,author_name,author_title,author_company,post_text
0,Sarah Jones,Software Engineer,Google,Just spent an hour debugging a semicolon. 🤦‍♀️...
1,David Lee,Marketing Manager,Acme Corp,Excited to announce the launch of our new prod...
2,Emily Chen,Data Scientist,Amazon,Did you know that the average person spends 6 ...
3,Michael Rodriguez,CEO,Startup Inc.,Building a company is like building a house - ...
4,Jessica Williams,Professor,University of California,My students are brilliant! So impressed by the...


In [66]:
df_user_interactions = pd.read_csv("data/posts_interactions.csv")
df_user_interactions.head()

Unnamed: 0.1,Unnamed: 0,author_name,author_title,author_company,post_text,Interacted
0,0,Sarah Jones,Software Engineer,Google,Just spent an hour debugging a semicolon. 🤦‍♀️...,False
1,1,David Lee,Marketing Manager,Acme Corp,Excited to announce the launch of our new prod...,False
2,2,Emily Chen,Data Scientist,Amazon,Did you know that the average person spends 6 ...,False
3,3,Michael Rodriguez,CEO,Startup Inc.,Building a company is like building a house - ...,False
4,4,Jessica Williams,Professor,University of California,My students are brilliant! So impressed by the...,True


In [67]:
add_text_embeddings(df, "post_text")
df.head()

Unnamed: 0,author_name,author_title,author_company,dimension_0,dimension_1,dimension_2,dimension_3,dimension_4,dimension_5,dimension_6,...,dimension_10,dimension_11,dimension_12,dimension_13,dimension_14,dimension_15,dimension_16,dimension_17,dimension_18,dimension_19
0,Sarah Jones,Software Engineer,Google,10.168714,2.777392,3.747933,4.988751,6.038089,4.753548,4.202432,...,5.241844,5.148439,6.272185,4.627784,5.12077,6.406745,3.000002,5.693073,4.145835,5.266862
1,David Lee,Marketing Manager,Acme Corp,10.228584,2.408725,4.074613,5.097498,5.762329,5.217642,5.903861,...,4.230959,2.718131,6.722778,4.875412,4.373396,9.150442,3.929162,5.097658,4.933164,5.774004
2,Emily Chen,Data Scientist,Amazon,10.318771,2.406589,3.894036,4.988521,5.749007,5.098168,4.901739,...,4.808472,4.002949,6.286886,4.756927,4.777421,7.205676,3.524927,5.596407,4.517416,5.490468
3,Michael Rodriguez,CEO,Startup Inc.,11.158389,2.333982,2.062802,4.743351,5.121155,3.587501,1.172571,...,4.142912,8.141905,2.971933,6.938706,-1.122008,2.467715,1.716898,7.774052,0.974569,3.684078
4,Jessica Williams,Professor,University of California,10.203452,3.458994,4.845674,6.203872,6.533197,8.96515,8.059004,...,10.463759,7.76958,7.366114,1.651276,1.236165,4.601607,6.948806,5.223161,6.772198,5.85472


In [69]:
add_user_interactions_into_df(df, pd.Series(df_user_interactions["Interacted"]), "Interacted")
df.head()

Unnamed: 0,author_name,author_title,author_company,dimension_0,dimension_1,dimension_2,dimension_3,dimension_4,dimension_5,dimension_6,...,dimension_11,dimension_12,dimension_13,dimension_14,dimension_15,dimension_16,dimension_17,dimension_18,dimension_19,Interacted
0,Sarah Jones,Software Engineer,Google,10.168714,2.777392,3.747933,4.988751,6.038089,4.753548,4.202432,...,5.148439,6.272185,4.627784,5.12077,6.406745,3.000002,5.693073,4.145835,5.266862,False
1,David Lee,Marketing Manager,Acme Corp,10.228584,2.408725,4.074613,5.097498,5.762329,5.217642,5.903861,...,2.718131,6.722778,4.875412,4.373396,9.150442,3.929162,5.097658,4.933164,5.774004,False
2,Emily Chen,Data Scientist,Amazon,10.318771,2.406589,3.894036,4.988521,5.749007,5.098168,4.901739,...,4.002949,6.286886,4.756927,4.777421,7.205676,3.524927,5.596407,4.517416,5.490468,False
3,Michael Rodriguez,CEO,Startup Inc.,11.158389,2.333982,2.062802,4.743351,5.121155,3.587501,1.172571,...,8.141905,2.971933,6.938706,-1.122008,2.467715,1.716898,7.774052,0.974569,3.684078,False
4,Jessica Williams,Professor,University of California,10.203452,3.458994,4.845674,6.203872,6.533197,8.96515,8.059004,...,7.76958,7.366114,1.651276,1.236165,4.601607,6.948806,5.223161,6.772198,5.85472,True


In [71]:
df.drop(columns=["author_name", "author_title", "author_company"], inplace=True)

In [72]:
df.describe()

Unnamed: 0,dimension_0,dimension_1,dimension_2,dimension_3,dimension_4,dimension_5,dimension_6,dimension_7,dimension_8,dimension_9,dimension_10,dimension_11,dimension_12,dimension_13,dimension_14,dimension_15,dimension_16,dimension_17,dimension_18,dimension_19
count,902.0,902.0,902.0,902.0,902.0,902.0,902.0,902.0,902.0,902.0,902.0,902.0,902.0,902.0,902.0,902.0,902.0,902.0,902.0,902.0
mean,9.633655,2.681563,4.61206,5.06059,5.553674,5.027977,4.823102,5.144323,5.062772,5.45499,4.556863,4.766572,5.874911,5.034471,4.71391,5.446944,3.846839,6.278185,4.841409,5.447987
std,2.367962,2.960712,1.669056,1.552986,2.908724,2.436634,2.648308,2.840439,2.652582,3.054288,1.933296,2.438545,2.297733,2.015631,2.024687,2.377186,2.133284,1.924883,1.731001,1.93418
min,-2.045631,-1.192314,0.638206,-4.178039,-6.733346,-0.617318,-4.762115,-2.782787,-0.738177,-1.871897,-3.015227,-0.579527,0.714077,1.114642,-1.166062,0.477867,0.746944,1.492681,-0.896926,0.392058
25%,9.400411,1.115475,3.557266,4.594366,5.172184,3.799267,3.492299,3.161323,3.064967,3.507892,3.370864,2.823171,4.377802,3.639019,3.714993,4.010773,2.491584,5.029892,4.151281,4.513307
50%,10.124517,2.07856,4.481174,5.044285,5.976433,4.958012,5.425535,4.143682,5.08268,5.245857,4.445907,4.877848,5.847049,4.872173,4.975296,4.925544,3.54951,6.160887,4.955115,5.426115
75%,10.63429,3.416221,5.459565,5.790348,6.681044,5.719811,6.770251,7.428535,6.509143,7.752022,5.260657,6.692025,7.16572,6.471233,5.746919,6.28135,4.527717,7.725635,5.866799,6.233244
max,12.682452,14.804668,12.206773,8.224972,10.181302,15.212307,9.340388,10.374914,11.747494,13.121087,10.73772,9.995883,11.337955,10.790162,10.777661,11.165182,10.628144,10.56319,10.199448,11.743885


In [73]:
from sklearn.model_selection import train_test_split
from numpy import float64


Y: "pd.Series[bool]" = df.pop("Interacted")
X = scale(df)
splitted_data: list[NDArray[float64]] = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=26,
)
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = splitted_data

In [74]:
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(hidden_layer_sizes=(12, 12, 12, 12), max_iter=600, random_state=26)

In [75]:
model.fit(X_TRAIN, Y_TRAIN)

In [76]:
Y_TEST_PRED = model.predict(X_TEST)

In [77]:
from numpy import sqrt
from sklearn.metrics import confusion_matrix


conf_mat = confusion_matrix(Y_TEST, Y_TEST_PRED)

TN, FP, FN, TP = conf_mat.ravel()

accuracy = (TP + TN) / (TP + FP + TN + FN)
recall = TP / (TP + FN)
specificity = TN / (FP + TN)
precision = TP / (TP + FP)
mcc = ((TP * TN) - (FP * FN)) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
f1_score_ = (2 * precision * recall) / (precision + recall)

print('MODEL ASSESSMENT RF')
print('\nConfusion Matrix\n', conf_mat, end='\n\n')
print('            Accuracy : ', accuracy)
print('Recall / Sensitivity : ', recall)
print('         Specificity : ', specificity)
print('           Precision : ', precision)
print('                 MCC : ', mcc)
print('            F1-Score : ', f1_score_)

MODEL ASSESSMENT RF

Confusion Matrix
 [[164  19]
 [ 47  41]]

            Accuracy :  0.7564575645756457
Recall / Sensitivity :  0.4659090909090909
         Specificity :  0.8961748633879781
           Precision :  0.6833333333333333
                 MCC :  0.408375150598176
            F1-Score :  0.5540540540540541
