In [12]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
df = pd.read_excel('/content/All_coding_anonimized data scriptie.xlsx')

In [3]:
df['Text'].replace('', np.nan, inplace=True)
df.dropna(subset=['Topic'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Text'].replace('', np.nan, inplace=True)


In [4]:
def binary_topic(topic):
  if topic == 5:
    return 1
  else:
    return 0

In [5]:
y = df['Topic'].apply(binary_topic)

In [6]:
X = df['Text']

In [7]:
vectorizer = TfidfVectorizer(max_features=1500)
X = vectorizer.fit_transform(df['Text'])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
model = LogisticRegression()

In [10]:
model.fit(X_train, y_train)

In [13]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.43      0.56        67
           1       0.81      0.96      0.88       166

    accuracy                           0.81       233
   macro avg       0.81      0.70      0.72       233
weighted avg       0.81      0.81      0.79       233



In [14]:
feature_names = vectorizer.get_feature_names_out()
importances_personal = model.coef_[0]
indices_personal = np.argsort(importances_personal)[-50:]
print('Top key features for personal class:')
for i in indices_personal:
    print(feature_names[i], importances_personal[i])

Top key features for personal class:
binnen 0.62587215463933
dan 0.6282145871073584
doe 0.6296162228612595
gelukkig 0.6363008109142867
me 0.6480800193915124
had 0.6518189760573533
auw 0.651981225143435
blij 0.6538912389760595
keer 0.6618427088480775
goede 0.6623419615024524
negatief 0.6778581821560269
maar 0.6886800037554912
gevonden 0.6913366632556152
er 0.693396370163153
altijd 0.696614884010844
nu 0.6984537783225945
na 0.7037429261231323
was 0.7099648351278437
echo 0.7241522700708508
weken 0.726168509358288
pijn 0.7450436952325636
mat 0.7633823736393118
brief 0.7762474349570323
uitnodiging 0.790872396463593
mijn 0.8028968574385603
ziekenhuis 0.8055037292871294
naar 0.8057105097544517
huisarts 0.8227713791126152
belangrijk 0.8245882616852687
vandaag 0.8355290700976905
dat 0.841951759132166
geen 0.8503181943028063
achter 0.866184066029065
nog 0.8864794923378548
ben 0.8902828237156062
echt 0.9173534111061459
spannend 0.94074447774458
leuk 0.9443468243106802
voor 0.9638286478336311
toch

In [15]:
importances_nonpersonal = -model.coef_[0]
indices_nonpersonal = np.argsort(importances_nonpersonal)[-50:]
print('Top key features for non-personal class:')
for i in indices_nonpersonal:
    print(feature_names[i], importances_nonpersonal[i])

Top key features for non-personal class:
mri 0.5801221492655495
vragen 0.5823096451397238
onder 0.5837347380281835
kankerscreening 0.5861871843704552
kosten 0.5871630771301197
eigenlijk 0.5882990121070343
noemen 0.5886932840177055
veel 0.5978332542898198
komen 0.6004867815108227
ontlastingstest 0.605292593540843
minder 0.6053114141221034
darm 0.6053386156802437
steeds 0.6068073988755458
nederland 0.61343332288796
landelijk 0.6168988445982677
wist 0.6301936590592233
kom 0.6342611161252978
umc 0.6384716992039888
wij 0.6576930846614124
verjaardag 0.6717403428255029
mannen 0.6720939133033728
weinig 0.68592520261816
maand 0.6911520568120982
2013 0.6929926681015699
worden 0.7008088402831668
nlingelicht 0.7031591819741774
fit 0.7031591819741774
mevrouw 0.7123587162079368
dit 0.7132511828384885
aan 0.7189671378405063
vanaf 0.7290981135317629
geld 0.7593767016441992
info 0.7781456213120838
deelname 0.7991339382995544
volg 0.8193789461102312
nadelen 0.819503243631887
in 0.8228274328866892
kanker