In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
df = pd.read_excel('/content/All_coding_anonimized data scriptie.xlsx')

In [None]:
df['Text'].replace('', np.nan, inplace=True)

df.dropna(subset=['Kankersoort'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Text'].replace('', np.nan, inplace=True)


In [None]:
y = df['Kankersoort']

In [None]:
X = df['Text']

In [None]:
vectorizer = TfidfVectorizer(max_features=1500)
X = vectorizer.fit_transform(df['Text'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = LogisticRegression()

In [None]:
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.82      0.60      0.70        53
         1.0       0.91      0.94      0.92        71
         2.0       0.80      0.95      0.87        78
         3.0       1.00      0.87      0.93        31

    accuracy                           0.86       233
   macro avg       0.88      0.84      0.85       233
weighted avg       0.86      0.86      0.85       233



In [None]:
feature_names = vectorizer.get_feature_names_out()
importances = model.coef_[0]
indices = np.argsort(importances)[-50:]
print('Top key features for general or unclear class:')
for i in indices:
  print(f"{feature_names[i]}: {importances[i]}")

Top key features for general or unclear class:
centrum: 0.40446510867459007
oud: 0.40726954285433675
roukema: 0.4113276042424861
ligt: 0.41905063011955096
maand: 0.42770772872759194
terwijl: 0.43135533440356877
ouderen: 0.43341489879223094
buiten: 0.43623221378920657
belang: 0.43764975692397784
elk: 0.43913102006955146
voordeel: 0.4471019954461475
dacht: 0.4532106733448595
nadelen: 0.4548167645354371
oproep: 0.4584110363279789
buurvrouw: 0.45903131745348325
chirurg: 0.460638133993635
oost: 0.46278560212326053
altijd: 0.46417170991897383
100: 0.468804581106262
post: 0.47645359425227846
steeds: 0.47970393493500985
achter: 0.4809427355255275
zo: 0.49374410387041856
en: 0.49771465935940806
actie: 0.5039711197262061
gewoon: 0.5089098227938305
ggd: 0.5121074541564384
overheid: 0.5206292203231132
1938: 0.5242122847689157
hoe: 0.5277853706988492
komen: 0.5406397233627733
vind: 0.5424840203016623
deze: 0.5494660666539116
brief: 0.5499359737576555
zelftest: 0.5519348550420814
zoveel: 0.559124541

In [None]:
feature_names = vectorizer.get_feature_names_out()
importances = model.coef_[1]
indices = np.argsort(importances)[-50:]
print('Top key features for cervical cancer class:')
for i in indices:
  print(f"{feature_names[i]}: {importances[i]}")

Top key features for cervical cancer class:
te: 0.366059681260071
60: 0.3723627505904846
hand: 0.3730852674837822
ook: 0.3749069026087219
zelf: 0.37626571182237506
weet: 0.3818222828743786
wij: 0.38544325683647745
halen: 0.3955201794152743
was: 0.40541083876470724
meer: 0.40904461062895525
vriendin: 0.4120540440009905
dames: 0.41273173106840216
zin: 0.4166154092909473
mis: 0.4199848970149435
gynaecoloog: 0.42223313268907314
wat: 0.42664298239256715
tegenwoordig: 0.4408217768554765
had: 0.4502586057084844
brief: 0.4502970269876205
baarmoederhals: 0.46017618797507354
binnenkort: 0.46398366145538644
doen: 0.47700466333593244
nieuwe: 0.4790896970711076
laten: 0.49340749057800354
oproep: 0.4951423641286385
kom: 0.4970599661223393
klachten: 0.5192970520169284
deelname: 0.523592309640541
gezellig: 0.525604331259574
mij: 0.5309545815253309
volgende: 0.5367862856409075
is: 0.5394480144489635
afwijkend: 0.5728733505343538
bijna: 0.5745150373688472
administratie: 0.5874414579008558
maken: 0.60151

In [None]:
feature_names = vectorizer.get_feature_names_out()
importances = model.coef_[2]
indices = np.argsort(importances)[-50:]
print('Top key features for breast cancer class:')
for i in indices:
  print(f"{feature_names[i]}: {importances[i]}")

Top key features for breast cancer class:
plat: 0.38545678328717686
joy: 0.3855529500971033
pletten: 0.39460464728003436
iets: 0.3952203572653547
gelukkig: 0.39811026610801853
kunnen: 0.4000037661938025
verschil: 0.40953093883370634
claustro: 0.41202820855153965
pletmachine: 0.41202820855153965
getsie: 0.41202820855153965
hele: 0.4147301548184703
in: 0.42934911585605107
enkel: 0.4297862631066628
deed: 0.4319006647534066
worden: 0.43566034244966156
mijn: 0.4417924920413244
tettenpletter: 0.45570492783879574
foto: 0.4621042874627245
mama: 0.4636460610001784
straks: 0.4714144614912999
vrouw: 0.4735528025895967
lief: 0.4738349728475789
truck: 0.4861635176537159
au: 0.49241105658820533
gehad: 0.5035755533002485
vrouwen: 0.5112112952679458
geplet: 0.5345506649078955
mri: 0.5437171668080799
de: 0.5509310826446203
ga: 0.5535614182527886
zou: 0.5568955766598492
pijn: 0.5579362618428638
50: 0.5687891422778221
spannend: 0.577617358294255
mammo: 0.5832984129613838
fijn: 0.5885942513887726
hoop: 0.

In [None]:
feature_names = vectorizer.get_feature_names_out()
importances = model.coef_[3]
indices = np.argsort(importances)[-50:]
print('Top key features for colorectal cancer class:')
for i in indices:
  print(f"{feature_names[i]}: {importances[i]}")

Top key features for colorectal cancer class:
rt: 0.3489748468532891
info: 0.35081247104793856
nederlanders: 0.35601513603615365
zaterdag: 0.36315341440626503
bloedsporen: 0.36651792602159283
waar: 0.3689850080378222
000: 0.3697442647047845
aan: 0.3718939751373478
het: 0.3784914944351983
één: 0.3878121143073254
55: 0.389090963276201
reden: 0.3905640420194177
test: 0.4061619206234984
coloscopie: 0.4189084629421836
twitterspreekuur: 0.4192815244302785
onder: 0.42601140286143513
preventie: 0.42760215513658323
ptdarmkanker: 0.4347791557780786
twee: 0.43557276729091343
willen: 0.4396656653133324
2014: 0.4551951311833781
uitgevoerd: 0.4564751772211188
check: 0.45912740682390757
patiënten: 0.4630318423724707
tijdvoormax: 0.4673284204581295
1000: 0.4750665275493258
keuze: 0.48410557461339915
onderwerpen: 0.4963195310902784
ddkscreening: 0.5055401733125617
buisje: 0.5072526698289679
in: 0.5182980722983197
benieuwd: 0.5268700411844274
stront: 0.5392469289746561
vanaf: 0.5397567862329364
per: 0.5