In [282]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from trnlp import TrnlpWord

In [283]:
# Load the symptom-message dataset; the path is relative to the kernel's working directory.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a row index that was
# written into the CSV — confirm, and consider reading it with index_col=0.
df=pd.read_csv("./data/policlinic_dataset.csv")
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,Policlinic,Message
0,Nöroloji,Başım ağrıyor
1,Nöroloji,Başımda basınç hissi
2,Nöroloji,Başım dönüyor
3,Nöroloji,Unutkanlık yaşıyorum
4,Nöroloji,Kulak çınlaması


In [284]:
# Number of messages per policlinic, to see how evenly the target classes are represented.
df["Policlinic"].value_counts()

Policlinic
Ortopedi              110
KBB                   109
Göz Hastalıkları      103
Kardiyoloji           102
Psikiyatri            102
Dahiliye               99
Nöroloji               97
Dermatoloji            97
Onkoloji               96
Diş Hekimliği          93
Üroloji                92
Göğüs Hastalıkları     90
Name: count, dtype: int64

In [285]:
# Turkish function words removed by CountVectorizer before counting.
#
# Every entry is a single, unique, lower-case word. The earlier version of this list also held
# duplicates, an empty string, the punctuation marks "." and "?", and the two-word phrase
# "ne zaman"; none of those can ever equal a single token produced by the vectorizer, so they
# were dropped ("ne" is still listed on its own).
# NOTE(review): the one-letter entries ("a", "e", "o") are kept as-is, although CountVectorizer's
# default token pattern only emits tokens of two or more characters.
stop_words = [
    "a", "able", "acaba", "alt", "altında", "ama", "ancak", "artık", "aslında", "bazen", "bazı",
    "biri", "birkaç", "birçok", "birtakım", "biz", "bize", "bizim", "büyük", "böyle", "de",
    "değil", "diğer", "dolayı", "dönem", "dönemi", "e", "fakat", "gibi", "galiba", "hepsi",
    "hepsini", "her", "herhangi", "hiç", "hiçbir", "için", "ilgili", "ise", "kadar", "kendi",
    "kendine", "kimi", "ne", "neden", "nedenle", "nerede", "niçin", "o", "olarak", "onun",
    "sonra", "şey", "şimdi", "ta", "tıpkı", "ve", "veya", "ya", "yani", "yerine", "zaten",
    "üzere", "başka", "bayağı", "belli", "ben", "beni", "benden", "benim", "bir", "birbirini",
    "bu", "bunu", "bundan", "bunun", "ciddi", "çok", "çokça", "da", "daha", "dahil", "geri",
    "hadi", "hangi", "hani", "hem", "hep", "ki", "kendisini", "kim", "tabii", "tüm", "mesela",
    "mı", "mi", "mu", "mü",
]


In [286]:
# Features: the complaint text. Take an explicit copy so that later in-place edits to X
# cannot write through to df["Message"] and the raw column stays available.
X = df["Message"].copy()
# Target: the policlinic label of each message.
y = df["Policlinic"]

In [287]:
# Single shared trnlp analyser, reused for every word: setword() loads a word and
# str(nlp) reads the analysis of the most recently loaded word back.
nlp = TrnlpWord()


def lemmatize(word):
    """Return the part of trnlp's analysis of ``word`` that precedes the first "(".

    Judging by the notebook output this is the word's stem (e.g. "baş" for a
    possessive form of "baş") — TODO confirm against the trnlp documentation.

    Parameters
    ----------
    word : str
        A single word to analyse.

    Returns
    -------
    str
        The text of ``str(nlp)`` up to (not including) the first "(". If the
        analysis contains no "(", the whole string is returned.
        NOTE(review): this is presumably an empty string when trnlp cannot
        analyse the word — verify.
    """
    nlp.setword(word)
    return str(nlp).split("(")[0]

def _lemmatize_message(message):
    """Lemmatize each whitespace-separated word of one message and re-join with single spaces.

    ``split()`` (rather than ``split(" ")``) collapses runs of whitespace, so the
    analyser is never handed an empty string.
    """
    return " ".join(lemmatize(word) for word in message.split())


# Build the lemmatized corpus from the raw column instead of overwriting X element by element.
# The cell can be re-run without lemmatizing already-lemmatized text, it does not rely on the
# index being 0..n-1, and df["Message"] is left untouched.
X = df["Message"].map(_lemmatize_message)
X

0                        baş ağrı
1                baş basınç hissî
2                         baş dön
3                       unut yaşa
4                       kulak çın
                  ...            
1185            cilt sürek kız ol
1186            ayak tırnak  oldu
1187    cilt tahriş nedeniyle kız
1188               el kabarcık ol
1189         cilt beyaz kabuk art
Name: Message, Length: 1190, dtype: object

In [288]:
# Bag-of-words counts over the lemmatized messages, ignoring the tokens listed in stop_words.
# NOTE(review): the vocabulary is fitted on every row of X, before the train/test split below,
# so held-out messages also contribute to the feature set.
vectorizer = CountVectorizer(stop_words=stop_words)
x_vec = vectorizer.fit_transform(X)



In [289]:
# Hold out 30% of the rows for evaluation; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(x_vec, y, test_size=0.3, random_state=42)

In [290]:
def evaluateModel(classifier, X_test, y_true=None):
    """Return the accuracy of ``classifier`` on ``X_test``.

    Parameters
    ----------
    classifier
        A fitted estimator exposing ``predict``.
    X_test
        Feature matrix to predict on.
    y_true : optional
        True labels aligned with ``X_test``. When omitted, the notebook-level
        ``y_test`` is used, which keeps existing two-argument calls working.

    Returns
    -------
    float
        Fraction of rows whose predicted label equals the true label.
    """
    if y_true is None:
        # Backward-compatible default: score against the global hold-out labels.
        y_true = y_test
    y_pred = classifier.predict(X_test)
    return accuracy_score(y_true, y_pred)

In [291]:
# Score the training rows with Local Outlier Factor (40 neighbours); only the training split
# is passed in, so the test split is left unfiltered.
lof=LocalOutlierFactor(n_neighbors=40)
outliers=lof.fit_predict(X_train)
# Keep every row that was not labelled -1.
mask=outliers!=-1

# NOTE(review): this overwrites X_train / y_train, so re-running the cell on its own filters
# the already-filtered data a second time.
X_train,y_train=X_train[mask,:],y_train[mask]

In [292]:
# Vocabulary size, i.e. the number of feature names the fitted vectorizer exposes.
print(len(vectorizer.get_feature_names_out()))

458


In [293]:
# Random forest with 120 trees, fitted on the outlier-filtered training rows;
# random_state is fixed so the fit is repeatable.
rfc = RandomForestClassifier(n_estimators=120, random_state=42)
rfc.fit(X_train, y_train)

In [294]:
# Accuracy of the random forest on the held-out split.
accuracy=evaluateModel(rfc,X_test)
print(f"Accuracy with Random Forest Classifier {accuracy}")

Accuracy with Random Forest Classifier 0.7675070028011205


In [295]:
# k-nearest-neighbours classifier (k=7) fitted on the same training rows as the other models.
knn=KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train,y_train)

In [296]:
# Accuracy of the KNN classifier on the held-out split.
accuracy=evaluateModel(knn,X_test)
print(f"Accuracy with KNN classifier {accuracy}")

Accuracy with KNN classifier 0.6358543417366946


In [297]:
# Logistic regression with scikit-learn's default settings, fitted on the same training rows.
lr=LogisticRegression()
lr.fit(X_train,y_train)

# Accuracy of the logistic regression on the held-out split.
accuracy=evaluateModel(lr,X_test)
print(f"Accuracy with Logistic Regression {accuracy}")


Accuracy with Logistic Regression 0.7703081232492998


In [298]:
def askUser(user_input,model):
    """Predict the policlinic for one free-text complaint.

    The text is lemmatized word by word with ``lemmatize`` and vectorized with
    the already-fitted ``vectorizer`` before being passed to ``model``.

    Parameters
    ----------
    user_input : str
        The complaint as typed by the user.
    model
        A fitted classifier exposing ``predict``.

    Returns
    -------
    The result of ``model.predict`` for the single input row; the predicted
    label is its first element.
    """
    # split() instead of split(" "): leading, trailing or repeated whitespace in typed
    # input would otherwise produce empty "words" that get sent to the analyser.
    lemmatized_input = " ".join(lemmatize(word) for word in user_input.split())

    user_input_vec = vectorizer.transform([lemmatized_input])
    return model.predict(user_input_vec)

In [299]:
# Read one free-text complaint from the keyboard.
# NOTE(review): because the text is typed in, the predictions in the cells below are not
# reproducible under "Restart & Run All".
user_input = input("Lütfen Şikayetinizi girin: ")

Lütfen Şikayetinizi girin:  omzum çıktı


In [300]:
# Predict with the random forest; askUser returns the model's predictions and [0] is the label.
user_pred=askUser(user_input,rfc)
print(f"Predicted Policlinic with Random Forest Classifier: {user_pred[0]}")

Predicted Policlinic with Random Forest Classifier: Psikiyatri


In [301]:
# Same complaint, predicted with the KNN classifier.
user_pred=askUser(user_input,knn)
print(f"Predicted Policlinic with KNN: {user_pred[0]}")

Predicted Policlinic with KNN: Ortopedi


In [302]:
# Same complaint, predicted with the logistic regression model.
user_pred=askUser(user_input,lr)
print(f"Predicted Policlinic with Logistic Regression: {user_pred[0]}")

Predicted Policlinic with Logistic Regression: Dermatoloji
