In [1]:
import pandas as pd
import numpy as np

In [2]:
test = pd.read_csv('drugLibTest_raw.tsv', sep='\t') # Test verilerinin okunması
train = pd.read_csv('drugLibTrain_raw.tsv', sep='\t') # Eğitim verilerinin okunması

In [3]:
test.head()

Unnamed: 0.1,Unnamed: 0,urlDrugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview
0,1366,biaxin,9,Considerably Effective,Mild Side Effects,sinus infection,The antibiotic may have destroyed bacteria cau...,"Some back pain, some nauseau.",Took the antibiotics for 14 days. Sinus infect...
1,3724,lamictal,9,Highly Effective,Mild Side Effects,bipolar disorder,Lamictal stabilized my serious mood swings. On...,"Drowsiness, a bit of mental numbness. If you t...",Severe mood swings between hypomania and depre...
2,3824,depakene,4,Moderately Effective,Severe Side Effects,bipolar disorder,Initial benefits were comparable to the brand ...,"Depakene has a very thin coating, which caused...",Depakote was prescribed to me by a Kaiser psyc...
3,969,sarafem,10,Highly Effective,No Side Effects,bi-polar / anxiety,It controlls my mood swings. It helps me think...,I didnt really notice any side effects.,This drug may not be for everyone but its wond...
4,696,accutane,10,Highly Effective,Mild Side Effects,nodular acne,Within one week of treatment superficial acne ...,Side effects included moderate to severe dry s...,Drug was taken in gelatin tablet at 0.5 mg per...


In [4]:
train.head()

Unnamed: 0.1,Unnamed: 0,urlDrugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview
0,2202,enalapril,4,Highly Effective,Mild Side Effects,management of congestive heart failure,slowed the progression of left ventricular dys...,"cough, hypotension , proteinuria, impotence , ...","monitor blood pressure , weight and asses for ..."
1,3117,ortho-tri-cyclen,1,Highly Effective,Severe Side Effects,birth prevention,Although this type of birth control has more c...,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...","I Hate This Birth Control, I Would Not Suggest..."
2,1146,ponstel,10,Highly Effective,No Side Effects,menstrual cramps,I was used to having cramps so badly that they...,Heavier bleeding and clotting than normal.,I took 2 pills at the onset of my menstrual cr...
3,3947,prilosec,3,Marginally Effective,Mild Side Effects,acid reflux,The acid reflux went away for a few months aft...,"Constipation, dry mouth and some mild dizzines...",I was given Prilosec prescription at a dose of...
4,1951,lyrica,2,Marginally Effective,Severe Side Effects,fibromyalgia,I think that the Lyrica was starting to help w...,I felt extremely drugged and dopey. Could not...,See above


In [5]:
df = test.append(train, ignore_index=True, sort=False) # Test ve Eğitim verilerinin birleştirilmesi

In [6]:
df = df.drop(['Unnamed: 0'], axis = 1) # Index sütununun kaldırılması

In [7]:
df['text'] = df.select_dtypes(exclude=['int64']).apply(lambda row: ' '.join(row.values.astype(str)), axis=1) # Tüm metinlerin birleştirilmesi

In [8]:
df.rating = df.rating.apply(lambda x: 0 if x < 6 else 1) # Rating verilerinin yeniden etiketlenmesi

In [9]:
df.head()

Unnamed: 0,urlDrugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview,text
0,biaxin,1,Considerably Effective,Mild Side Effects,sinus infection,The antibiotic may have destroyed bacteria cau...,"Some back pain, some nauseau.",Took the antibiotics for 14 days. Sinus infect...,biaxin Considerably Effective Mild Side Effect...
1,lamictal,1,Highly Effective,Mild Side Effects,bipolar disorder,Lamictal stabilized my serious mood swings. On...,"Drowsiness, a bit of mental numbness. If you t...",Severe mood swings between hypomania and depre...,lamictal Highly Effective Mild Side Effects bi...
2,depakene,0,Moderately Effective,Severe Side Effects,bipolar disorder,Initial benefits were comparable to the brand ...,"Depakene has a very thin coating, which caused...",Depakote was prescribed to me by a Kaiser psyc...,depakene Moderately Effective Severe Side Effe...
3,sarafem,1,Highly Effective,No Side Effects,bi-polar / anxiety,It controlls my mood swings. It helps me think...,I didnt really notice any side effects.,This drug may not be for everyone but its wond...,sarafem Highly Effective No Side Effects bi-po...
4,accutane,1,Highly Effective,Mild Side Effects,nodular acne,Within one week of treatment superficial acne ...,Side effects included moderate to severe dry s...,Drug was taken in gelatin tablet at 0.5 mg per...,accutane Highly Effective Mild Side Effects no...


In [10]:
df.rating.value_counts() # Pozitif ve Negatif verilerin dağılımı

1    3015
0    1128
Name: rating, dtype: int64

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4143 entries, 0 to 4142
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   urlDrugName        4143 non-null   object
 1   rating             4143 non-null   int64 
 2   effectiveness      4143 non-null   object
 3   sideEffects        4143 non-null   object
 4   condition          4142 non-null   object
 5   benefitsReview     4143 non-null   object
 6   sideEffectsReview  4141 non-null   object
 7   commentsReview     4135 non-null   object
 8   text               4143 non-null   object
dtypes: int64(1), object(8)
memory usage: 291.4+ KB


In [12]:
from nltk.corpus import stopwords
from textblob import Word

sw=stopwords.words("english") # İngilizce stopwords kütüphanesinin okunması

In [13]:
def data_cleaning(column):
    column = column.str.lower() # Tüm metinler küçük harf formatına dönüştürülür
    column = column.str.replace("[^\w\s]", "", regex=True) # Noktalama işaretlerinin silinmesi
    column = column.str.replace("\d", "", regex=True) # Sayıların silinmesi
    column = column.apply(lambda x: " ".join(x for x in x.split() if x not in sw)) # Stopwords kelimelerin silinmesi
    column = column.apply(lambda x: " ".join(Word(i).lemmatize() for i in x.split())) # Lemmatization
    return column

In [14]:
df.text = data_cleaning(df.text) # Veri ön işleme sürecinin yapılması

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(lowercase=False, ngram_range=(1,2)) # TF-IDF modelinin kurulumu
X = vectorizer.fit_transform(df.text.tolist()) # TF-IDF modeli kullanılarak metinlerden vektör elde edilmesi

In [16]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,df.rating, random_state=1, test_size=0.20) # Eğitim-Test bölünmesi

In [17]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [18]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=1)
model_lr = lr.fit(X_train, y_train)
prediction_lr = model_lr.predict(X_test)
print("Accuracy Score on traning data: ", lr.score(X_train, y_train))
print("Accuracy Score on test data: ", lr.score(X_test, y_test))
print(confusion_matrix(y_test, prediction_lr))
print(classification_report(y_test, prediction_lr))
print("Accuracy Score: ", accuracy_score(y_test, prediction_lr))

Accuracy Score on traning data:  0.8931804465902233
Accuracy Score on test data:  0.8130277442702051
[[ 74 152]
 [  3 600]]
              precision    recall  f1-score   support

           0       0.96      0.33      0.49       226
           1       0.80      1.00      0.89       603

    accuracy                           0.81       829
   macro avg       0.88      0.66      0.69       829
weighted avg       0.84      0.81      0.78       829

Accuracy Score:  0.8130277442702051


In [19]:
from sklearn.linear_model import Perceptron
pr = Perceptron(random_state=1)
model_pr = pr.fit(X_train, y_train)
prediction_pr = model_pr.predict(X_test)
print("Accuracy Score on traning data: ", pr.score(X_train, y_train))
print("Accuracy Score on test data: ", pr.score(X_test, y_test))
print(confusion_matrix(y_test, prediction_pr))
print(classification_report(y_test, prediction_pr))
print("Accuracy Score: ", accuracy_score(y_test, prediction_pr))

Accuracy Score on traning data:  0.9990947495473748
Accuracy Score on test data:  0.9034981905910736
[[179  47]
 [ 33 570]]
              precision    recall  f1-score   support

           0       0.84      0.79      0.82       226
           1       0.92      0.95      0.93       603

    accuracy                           0.90       829
   macro avg       0.88      0.87      0.88       829
weighted avg       0.90      0.90      0.90       829

Accuracy Score:  0.9034981905910736


In [20]:
from sklearn.linear_model import RidgeClassifier
rc = RidgeClassifier(random_state=1)
model_rc = rc.fit(X_train, y_train)
prediction_rc = model_rc.predict(X_test)
print("Accuracy Score on traning data: ", rc.score(X_train, y_train))
print("Accuracy Score on test data: ", rc.score(X_test, y_test))
print(confusion_matrix(y_test, prediction_rc))
print(classification_report(y_test, prediction_rc))
print("Accuracy Score: ", accuracy_score(y_test, prediction_rc))

Accuracy Score on traning data:  0.9981894990947495
Accuracy Score on test data:  0.8854041013268998
[[141  85]
 [ 10 593]]
              precision    recall  f1-score   support

           0       0.93      0.62      0.75       226
           1       0.87      0.98      0.93       603

    accuracy                           0.89       829
   macro avg       0.90      0.80      0.84       829
weighted avg       0.89      0.89      0.88       829

Accuracy Score:  0.8854041013268998


In [21]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(random_state=1)
model_gbc = gbc.fit(X_train, y_train)
prediction_gbc = model_gbc.predict(X_test)
print("Accuracy Score on traning data: ", gbc.score(X_train, y_train))
print("Accuracy Score on test data: ", gbc.score(X_test, y_test))
print(confusion_matrix(y_test, prediction_gbc))
print(classification_report(y_test, prediction_gbc))
print("Accuracy Score: ", accuracy_score(y_test, prediction_gbc))

Accuracy Score on traning data:  0.9333132166566084
Accuracy Score on test data:  0.9288299155609168
[[187  39]
 [ 20 583]]
              precision    recall  f1-score   support

           0       0.90      0.83      0.86       226
           1       0.94      0.97      0.95       603

    accuracy                           0.93       829
   macro avg       0.92      0.90      0.91       829
weighted avg       0.93      0.93      0.93       829

Accuracy Score:  0.9288299155609168


In [22]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(random_state=1)
model_ada = ada.fit(X_train, y_train)
prediction_ada = model_ada.predict(X_test)
print("Accuracy Score on traning data: ", ada.score(X_train, y_train))
print("Accuracy Score on test data: ", ada.score(X_test, y_test))
print(confusion_matrix(y_test, prediction_ada))
print(classification_report(y_test, prediction_ada))
print("Accuracy Score: ", accuracy_score(y_test, prediction_ada))

Accuracy Score on traning data:  0.9197344598672299
Accuracy Score on test data:  0.9119420989143546
[[180  46]
 [ 27 576]]
              precision    recall  f1-score   support

           0       0.87      0.80      0.83       226
           1       0.93      0.96      0.94       603

    accuracy                           0.91       829
   macro avg       0.90      0.88      0.89       829
weighted avg       0.91      0.91      0.91       829

Accuracy Score:  0.9119420989143546


In [23]:
from sklearn.ensemble import BaggingClassifier
bc = BaggingClassifier(random_state=1)
model_bc = bc.fit(X_train, y_train)
prediction_bc = model_bc.predict(X_test)
print("Accuracy Score on traning data: ", bc.score(X_train, y_train))
print("Accuracy Score on test data: ", bc.score(X_test, y_test))
print(confusion_matrix(y_test, prediction_bc))
print(classification_report(y_test, prediction_bc))
print("Accuracy Score: ", accuracy_score(y_test, prediction_bc))

Accuracy Score on traning data:  0.9882317441158721
Accuracy Score on test data:  0.9252110977080821
[[190  36]
 [ 26 577]]
              precision    recall  f1-score   support

           0       0.88      0.84      0.86       226
           1       0.94      0.96      0.95       603

    accuracy                           0.93       829
   macro avg       0.91      0.90      0.90       829
weighted avg       0.92      0.93      0.92       829

Accuracy Score:  0.9252110977080821
