<a href="https://colab.research.google.com/github/2j0123/WASSUP_Project_Team7_2/blob/JS/Drug_Data_EDA_Preprocess_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Drug Dataset Preprocess

In [2]:
import pandas as pd

In [3]:
import os

# Location of the raw training CSV. The default is the path this notebook was
# originally run with; set the DRUG_DATA_CSV environment variable to point at
# the file on another machine instead of editing this cell.
DRUG_DATA_CSV = os.environ.get(
    'DRUG_DATA_CSV',
    '/home/kdt-admin/miniconda3/envs/JS/WASSUP_Project_Team7_2/drugsComTrain_raw.csv',
)
drug_data = pd.read_csv(DRUG_DATA_CSV)

## 전처리

### 결측치 제거

In [4]:
# Drop every row that has at least one missing value, then show the number of
# remaining NaNs per column (all counts are expected to be 0 afterwards).
drug_data = drug_data.dropna(axis=0, how='any')
drug_data.isnull().sum()

uniqueID       0
drugName       0
condition      0
review         0
rating         0
date           0
usefulCount    0
dtype: int64

### 중복 행 검사

In [None]:
# Flag rows whose values in all columns repeat an earlier row, then count them.
duplicate_mask = drug_data.duplicated()
duplicate_mask.sum()

0

### 'review' 컬럼 전처리

#### 숫자와 영어 알파벳을 제외한 문자를 공백으로 대체

In [5]:
import re

def extract_word(text):
    """Replace every character that is not an ASCII letter or digit with a space.

    The substitution is one-for-one, so the returned string has the same
    length as ``text``.
    """
    return re.sub('[^a-zA-Z0-9]', ' ', text)

#### 소문자로 변환

In [6]:
def fix_lower(text):
    """Return ``text`` converted to lowercase."""
    return text.lower()

#### 불용어 제거

In [7]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('punkt') # tokenizer data; presumably what word_tokenize needs - confirm against the installed NLTK version
nltk.download('stopwords') # stop-word lists read via nltk.corpus.stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# English stop words, minus the negations 'no' and 'not', which are kept in
# the reviews on purpose.
english_stopwords = set(stopwords.words('english')) - {'no', 'not'}

def del_stopwords(tokens):
    """Return the tokens that are alphanumeric and not in ``english_stopwords``."""
    kept = []
    for token in tokens:
        if token in english_stopwords or not token.isalnum():
            continue
        kept.append(token)
    return kept

#### 숫자와 수치형 단위 제거

In [9]:
def del_numericalunits(text):
    """Remove numbers and the text attached to them from ``text``.

    A match starts at a word boundary with an integer or decimal number,
    optionally followed by one whitespace character, and then extends over
    the following run of non-whitespace characters. That covers "5mg" and
    "5 mg", but it also removes whatever word follows a bare number
    (e.g. "5 then" is removed entirely).
    """
    number_with_unit = re.compile(r'\b\d+(\.\d+)?\s?\S*')
    return number_with_unit.sub('', text)

#### Lemmatization


In [10]:
from nltk.stem import WordNetLemmatizer

# NLTK resources fetched for this step. The tagger model is downloaded here,
# but nothing in this cell calls a POS tagger.
for resource in ('wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

lemmatizer = WordNetLemmatizer()

def fix_lemmatization(tokens):
    """Return a list with each token replaced by its WordNet lemma.

    ``lemmatize`` is called without a part-of-speech argument.
    NOTE(review): NLTK then uses its default POS (noun, per the NLTK docs),
    which is presumably why "less" appears as "le" in the sample output -
    confirm whether POS-aware lemmatization is wanted.
    """
    return list(map(lemmatizer.lemmatize, tokens))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


#### 토큰화

In [11]:
from nltk.tokenize import word_tokenize

def fix_tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and return them."""
    tokens = word_tokenize(text)
    return tokens

#### 공백 수정 및 토큰 연결

In [12]:
def join_tokens(tokens):
    """Concatenate ``tokens`` into a single string separated by single spaces."""
    separator = ' '
    return separator.join(tokens)

#### 'review' 전처리 통합 함수

In [13]:
def preprocess_review(text):
    """Turn one raw review string into a list of cleaned, lemmatized tokens.

    Steps, in order:
      1. replace everything except ASCII letters and digits with spaces
      2. lowercase
      3. remove numbers and the text attached to them
      4. tokenize
      5. drop stop words and non-alphanumeric tokens
      6. lemmatize

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    list of str
        The processed tokens, in their original order.

    NOTE(review): step 1 already turns '.' into a space, so a decimal such as
    "1.5 mg" reaches step 3 as "1 5 mg"; the number pattern then removes
    "1 5" and leaves "mg" behind. Confirm whether steps 1 and 3 should be
    swapped.
    """
    cleaned = extract_word(text)           # non-alphanumerics -> spaces
    cleaned = fix_lower(cleaned)           # lowercase
    cleaned = del_numericalunits(cleaned)  # strip numbers and attached units
    tokens = fix_tokenize(cleaned)         # tokenize
    tokens = del_stopwords(tokens)         # drop stop words / non-alphanumerics
    # Return the lemmatized tokens directly. The previous version joined them
    # into one string and immediately split it again; every token that
    # survives del_stopwords is a non-empty alphanumeric word, so that round
    # trip produced the same list.
    return fix_lemmatization(tokens)

#### 'review' 컬럼에 전처리 통합 함수 적용

In [14]:
# Run the full review pipeline on every row; each cell of 'fixed_review'
# holds the token list returned by preprocess_review.
drug_data['fixed_review'] = drug_data['review'].map(preprocess_review)

In [15]:
# Preview the first five rows, now including the 'fixed_review' column.
drug_data.head(n=5)

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,fixed_review
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,"[no, side, effect, take, combination, bystolic..."
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,"[son, halfway, fourth, week, intuniv, became, ..."
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,"[used, take, another, oral, contraceptive, cyc..."
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,"[first, time, using, form, birth, control, gla..."
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,"[suboxone, completely, turned, life, around, f..."


In [16]:
# Print the token lists of the first 10 rows. Iterate by position rather than
# by label: rows were removed with dropna() and the index was never reset, so
# drug_data['fixed_review'][i] raises KeyError if any of the labels 0..9 was
# dropped.
for position, review_tokens in enumerate(drug_data['fixed_review'].head(10), start=1):
    print(position, review_tokens)

1 ['no', 'side', 'effect', 'take', 'combination', 'bystolic', 'fish', 'oil']
2 ['son', 'halfway', 'fourth', 'week', 'intuniv', 'became', 'concerned', 'began', 'last', 'week', 'started', 'taking', 'highest', 'dose', 'two', 'day', 'could', 'hardly', 'get', 'bed', 'cranky', 'slept', 'nearly', 'drive', 'home', 'school', 'vacation', 'unusual', 'called', 'doctor', 'monday', 'morning', 'said', 'stick', 'day', 'see', 'school', 'getting', 'morning', 'last', 'two', 'day', 'problem', 'free', 'much', 'agreeable', 'ever', 'le', 'emotional', 'good', 'thing', 'le', 'cranky', 'remembering', 'thing', 'overall', 'behavior', 'better', 'tried', 'many', 'different', 'medication', 'far', 'effective']
3 ['used', 'take', 'another', 'oral', 'contraceptive', 'cycle', 'happy', 'light', 'period', 'max', 'no', 'side', 'effect', 'contained', 'hormone', 'gestodene', 'not', 'available', 'u', 'switched', 'lybrel', 'ingredient', 'similar', 'pill', 'ended', 'started', 'lybrel', 'immediately', 'first', 'day', 'period', '

#### 단어의 빈도 수

In [17]:
from collections import Counter

# Flatten the per-review token lists into one long list of tokens.
all_tokens = []
for tokens_of_review in drug_data['fixed_review']:
    all_tokens.extend(tokens_of_review)

# (word, count) pairs ordered from most to least frequent.
word_counts = Counter(all_tokens).most_common()

# Rank-based integer code per word: the most frequent word gets 1.
word_encoding = {word: rank for rank, (word, _count) in enumerate(word_counts, start=1)}

print("전체   : ", len(word_encoding))

# Number of distinct words that occur three times or fewer.
less_three = len([count for _word, count in word_counts if count <= 3])
print("3 이하 : ", less_three)

전체   :  42173
3 이하 :  23716


In [18]:
# The ten most frequent (word, count) pairs.
word_counts[0:10]

[('not', 87734),
 ('day', 74296),
 ('no', 59775),
 ('effect', 55980),
 ('side', 53551),
 ('taking', 50937),
 ('pain', 48178),
 ('time', 47233),
 ('take', 46878),
 ('first', 44729)]

In [19]:
# The first ten entries of the word -> rank mapping, in insertion order.
{word: rank for word, rank in list(word_encoding.items())[:10]}

{'not': 1,
 'day': 2,
 'no': 3,
 'effect': 4,
 'side': 5,
 'taking': 6,
 'pain': 7,
 'time': 8,
 'take': 9,
 'first': 10}

#### Padding

In [20]:
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit a Keras Tokenizer on the token lists and convert each review into a
# sequence of integer ids.
token_lists = drug_data['fixed_review']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(token_lists)
sequences = tokenizer.texts_to_sequences(token_lists)

# Pad to the token count of the longest review, so every row of the padded
# array has max_length entries.
max_length = max(map(len, token_lists))

# padding='post' appends the zeros after the ids (see the printed array).
padded_review = pad_sequences(sequences, maxlen=max_length, padding='post')

# Inspect the result
print(padded_review)

# Store each padded row as a plain Python list in the frame.
drug_data['padded_review'] = padded_review.tolist()

[[   3    5    4 ...    0    0    0]
 [ 474 2871 1372 ...    0    0    0]
 [  69    9  163 ...    0    0    0]
 ...
 [1863  389  428 ...    0    0    0]
 [ 912   23  893 ...    0    0    0]
 [ 314  353 1142 ...    0    0    0]]


In [22]:
# Print the padded id sequences of the first 10 rows. Iterate by position
# rather than by label: rows were removed with dropna() and the index was
# never reset, so drug_data['padded_review'][i] raises KeyError if any of the
# labels 0..9 was dropped.
for position, padded_ids in enumerate(drug_data['padded_review'].head(10), start=1):
    print(position, padded_ids)

1 [3, 5, 4, 9, 639, 2463, 3293, 1205, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#### 벡터화

In [None]:
#

## 파생변수 'sentiment' 생성

### rating(평점)이 8 이상이면 1(긍정), 5 이상 7 이하이면 2(보통), 1 이상 4 이하이면 3(부정)
※ 위 구간은 describe() 결과를 근거로 정했지만, 75% 분위수가 10이기 때문에 이 기준이 적절하지 않을(위배될) 가능성이 있음

In [23]:
def _rating_to_sentiment(rating):
    """Map a rating to a sentiment class: 1 for >= 8, 2 for >= 5, otherwise 3."""
    if rating >= 8:
        return 1
    if rating >= 5:
        return 2
    return 3

drug_data['sentiment'] = drug_data['rating'].apply(_rating_to_sentiment)

In [24]:
# Preview the first ten rows, now including the 'sentiment' column.
drug_data.head(n=10)

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,fixed_review,padded_review,sentiment
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,"[no, side, effect, take, combination, bystolic...","[3, 5, 4, 9, 639, 2463, 3293, 1205, 0, 0, 0, 0...",1
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,"[son, halfway, fourth, week, intuniv, became, ...","[474, 2871, 1372, 15, 2763, 274, 1007, 241, 57...",1
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,"[used, take, another, oral, contraceptive, cyc...","[69, 9, 163, 972, 1351, 346, 154, 184, 16, 159...",2
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,"[first, time, using, form, birth, control, gla...","[10, 8, 97, 507, 60, 43, 445, 40, 247, 10, 545...",1
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,"[suboxone, completely, turned, life, around, f...","[721, 160, 617, 30, 124, 19, 1964, 14980, 308,...",1
5,155963,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2,28-Nov-15,43,"[day, started, work, rock, hard, erection, how...","[2, 12, 18, 1916, 237, 891, 111, 14981, 70, 29...",3
6,165907,Levonorgestrel,Emergency Contraception,"""He pulled out, but he cummed a bit in me. I t...",1,7-Mar-17,5,"[pulled, cummed, bit, took, plan, b, later, to...","[1558, 13131, 183, 27, 298, 394, 138, 27, 306,...",3
7,102654,Aripiprazole,Bipolar Disorde,"""Abilify changed my life. There is hope. I was...",10,14-Mar-15,32,"[abilify, changed, life, hope, zoloft, clonidi...","[794, 212, 30, 188, 331, 1951, 10, 12, 794, 43...",1
8,74811,Keppra,Epilepsy,""" I Ve had nothing but problems with the Kepp...",1,9-Aug-16,11,"[nothing, problem, keppera, constant, shaking,...","[87, 71, 13132, 303, 1200, 352, 80, 251, 80, 2...",3
9,48928,Ethinyl estradiol / levonorgestrel,Birth Control,"""I had been on the pill for many years. When m...",8,8-Dec-16,1,"[pill, many, year, doctor, changed, rx, chatea...","[17, 120, 20, 24, 212, 1438, 3150, 197, 36, 44...",1


## 파생변수 'fixed_date' 생성

### fixed_date와 rating의 상관관계

In [25]:
# Parse the raw 'date' strings into pandas datetimes.
drug_data['fixed_date'] = pd.to_datetime(drug_data['date'])

# Take an explicit copy of the two columns. Adding columns to a plain
# drug_data[[...]] selection triggers pandas' SettingWithCopyWarning, because
# the selection may be a view of drug_data.
rating_date_data = drug_data[['rating', 'fixed_date']].copy()
rating_date_data['year_month'] = rating_date_data['fixed_date'].dt.to_period('M')
# Month index (year * 12 + month), so that consecutive months differ by 1.
rating_date_data['year_month_numeric'] = (
    rating_date_data['fixed_date'].dt.year * 12 + rating_date_data['fixed_date'].dt.month
)

# Mean rating per month index.
mean_rating_by_month = rating_date_data.groupby('year_month_numeric')['rating'].mean()
# Correlation between the month index and the individual ratings
# (Series.corr with its default method).
correlation = rating_date_data['year_month_numeric'].corr(rating_date_data['rating'])

print("상관계수:", correlation)

  drug_data['fixed_date'] = pd.to_datetime(drug_data['date'])


상관계수: -0.19671610862182823


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_date_data['year_month'] = rating_date_data['fixed_date'].dt.to_period('M')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_date_data['year_month_numeric'] = rating_date_data['fixed_date'].dt.year * 12 + rating_date_data['fixed_date'].dt.month


In [26]:
# Print the correlation coefficient between the month index and the rating
# again (the label string means "correlation coefficient").
print("상관계수:", correlation)

상관계수: -0.19671610862182823


In [27]:
# Preview the first five rows, now including the 'fixed_date' column.
drug_data.head(n=5)

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,fixed_review,padded_review,sentiment,fixed_date
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,"[no, side, effect, take, combination, bystolic...","[3, 5, 4, 9, 639, 2463, 3293, 1205, 0, 0, 0, 0...",1,2012-05-20
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,"[son, halfway, fourth, week, intuniv, became, ...","[474, 2871, 1372, 15, 2763, 274, 1007, 241, 57...",1,2010-04-27
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,"[used, take, another, oral, contraceptive, cyc...","[69, 9, 163, 972, 1351, 346, 154, 184, 16, 159...",2,2009-12-14
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,"[first, time, using, form, birth, control, gla...","[10, 8, 97, 507, 60, 43, 445, 40, 247, 10, 545...",1,2015-11-03
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,"[suboxone, completely, turned, life, around, f...","[721, 160, 617, 30, 124, 19, 1964, 14980, 308,...",1,2016-11-27


## 모델링에 불필요한 컬럼 제거 후 새로운 데이터 프레임 생성

In [286]:
#AP_drug_data = drug_data.drop(columns=['uniqueID', 'review', 'date'])