<a href="https://colab.research.google.com/github/2j0123/WASSUP_Project_Team7_2/blob/JS/Drug_Data_EDA_Preprocess_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Drug Dataset Preprocess

In [235]:
import pandas as pd

In [236]:
drug_data = pd.read_csv('/home/kdt-admin/miniconda3/envs/JS/WASSUP_Project_Team7_2/drugsComTrain_raw.csv')

## 전처리

### 결측치 제거

In [237]:
drug_data = drug_data.dropna()
drug_data.isna().sum()

uniqueID       0
drugName       0
condition      0
review         0
rating         0
date           0
usefulCount    0
dtype: int64

### 전처리 함수

#### 숫자와 영어 알파벳을 제외한 문자를 공백으로 대체

In [238]:
import re

def extract_word(text):
    non_english = re.compile('[^a-zA-Z0-9]')
    return non_english.sub(' ', text)

#### 소문자로 변환

In [239]:
def fix_lower(text):
    fixed_text = text.lower() # 소문자로 변환
    return fixed_text

#### 불용어 제거

In [240]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('punkt') #token
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/kdt-admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/kdt-
[nltk_data]     admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [241]:
english_stopwords = set(stopwords.words('english'))
english_stopwords.discard('no') # no를 불용어에 포함시키지 않음
english_stopwords.discard('not') # not를 불용어에 포함시키지 않음

def del_stopwords(tokens):
    return [token for token in tokens if token not in english_stopwords and token.isalnum()]

#### 숫자와 수치형 단위 제거

In [242]:
def del_numericalunits(text):
    non_numericalunits = r'\b\d+(\.\d+)?\s?\S*' # 숫자와 단위 정규 표현식
    return re.sub(non_numericalunits, '', text)

#### Lemmatization


In [243]:
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()

def fix_lemmatization(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

[nltk_data] Downloading package wordnet to /home/kdt-
[nltk_data]     admin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/kdt-admin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


#### 토큰화

In [244]:
from nltk.tokenize import word_tokenize

def fix_tokenize(text):
    return word_tokenize(text)

#### 공백 수정 및 토큰 연결

In [245]:
def join_tokens(tokens):
    return ' '.join(tokens) # 공백으로 연결

#### 공백 제거

In [246]:
def del_space(text):
    return text.replace(" ", "")

#### html 코드 제거

In [247]:
# from bs4 import BeautifulSoup

def del_html(text):
    # return BeautifulSoup(text, 'html.parser').get_text() # </span>만 제거되고 문장은 제거되지 않음.
    if '</span> users found this comment helpful.' in text:
        return ''
    else:
        return text

#### apostrophe 제거

In [248]:
def remove_apostrophe(text):
    return text.replace("'", "")

#### Stemming

In [249]:
from nltk.stem import PorterStemmer, LancasterStemmer

porter_stemmer = PorterStemmer()

lancaster_stemmer = LancasterStemmer()

##### Porter Stemming 

In [250]:
def porter_stemming(review):
    return [porter_stemmer.stem(word) for word in review]

##### Lancaster stemming

In [251]:
def lancaster_stemming(review):
    return [lancaster_stemmer.stem(word) for word in review]

#### Padding

In [252]:
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()

##### Padding 적용 함수

In [253]:
def convert_padding(df, column_name):
    tokenizer.fit_on_texts(df[column_name])
    sequences = tokenizer.texts_to_sequences(df[column_name])
    max_length = max(len(sentence) for sentence in df[column_name]) # 최대 문장 길이
    padded_review = pad_sequences(sequences, maxlen=max_length, padding='post') # Padding
    
    return padded_review.tolist()

#### Embedding

In [254]:
import numpy as np

embedding_index = {}
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

In [255]:
def convert_embedding(df, column_name):
    embedded_sequences = []
    
    for sequence in df[column_name]:
        embedded_sequence = []
        for word_index in sequence:
            if word_index != 0:  # 패딩된 값(0)인 경우 무시
                word = tokenizer.index_word[word_index]
                embedding_vector = embedding_index.get(word)
                if embedding_vector is not None:
                    embedded_sequence.append(embedding_vector)
        embedded_sequences.append(embedded_sequence)
    
    return embedded_sequences 

### 'review' 전처리

#### 'review' 전처리 통합 함수


In [256]:
def preprocess_review(text):
    fixed_text = del_html(text) # html 코드 제거
    fixed_text = remove_apostrophe(fixed_text) # apostrophe 제거
    fixed_text = extract_word(text) # 숫자와 영어 알파벳을 제외한 문자를 공백으로 대체
    fixed_text = fix_lower(fixed_text) # 소문자로 변환
    fixed_text = del_numericalunits(fixed_text) # 숫자와 수치형 단위 제거
    tokens = fix_tokenize(fixed_text) # 토큰화
    filtered_tokens = del_stopwords(tokens) # 불용어 제거
    lemmatized_tokens = fix_lemmatization(filtered_tokens) # Lemmatization
    fixed_text = join_tokens(lemmatized_tokens) # 공백 수정 및 토큰 연결
    return fixed_text.split()

#### 'review' 컬럼 전처리 통합 함수 적용


In [257]:
drug_data['fixed_review'] = drug_data['review'].apply(preprocess_review)

In [258]:
drug_data.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,fixed_review
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,"[no, side, effect, take, combination, bystolic..."
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,"[son, halfway, fourth, week, intuniv, became, ..."
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,"[used, take, another, oral, contraceptive, cyc..."
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,"[first, time, using, form, birth, control, gla..."
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,"[suboxone, completely, turned, life, around, f..."


#### 'review' 컬럼 Stemming 적용

In [259]:
#drug_data['fixed_review'] = drug_data['fixed_review'].apply(lancaster_stemming)
drug_data['fixed_review'] = drug_data['fixed_review'].apply(porter_stemming)

In [260]:
print("stemming result length : ", len(set(word for review in drug_data['fixed_review'] for word in review)))

stemming result length :  32625


#### 'review' 컬럼 Padding 적용

In [261]:
drug_data['fixed_review'] = convert_padding(drug_data, 'fixed_review')

In [262]:
drug_data.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,fixed_review
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,"[6, 10, 4, 1, 476, 2135, 2430, 1145, 0, 0, 0, ..."
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,"[474, 2482, 1289, 17, 2368, 284, 579, 241, 46,..."
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,"[16, 1, 166, 874, 954, 348, 153, 190, 18, 1405..."
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,"[13, 12, 16, 454, 67, 43, 450, 48, 257, 13, 34..."
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,"[713, 128, 363, 33, 131, 11, 1750, 866, 324, 1..."


#### 'review' 컬럼 Embedding 적용

In [263]:
drug_data['fixed_review'] = convert_embedding(drug_data, 'fixed_review')

In [264]:
drug_data.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,fixed_review
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,"[[-0.16843, -0.037651, -0.17304, -0.069757, -0..."
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,"[[-0.13661, -0.21326, 0.20525, -0.54117, -0.13..."
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,"[[-0.10515, 0.13407, 0.13839, -0.647, 0.1243, ..."
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,"[[-0.15561, 0.50069, -0.23022, -0.33005, -0.23..."
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,"[[-0.053251, 0.1133, -0.031874, -0.23255, -0.2..."


### 'condition' 전처리

#### 'condition' 전처리 통합 함수

In [265]:
def preprocess_condition(text):
    fixed_text = del_html(text) # html 코드 제거
    #fixed_text = del_space(fixed_text) # 공백 제거
    fixed_text = remove_apostrophe(fixed_text) # apostrophe 제거
    fixed_text = extract_word(fixed_text) # 숫자와 영어 알파벳을 제외한 문자를 공백으로 대체
    fixed_text = fix_lower(fixed_text) # 소문자로 변환
    tokens = fix_tokenize(fixed_text) # 토큰화
    fixed_text = join_tokens(tokens) # 공백 수정 및 토큰 연결
    return fixed_text.split()

#### 'condition' 전처리 통합 함수 적용

In [266]:
drug_data['fixed_condition'] = drug_data['condition'].apply(preprocess_condition)

In [267]:
drug_data.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,fixed_review,fixed_condition
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,"[[-0.16843, -0.037651, -0.17304, -0.069757, -0...","[left, ventricular, dysfunction]"
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,"[[-0.13661, -0.21326, 0.20525, -0.54117, -0.13...",[adhd]
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,"[[-0.10515, 0.13407, 0.13839, -0.647, 0.1243, ...","[birth, control]"
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,"[[-0.15561, 0.50069, -0.23022, -0.33005, -0.23...","[birth, control]"
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,"[[-0.053251, 0.1133, -0.031874, -0.23255, -0.2...","[opiate, dependence]"


#### 'condition' 컬럼 Stemming 적용

In [268]:
drug_data['fixed_condition'] = drug_data['fixed_condition'].apply(porter_stemming)

In [269]:
print("stemming result length : ", len(set(word for review in drug_data['fixed_condition'] for word in review)))

stemming result length :  975


#### 'condition' 컬럼 Padding 적용

In [270]:
drug_data['fixed_condition'] = convert_padding(drug_data, 'fixed_condition')

In [271]:
drug_data.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,fixed_review,fixed_condition
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,"[[-0.16843, -0.037651, -0.17304, -0.069757, -0...","[311, 3662, 661, 0, 0, 0, 0, 0]"
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,"[[-0.13661, -0.21326, 0.20525, -0.54117, -0.13...","[249, 0, 0, 0, 0, 0, 0, 0]"
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,"[[-0.10515, 0.13407, 0.13839, -0.647, 0.1243, ...","[14, 12, 0, 0, 0, 0, 0, 0]"
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,"[[-0.15561, 0.50069, -0.23022, -0.33005, -0.23...","[14, 12, 0, 0, 0, 0, 0, 0]"
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,"[[-0.053251, 0.1133, -0.031874, -0.23255, -0.2...","[492, 415, 0, 0, 0, 0, 0, 0]"


#### 'condition' 컬럼 Embedding 적용

In [272]:
drug_data['fixed_condition'] = convert_embedding(drug_data, 'fixed_condition')

In [273]:
drug_data.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,fixed_review,fixed_condition
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,"[[-0.16843, -0.037651, -0.17304, -0.069757, -0...","[[0.038089, -0.02724, -0.063579, -0.46381, 0.1..."
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,"[[-0.13661, -0.21326, 0.20525, -0.54117, -0.13...","[[-0.22986, 0.2479, 0.035986, 0.78691, 0.64195..."
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,"[[-0.10515, 0.13407, 0.13839, -0.647, 0.1243, ...","[[-0.11292, -0.33504, -0.24223, 0.2406, -0.285..."
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,"[[-0.15561, 0.50069, -0.23022, -0.33005, -0.23...","[[-0.11292, -0.33504, -0.24223, 0.2406, -0.285..."
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,"[[-0.053251, 0.1133, -0.031874, -0.23255, -0.2...","[[-0.077826, 0.30547, 0.52047, -0.24198, 0.060..."


### 'fixed_condition', 'fixed_review' 컬럼이 모두 비어 있는 행 제거

#### fixed_condition'와 'fixed_review' 컬럼이 모두 비어 있는 리스트 행 개수

In [274]:
empty_both_rows = (drug_data['fixed_condition'].apply(len) == 0) & (drug_data['fixed_review'].apply(len) == 0)
empty_both_count = empty_both_rows.sum()

print("Counts of 'fixed_condition', 'fixed_review' is Empty :", empty_both_count)

Counts of 'fixed_condition', 'fixed_review' is Empty : 34


#### fixed_condition'와 'fixed_review' 컬럼이 모두 비어 있는 리스트 행 제거

In [275]:
print("Before Delete Dataset Size :", drug_data.shape)

# 'fixed_condition'와 'fixed_review' 컬럼이 모두 비어 있는 행 제거
drug_data = drug_data[~((drug_data['fixed_condition'].apply(len) == 0) & (drug_data['fixed_review'].apply(len) == 0))]

# 제거 후 데이터셋 정보 확인
print("After Delete Dataset Size  :", drug_data.shape)

Before Delete Dataset Size : (160398, 9)
After Delete Dataset Size  : (160364, 9)


#### 'condition', 'drugName' 이상치 처리

In [276]:
drug_data = drug_data[~drug_data['condition'].str.match('\d{2}-[a-zA-Z]{3}-\d{2}', na=False)]
drug_data = drug_data[~drug_data['drugName'].str.match('\d+', na=False)]

  drug_data = drug_data[~drug_data['condition'].str.match('\d{2}-[a-zA-Z]{3}-\d{2}', na=False)]
  drug_data = drug_data[~drug_data['drugName'].str.match('\d+', na=False)]


### 'drugName' 인코딩

#### 'drugName' Label Encoding

In [289]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

drug_data['encoded_drugName'] = label_encoder.fit_transform(drug_data['drugName'])

print(drug_data[['drugName', 'encoded_drugName']])

                        drugName  encoded_drugName
0                      Valsartan              3197
1                     Guanfacine              1428
2                         Lybrel              1850
3                     Ortho Evra              2279
4       Buprenorphine / naloxone               515
...                          ...               ...
161292                   Campral               554
161293            Metoclopramide              1970
161294                   Orencia              2273
161295        Thyroid desiccated              3022
161296              Lubiprostone              1836

[160364 rows x 2 columns]


#### 'drugName' Label Encoding MAX number

In [295]:
print("MAX Number :", drug_data['encoded_drugName'].max())

MAX Number : 3429


In [290]:
drug_data.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,fixed_review,fixed_condition,sentiment,fixed_date,encoded_drugName
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,"[[-0.16843, -0.037651, -0.17304, -0.069757, -0...","[[0.038089, -0.02724, -0.063579, -0.46381, 0.1...",1,2012-05-20,3197
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,"[[-0.13661, -0.21326, 0.20525, -0.54117, -0.13...","[[-0.22986, 0.2479, 0.035986, 0.78691, 0.64195...",1,2010-04-27,1428
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,"[[-0.10515, 0.13407, 0.13839, -0.647, 0.1243, ...","[[-0.11292, -0.33504, -0.24223, 0.2406, -0.285...",2,2009-12-14,1850
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,"[[-0.15561, 0.50069, -0.23022, -0.33005, -0.23...","[[-0.11292, -0.33504, -0.24223, 0.2406, -0.285...",1,2015-11-03,2279
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,"[[-0.053251, 0.1133, -0.031874, -0.23255, -0.2...","[[-0.077826, 0.30547, 0.52047, -0.24198, 0.060...",1,2016-11-27,515


### 파생변수 'sentiment' 생성

#### rating(평점)이 8 이상이면 1(긍정), 5이상 7이하이면 2(보통), 1이상 4이하이면 3(부정)
// describe()를 근거로 하지만 75% 가 10이기 때문에 위배될 가능성

In [277]:
drug_data['sentiment'] = drug_data['rating'].apply(lambda x: 1 if x >= 8 else (2 if x >= 5 else 3))

In [278]:
drug_data.head(10)

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,fixed_review,fixed_condition,sentiment
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,"[[-0.16843, -0.037651, -0.17304, -0.069757, -0...","[[0.038089, -0.02724, -0.063579, -0.46381, 0.1...",1
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,"[[-0.13661, -0.21326, 0.20525, -0.54117, -0.13...","[[-0.22986, 0.2479, 0.035986, 0.78691, 0.64195...",1
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,"[[-0.10515, 0.13407, 0.13839, -0.647, 0.1243, ...","[[-0.11292, -0.33504, -0.24223, 0.2406, -0.285...",2
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,"[[-0.15561, 0.50069, -0.23022, -0.33005, -0.23...","[[-0.11292, -0.33504, -0.24223, 0.2406, -0.285...",1
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,"[[-0.053251, 0.1133, -0.031874, -0.23255, -0.2...","[[-0.077826, 0.30547, 0.52047, -0.24198, 0.060...",1
5,155963,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2,28-Nov-15,43,"[[-0.24501, -0.12491, -0.2536, 0.33819, -0.105...","[[-0.641, -0.80264, 0.39142, -0.59499, 0.34665...",3
6,165907,Levonorgestrel,Emergency Contraception,"""He pulled out, but he cummed a bit in me. I t...",1,7-Mar-17,5,"[[0.30355, -0.059372, -0.2152, -0.12084, -0.09...",[],3
7,102654,Aripiprazole,Bipolar Disorde,"""Abilify changed my life. There is hope. I was...",10,14-Mar-15,32,"[[0.017519, -0.26919, 0.30792, 0.079229, 0.110...","[[-0.56538, -0.016191, 0.16879, -0.10513, 0.58...",1
8,74811,Keppra,Epilepsy,""" I Ve had nothing but problems with the Kepp...",1,9-Aug-16,11,"[[0.015157, -0.37755, -0.40063, 0.081775, -0.1...",[],3
9,48928,Ethinyl estradiol / levonorgestrel,Birth Control,"""I had been on the pill for many years. When m...",8,8-Dec-16,1,"[[-0.59305, 0.62782, 0.21467, 0.67512, 0.63928...","[[-0.11292, -0.33504, -0.24223, 0.2406, -0.285...",1


### 'fixed_date' 생성

#### 'date' 컬럼 'datetime' 형식으로 변환

In [279]:
drug_data['fixed_date'] = pd.to_datetime(drug_data['date'])

  drug_data['fixed_date'] = pd.to_datetime(drug_data['date'])


In [280]:
drug_data.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,fixed_review,fixed_condition,sentiment,fixed_date
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,"[[-0.16843, -0.037651, -0.17304, -0.069757, -0...","[[0.038089, -0.02724, -0.063579, -0.46381, 0.1...",1,2012-05-20
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,"[[-0.13661, -0.21326, 0.20525, -0.54117, -0.13...","[[-0.22986, 0.2479, 0.035986, 0.78691, 0.64195...",1,2010-04-27
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,"[[-0.10515, 0.13407, 0.13839, -0.647, 0.1243, ...","[[-0.11292, -0.33504, -0.24223, 0.2406, -0.285...",2,2009-12-14
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,"[[-0.15561, 0.50069, -0.23022, -0.33005, -0.23...","[[-0.11292, -0.33504, -0.24223, 0.2406, -0.285...",1,2015-11-03
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,"[[-0.053251, 0.1133, -0.031874, -0.23255, -0.2...","[[-0.077826, 0.30547, 0.52047, -0.24198, 0.060...",1,2016-11-27


## 모델링에 불필요한 컬럼 제거 후 새로운 데이터 프레임 생성

In [296]:
drug_data.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,fixed_review,fixed_condition,sentiment,fixed_date,encoded_drugName
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,"[[-0.16843, -0.037651, -0.17304, -0.069757, -0...","[[0.038089, -0.02724, -0.063579, -0.46381, 0.1...",1,2012-05-20,3197
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,"[[-0.13661, -0.21326, 0.20525, -0.54117, -0.13...","[[-0.22986, 0.2479, 0.035986, 0.78691, 0.64195...",1,2010-04-27,1428
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,"[[-0.10515, 0.13407, 0.13839, -0.647, 0.1243, ...","[[-0.11292, -0.33504, -0.24223, 0.2406, -0.285...",2,2009-12-14,1850
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,"[[-0.15561, 0.50069, -0.23022, -0.33005, -0.23...","[[-0.11292, -0.33504, -0.24223, 0.2406, -0.285...",1,2015-11-03,2279
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,"[[-0.053251, 0.1133, -0.031874, -0.23255, -0.2...","[[-0.077826, 0.30547, 0.52047, -0.24198, 0.060...",1,2016-11-27,515


### 전처리가 끝난 데이터셋 백업

In [297]:
drug_data_after_preprocess = drug_data

In [299]:
drug_data_after_preprocess.to_csv('drug_data_after_preprocess.csv', index=True)

KeyboardInterrupt: 

### 모델링을 위한 불필요한 컬럼 제거

In [None]:
drug_data_for_modeling = drug_data.drop(columns=['uniqueID', 'drugName', 'conditioni', 'review', 'rating', 'date', 'usefulCount', 'sentiment', 'fixed_date'])

### 모델링을 위한 불필요한 컬럼이 제거된 데이터셋 백업

In [None]:
drug_data_for_modeling.to_csv('drug_data_for_modeling.csv', index=True)