In [1]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

sample_text = "One of the first things that we ask ourselves is what are the pros and cons of any task we perform."
text_tokens = word_tokenize(sample_text)

tokens_without_sw = [word for word in text_tokens if not word in stopwords.words('english')]

print("불용어 제거 미적용:", text_tokens, '\n')
print("불용어 제거 적용:", tokens_without_sw)

불용어 제거 미적용: ['One', 'of', 'the', 'first', 'things', 'that', 'we', 'ask', 'ourselves', 'is', 'what', 'are', 'the', 'pros', 'and', 'cons', 'of', 'any', 'task', 'we', 'perform', '.'] 

불용어 제거 적용: ['One', 'first', 'things', 'ask', 'pros', 'cons', 'task', 'perform', '.']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\516-29\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\516-29\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

print(stemmer.stem('obesses'), stemmer.stem('obssesed'))
print(stemmer.stem('standardizes'), stemmer.stem('standardization'))
print(stemmer.stem('national'), stemmer.stem('nation'))
print(stemmer.stem('absentness'), stemmer.stem('absently'))
print(stemmer.stem('tribalical'), stemmer.stem('tribalicalized'))

obess obsses
standard standard
nation nation
absent absent
tribal tribalic


In [3]:
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()

print(stemmer.stem('obsesses'), stemmer.stem('obsessed'))
print(stemmer.stem('standardizes'), stemmer.stem('standardization'))
print(stemmer.stem('national'), stemmer.stem('nation'))
print(stemmer.stem('absentness'), stemmer.stem('absently'))
print(stemmer.stem('tribalical'), stemmer.stem('tribalicalized')) 

obsess obsess
standard standard
nat nat
abs abs
trib trib


In [4]:
import nltk
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer 
lemma = WordNetLemmatizer()

print(stemmer.stem('obsesses'), stemmer.stem('obsessed'))
print(lemma.lemmatize('standardizes'), lemma.lemmatize('standardization'))
print(lemma.lemmatize('national'), lemma.lemmatize('nation'))
print(lemma.lemmatize('absentness'), lemma.lemmatize('absently'))
print(lemma.lemmatize('tribalical'), lemma.lemmatize('tribalicalized'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\516-29\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


obsess obsess
standardizes standardization
national nation
absentness absently
tribalical tribalicalized


In [5]:
print(lemma.lemmatize('obsesses','v'), lemma.lemmatize('obsessed','a'))
print(lemma.lemmatize('standardizes','v'), lemma.lemmatize('standardization','n'))
print(lemma.lemmatize('national','a'), lemma.lemmatize('nation','n'))
print(lemma.lemmatize('absentness','n'), lemma.lemmatize('absently','r'))
print(lemma.lemmatize('tribalical','a'), lemma.lemmatize('tribalicalized','v'))

obsess obsessed
standardize standardization
national nation
absentness absently
tribalical tribalicalized


In [6]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
df = pd.read_csv('pytorch/chap09/data/diabetes.csv')
X = df[df.columns[:-1]] 
y = df['Outcome'] 

X = X.values
y = torch.tensor(y.values)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [9]:
ms = MinMaxScaler() 
ss = StandardScaler()

X_train = ss.fit_transform(X_train)
X_test = ss.fit_transform(X_test)
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
y_train = ms.fit_transform(y_train)
y_test = ms.fit_transform(y_test)

In [10]:
class customdataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.len = len(self.X)
    def __getitem__(self, index):
        return self.X[index], self.y[index]
    def __len__(self):
        return self.len

In [11]:
train_data = customdataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
test_data = customdataset(torch.FloatTensor(X_test), torch.FloatTensor(y_test))

train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=False)

In [12]:
class binaryClassification(nn.Module):
    def __init__(self):
        super(binaryClassification, self).__init__()
        self.layer_1 = nn.Linear(8, 64, bias=True) 
        self.layer_2 = nn.Linear(64, 64, bias=True)
        self.layer_out = nn.Linear(64, 1, bias=True)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)

    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        return x

In [14]:
epochs = 1000+1
print_epoch = 100
LEARNING_RATE = 1e-2

model = binaryClassification()
model.to(device)
print(model)
BCE = nn.BCEWithLogitsLoss() 
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

binaryClassification(
  (layer_1): Linear(in_features=8, out_features=64, bias=True)
  (layer_2): Linear(in_features=64, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [15]:
def accuracy(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))
    correct_results_sum = (y_pred_tag == y_test).sum().float() 
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100) 
    return acc

In [16]:
import torch

model.to(device)

for epoch in range(epochs):
    iteration_loss = 0. 
    iteration_accuracy = 0.

    model.train() 
    for i, data in enumerate(train_loader): 
        X, y = data
        X, y = X.to(device), y.to(device)
        y_pred = model(X.float()) 
        loss = BCE(y_pred, y.reshape(-1,1).float()) 

        iteration_loss += loss
        iteration_accuracy += accuracy(y_pred, y) 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if(epoch % print_epoch == 0):
        print('Train: epoch: {0} - loss: {1:.5f}; acc: {2:.3f}'.format(epoch, iteration_loss/(i+1), iteration_accuracy/(i+1)))

    iteration_loss = 0.
    iteration_accuracy = 0.
    model.eval() 
    for i, data in enumerate(test_loader):
        X, y = data
        X, y = X.to(device), y.to(device)
        y_pred = model(X.float())
        loss = BCE(y_pred, y.reshape(-1,1).float())
        iteration_loss += loss
        iteration_accuracy += accuracy(y_pred, y)
    if(epoch % print_epoch == 0):
        print('Test: epoch: {0} - loss: {1:.5f}; acc: {2:.3f}'.format(epoch,
              iteration_loss/(i+1), iteration_accuracy/(i+1)))

Train: epoch: 0 - loss: 0.70530; acc: 51.889
Test: epoch: 0 - loss: 0.68490; acc: 59.000
Train: epoch: 100 - loss: 0.39776; acc: 83.000
Test: epoch: 100 - loss: 0.45322; acc: 79.250
Train: epoch: 200 - loss: 0.43270; acc: 79.778
Test: epoch: 200 - loss: 0.45932; acc: 78.750
Train: epoch: 300 - loss: 0.35074; acc: 85.333
Test: epoch: 300 - loss: 0.49598; acc: 74.000
Train: epoch: 400 - loss: 0.41834; acc: 80.000
Test: epoch: 400 - loss: 0.47651; acc: 77.750
Train: epoch: 500 - loss: 0.50862; acc: 76.333
Test: epoch: 500 - loss: 0.52593; acc: 75.000
Train: epoch: 600 - loss: 0.36181; acc: 84.333
Test: epoch: 600 - loss: 0.53759; acc: 73.500
Train: epoch: 700 - loss: 0.41056; acc: 79.667
Test: epoch: 700 - loss: 0.50311; acc: 78.500
Train: epoch: 800 - loss: 0.51765; acc: 71.667
Test: epoch: 800 - loss: 0.44979; acc: 77.250
Train: epoch: 900 - loss: 0.41354; acc: 83.222
Test: epoch: 900 - loss: 0.43214; acc: 77.750
Train: epoch: 1000 - loss: 0.41983; acc: 80.222
Test: epoch: 1000 - loss: 