Create manual normalization feature

In [11]:
def norm_feature(data):
    n_max = max(data)
    n_min = min(data)
    len_of_data = len(data)
    for i in range(len_of_data):
        data[i] = (data[i] - n_min) / (n_max - n_min)
    return data

In [12]:
data = [
    10, 25, 15, 30, 50
]

print(norm_feature(data))

[0.0, 0.375, 0.125, 0.5, 1.0]


Normalisasi with SKLearn

In [16]:
import numpy 
from sklearn.preprocessing import MinMaxScaler

data = [ 
    [100, 0.001],
    [8, 0.05],
    [50, 0.005],
    [88, 0.07],
    [4, 0.1],
]

print(data)

print(numpy.asarray(data))

scaler = MinMaxScaler()

scaled = scaler.fit_transform(numpy.asarray(data))
print(scaled)

[[100, 0.001], [8, 0.05], [50, 0.005], [88, 0.07], [4, 0.1]]
[[1.0e+02 1.0e-03]
 [8.0e+00 5.0e-02]
 [5.0e+01 5.0e-03]
 [8.8e+01 7.0e-02]
 [4.0e+00 1.0e-01]]
[[1.         0.        ]
 [0.04166667 0.49494949]
 [0.47916667 0.04040404]
 [0.875      0.6969697 ]
 [0.         1.        ]]


Manual Standardization

In [29]:
from statistics import mean, pstdev


def std_feature(data):
    kolom_data = data.shape[1]
    baris_data = data.shape[0]

    for i in range(0, baris_data):
        for j in range(0,kolom_data):
            mean_data = mean(data[:,j])
            std_data = pstdev(data[:,j])
            data[i][j] = (data[i][j] - mean_data) / std_data 
            
    return data


In [55]:
data = [
    [100, 0.001],
    [8, 0.05],
    [50, 0.005],
    [88, 0.07],
    [4, 0.1],
]

data = numpy.asarray(data)

print(std_feature(data))

[[ 1.26398112 -1.16389967]
 [-0.65623116  0.48622176]
 [ 0.6102111   0.18921441]
 [ 1.99807462  0.23476602]
 [ 1.65186847  0.22506085]]


Standardisasi with sklearn

In [56]:
from numpy import asarray
from sklearn.preprocessing import StandardScaler

data = [
    [100, 0.001],
    [8, 0.05],
    [50, 0.005],
    [88, 0.07],
    [4, 0.1],
]

print(data)

scaler = StandardScaler()

scaled = scaler.fit_transform(data)
print(scaled)

[[100, 0.001], [8, 0.05], [50, 0.005], [88, 0.07], [4, 0.1]]
[[ 1.26398112 -1.16389967]
 [-1.06174414  0.12639634]
 [ 0.         -1.05856939]
 [ 0.96062565  0.65304778]
 [-1.16286263  1.44302493]]


Feature Extraction dari Data Kategorik

In [57]:
from sklearn.preprocessing import OrdinalEncoder

data = [
    ['POLINEMA'],
    ['PENS'], 
    ['PNJ'], 
    ['PNP'],
    ['POLBAN']
]

encoder = OrdinalEncoder()
encoded = encoder.fit_transform(data)
print(encoded)

[[4.]
 [0.]
 [1.]
 [2.]
 [3.]]


One Hot Encoder

In [58]:
from sklearn.preprocessing import OneHotEncoder

data = [
    ['POLINEMA'],
    ['PENS'],
    ['PNJ'],
    ['PNP'],
    ['POLBAN']
]

encoder = OneHotEncoder(sparse=False)
encoded = encoder.fit_transform(data)
print(encoded)

[[0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]]


Feature Extraction pada Data Text

In [5]:

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'the house had a tiny little mouse',
    'the cat saw the mouse', 
    'the mouse ran away from the house',
    'the cat finally ate the mouse', 
    'the end of the mouse story'
]

vectorizer = TfidfVectorizer(stop_words ='english')
data = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
# print(data.todense())
data.todense()

['ate' 'away' 'cat' 'end' 'finally' 'house' 'little' 'mouse' 'ran' 'saw'
 'story' 'tiny']


matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.4755751 , 0.58946308, 0.28088232, 0.        , 0.        ,
         0.        , 0.58946308],
        [0.        , 0.        , 0.58873218, 0.        , 0.        ,
         0.        , 0.        , 0.34771471, 0.        , 0.72971837,
         0.        , 0.        ],
        [0.        , 0.58946308, 0.        , 0.        , 0.        ,
         0.4755751 , 0.        , 0.28088232, 0.58946308, 0.        ,
         0.        , 0.        ],
        [0.58946308, 0.        , 0.4755751 , 0.        , 0.58946308,
         0.        , 0.        , 0.28088232, 0.        , 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.67009179, 0.        ,
         0.        , 0.        , 0.31930233, 0.        , 0.        ,
         0.67009179, 0.        ]])

In [32]:


# Import packages
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy
import pandas as pd

# Open file
file = open('orange.txt', 'r')
 
# read the file
paragraph = file.read()

# remove useless punctuation or marks
sentences = paragraph.replace(":",'')

# split the sentence to array
splitted_sentences = sentences.split('.')

# declare vectorizer with stop words in english
vectorizer = TfidfVectorizer(stop_words ='english')

# do feature extraxtion
data = vectorizer.fit_transform(splitted_sentences)

# get stemmed words
arr = numpy.asarray(vectorizer.get_feature_names_out())

# save the result to file named output.csv
pd.DataFrame(data.todense()).to_csv('output.csv', sep=',', header=arr)