<a href="https://colab.research.google.com/github/B21-CAP0133/verify-android-app/blob/ML/ML-dir/03_VERIFY_Tuning_Hyperparameter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TF-IDF

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

In [None]:
# Read the dataset CSV; the path is resolved relative to the notebook's
# working directory.
DATASET_PATH = 'verify_dataset_clean.csv'
df = pd.read_csv(DATASET_PATH)

# Display five randomly chosen rows as a quick sanity check. The sample is
# unseeded, so different rows are shown on each run.
df.sample(n=5)

Unnamed: 0,Judul,Label
1530,pt dirgantara indonesia tak jual pihak asing,0
761,banserpdip damai,0
2230,modus habis ustadz suntik covid19 mati,1
209,foto pdtdrirniko njotorahardjo dukung paslon p...,1
687,kpk bantah terima tiket gratis asi games erick...,0


In [None]:
# Model input is the 'Judul' text column; the target is the 'Label' column.
x, y = df['Judul'], df['Label']

In [None]:
# Two-step pipeline: TF-IDF features feeding a logistic regression left at
# its default settings.
tvec_pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression()),
])

# Only the vectorizer is searched here. The `tfidf__` prefix routes each
# parameter to the 'tfidf' pipeline step.
# Grid size: 3 * 3 * 4 * 2 * 2 = 144 candidates.
tvec_params = dict(
    tfidf__ngram_range=[(1, 1), (1, 2), (2, 2)],
    tfidf__max_df=[0.5, 0.75, 1.0],
    tfidf__max_features=[None, 5000, 10000, 20000],
    tfidf__norm=['l1', 'l2'],
    tfidf__use_idf=[True, False],
)

# Exhaustive 5-fold cross-validated search using every available core.
# No `scoring` is passed, so the pipeline's own `score` method is used.
tvec_gs = GridSearchCV(
    estimator=tvec_pipe,
    param_grid=tvec_params,
    cv=5,
    verbose=1,
    n_jobs=-1,
)

In [None]:
# Run the TF-IDF grid search.
# NOTE(review): the search is fitted on all of x / y, while the train/test
# split further down is drawn from the same x / y. The later test rows are
# therefore already seen during this hyperparameter selection -- consider
# splitting first and searching on the training portion only.
tvec_gs.fit(x, y)

# Report the winning vectorizer configuration.
best_tfidf_params = tvec_gs.best_params_
print(best_tfidf_params)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 326 tasks      | elapsed:   19.8s


{'tfidf__max_df': 0.5, 'tfidf__max_features': None, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__use_idf': True}


[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:   42.8s finished


In [None]:
# Mean cross-validated score of the best TF-IDF candidate. No `scoring` was
# passed to GridSearchCV, so this is the pipeline's default `score` metric.
tvec_gs.best_score_

0.7704677032319069

## Mengaplikasikan hyperparameter TF-IDF terbaik

In [None]:
# Stand-alone vectorizer with the values hard-coded to match the
# `best_params_` printed by the TF-IDF grid search above.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),   # unigrams + bigrams
    max_df=0.5,           # ignore terms found in more than half of the documents
    max_features=None,    # keep the full vocabulary
    norm='l2',
    use_idf=True,
)

In [None]:
# Hold out 10% of the rows as a test set.
# - random_state pins the shuffle so the same rows land in train/test on every
#   "Restart & Run All"; without it every downstream score changes per run.
# - stratify=y keeps the label proportions of the full dataset in both parts,
#   so the small test set is not skewed towards one class by chance.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

In [None]:
# Fit the vectorizer on the training split only, then reuse that fitted
# vectorizer to transform the test split (no refitting on test data).
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)

In [None]:
# Sanity check: the number of test documents should equal the row count of the
# transformed test matrix; the second dimension is the feature (vocabulary) size.
print(len(x_test))
x_test_tfidf.shape

237


(237, 15553)

# Logistic Regression

In [None]:
# Inverse regularisation strengths to try: 10 values, log-spaced in [1e-1, 1e4].
C_GRID = np.logspace(-1, 4, 10)

# Iteration budget shared by every candidate. This is a convergence limit, not
# a model hyperparameter, so it is fixed at the larger of the two values that
# were previously searched instead of doubling the grid. It is kept in the grid
# as a single value so `best_params_` still carries a 'max_iter' entry.
MAX_ITER = [10000]

# random_state makes the solvers that shuffle the data (liblinear, sag, saga)
# reproducible between runs.
model = LogisticRegression(random_state=42)

# Only valid penalty/solver pairs are listed: 'sag' and 'lbfgs' do not support
# the l1 penalty, so a plain cross-product would spend fits on candidates that
# can only fail. A list of grids lets each penalty name its own solvers.
# Candidates: 10 * 2 (l1) + 10 * 4 (l2) = 60, instead of 160.
model_params = [
    {
        'penalty': ['l1'],
        'solver': ['liblinear', 'saga'],
        'C': C_GRID,
        'max_iter': MAX_ITER,
    },
    {
        'penalty': ['l2'],
        'solver': ['liblinear', 'saga', 'sag', 'lbfgs'],
        'C': C_GRID,
        'max_iter': MAX_ITER,
    },
]

# Exhaustive 5-fold cross-validated search using every available core.
logreg_gs = GridSearchCV(model, model_params, cv=5, verbose=1, n_jobs=-1)

In [None]:
# Run the logistic-regression grid search on the TF-IDF training features.
logreg_gs.fit(x_train_tfidf, y_train)

# Report the winning classifier configuration.
best_logreg_params = logreg_gs.best_params_
print(best_logreg_params)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 265 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done 436 tasks      | elapsed: 111.0min
[Parallel(n_jobs=-1)]: Done 686 tasks      | elapsed: 180.6min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed: 204.1min finished


{'C': 774.2636826811278, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}


In [None]:
# Mean cross-validated score of the best logistic-regression candidate. This is
# measured on folds of the training split, not on the held-out test set.
logreg_gs.best_score_

0.7792267329466998