In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import SelfTrainingClassifier

In [2]:
# Manually labelled texts; this file is semicolon-delimited.
df_labeled = pd.read_csv(filepath_or_buffer="label_manual.csv", sep=";")

# Texts without a label yet (read with the default comma delimiter).
df_unlabeled = pd.read_csv(filepath_or_buffer="unlabeled_data.csv")

In [3]:
# Preview the first five manually labelled rows.
df_labeled.head(n=5)

Unnamed: 0,normalisasi,label
0,luar biasa,positive
1,aplikasi nya bagus bisa serba guna tapi buat u...,neutral
2,good,positive
3,sangat membantu,positive
4,karena tidak sesuai dengan gambar,negative


In [4]:
# Preview the first five unlabelled rows.
df_unlabeled.head(n=5)

Unnamed: 0,normalisasi
0,bagus banget
1,keren
2,mantap
3,namun
4,bagus bet


In [5]:

# Mark every unlabelled row with -1; the labels array built below is what
# gets passed to SelfTrainingClassifier.fit, which (per the scikit-learn
# docs) treats -1 as "no label yet".
df_unlabeled["label"] = -1

# Stack the labelled rows first, then the unlabelled ones, under a fresh
# 0..n-1 index so row positions line up with the feature matrix later on.
df_all = pd.concat([df_labeled, df_unlabeled], ignore_index=True)

# Texts and labels.
# Missing texts are replaced with "" before the string cast: a bare
# astype(str) would turn NaN into the literal word "nan", which the TF-IDF
# vectorizer would then count as a genuine token shared by all empty rows.
texts = df_all["normalisasi"].fillna("").astype(str).values
labels = df_all["label"].values

In [6]:
# TF-IDF over unigrams and bigrams, capped at a 5000-term vocabulary.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)

# Learn the vocabulary from all texts (labelled and unlabelled) and
# vectorise them in one pass.
X = vectorizer.fit_transform(raw_documents=texts)

In [17]:
# Base learner: logistic regression with balanced class weights and a fixed
# random seed.
base_model = LogisticRegression(
    random_state=42,
    class_weight="balanced",
    max_iter=1000,
)

# Self-training wrapper around the base learner. threshold=0.6 and
# max_iter=50 are passed straight through; verbose=True prints the number of
# labels added at the end of each iteration.
self_training_model = SelfTrainingClassifier(
    base_model,
    verbose=True,
    max_iter=50,
    threshold=0.6,
)

# Fit on every row; `labels` holds -1 for the rows that came from
# df_unlabeled.
self_training_model.fit(X, labels)

End of iteration 1, added 231 new labels.
End of iteration 2, added 115 new labels.
End of iteration 3, added 79 new labels.
End of iteration 4, added 59 new labels.
End of iteration 5, added 53 new labels.
End of iteration 6, added 34 new labels.
End of iteration 7, added 26 new labels.
End of iteration 8, added 17 new labels.
End of iteration 9, added 22 new labels.
End of iteration 10, added 14 new labels.
End of iteration 11, added 7 new labels.
End of iteration 12, added 5 new labels.
End of iteration 13, added 3 new labels.
End of iteration 14, added 3 new labels.
End of iteration 15, added 1 new labels.
End of iteration 16, added 1 new labels.


In [18]:
# Labels held by the fitted self-training model, one per row of X.
# NOTE(review): rows that never received a pseudo-label presumably keep -1
# here - confirm before treating "final_label" as fully populated.
final_labels = self_training_model.transduction_

# Attach them to the combined frame as a new "final_label" column.
df_all = df_all.assign(final_label=final_labels)


In [20]:
# Class-probability estimates from the fitted model for every row of X.
probs = self_training_model.predict_proba(X)

# "confidence" is the largest probability in each row, i.e. the score of the
# model's top class - which is not necessarily the class in "final_label".
df_all = df_all.assign(confidence=probs.max(axis=1))

# Show the first five rows whose confidence is strictly above 0.5.
df_all.loc[df_all["confidence"].gt(0.5)].head(n=5)


Unnamed: 0,normalisasi,label,final_label,confidence
0,luar biasa,positive,positive,0.574747
1,aplikasi nya bagus bisa serba guna tapi buat u...,neutral,neutral,0.776761
3,sangat membantu,positive,positive,0.995171
4,karena tidak sesuai dengan gambar,negative,negative,0.7949
5,kok,neutral,neutral,0.890358


In [21]:
# Write the combined frame (without the index) to disk.
df_all.to_csv(path_or_buf="hasil_self_training.csv", index=False)