In [1]:
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import random

In [2]:
df = pd.read_csv('train.csv')

In [3]:
from sklearn.model_selection import train_test_split

not_features = ['id', 'sentimen_berita']
date_cols = ['tanggal']
cat_cols = [col for col in df.columns if df[col].nunique() < 52]
text_cols = [col for col in df.columns if col not in not_features + date_cols + cat_cols]

high_card = [col for col in cat_cols if df[col].nunique() > 5]
low_card = [col for col in cat_cols if df[col].nunique() <= 5 and col not in not_features]

X_train = df[cat_cols + text_cols]
y_train = df['sentimen_berita'].map({'Negatif':0, 'Netral':1, 'Positif':2})

In [4]:
print(low_card)

['sumber', 'sentimen_kutipan']


In [5]:
X_train.shape

(52889, 16)

In [6]:
#make model
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn import set_config
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler


set_config(display='diagram')

oh_encoder = OneHotEncoder()
ordinal_encoder = OrdinalEncoder()

high_card_preprocessor = Pipeline(steps=[
        ("ordinal_encoder", ordinal_encoder),])

low_card_preprocessor = Pipeline(steps=[
        ("oh_encoder", oh_encoder),])

preprocessor = ColumnTransformer([
        ("high_cardinality", high_card_preprocessor, high_card),
        ("low_cardinality", low_card_preprocessor, low_card),])

xgb_params = {
    'tree_method' : 'hist',
    'learning_rate': 0.065733165999498,
    'n_estimators': 500,
    'max_depth': 8,
    'lambda': 0.008103395718032142,
    'alpha': 0.37923490995294745,
    'colsample_bytree': 0.5,
    'min_child_weight': 19,
    'subsample': 1.0,
    'random_state' : 420,
    'use_label_encoder':False}

xgb = XGBClassifier(**xgb_params)

xgb_steps = [('preprocess', preprocessor),
             ('regressor', xgb),]

xgb_pipeline = Pipeline(xgb_steps)
xgb_pipeline



In [7]:
xgb_pipeline.fit(X_train, y_train)

In [8]:
print(text_cols)

['katakunci', 'judul_berita', 'konten_berita', 'nama_tokoh', 'jabatan', 'organisasi', 'lokasi', 'alias', 'kutipan']


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion
# from mlxtend.feature_selection import ColumnSelector

sgd_clf = SGDClassifier(
    loss='modified_huber',
    penalty='l1',
    alpha=1e-4,
    max_iter=45000,
    tol=6e-6,
    shuffle=True,
    verbose=False,
    random_state=420,
    learning_rate='adaptive',
    eta0=0.04,
    n_iter_no_change=237,)

# tfidf_1 = TfidfVectorizer()
# tfidf_2 = TfidfVectorizer()
tfidf_3 = TfidfVectorizer()

# nlp_1 = ColumnTransformer([
#     ("tfidf", tfidf_1, "katakunci"),
# ])

# nlp_2 = ColumnTransformer([
#     ("tfidf", tfidf_2, "judul_berita"),
# ])

nlp_3 = ColumnTransformer([
    ("tfidf", tfidf_3, "konten_berita"),
])

nlp_preprocessor = FeatureUnion([
#     ('katakunci', nlp_1),
#     ('judul_berita', nlp_2),
    ('konten_berita', nlp_3),
])

sgd_steps = ([
    ('preprocess', nlp_preprocessor),
    ('regressor', sgd_clf),
])

sgd_pipeline = Pipeline(sgd_steps)

sgd_pipeline

In [10]:
from sklearn.neural_network import MLPClassifier
from category_encoders import TargetEncoder

target_encoder = TargetEncoder()

high_card_mlp_prep = Pipeline(steps=[
        ("target_encoder", target_encoder)])

low_card_mlp_prep = Pipeline(steps=[
        ("oh_encoder", oh_encoder),])

mlp_preprocessor = ColumnTransformer([
        ("high_cardinality", high_card_mlp_prep, high_card),
        ("low_cardinality", low_card_mlp_prep, low_card),])

mlp = MLPClassifier(
    hidden_layer_sizes = (256,2),
    activation = 'relu',
    solver = 'adam',
    alpha = 0.0002,
    batch_size = 'auto',
    learning_rate = 'adaptive',
    learning_rate_init = 0.001,
    max_iter = 2000,
    shuffle = True,
    random_state = 420,
    tol = 1e-8,
    verbose = False,)

mlpr_steps = [('preprocess', mlp_preprocessor),
             ('regressor', mlp),]
mlp_pipeline = Pipeline(mlpr_steps)

mlp_pipeline



In [11]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

l2_model = LogisticRegression()

l1_models = [
    ("xgb", xgb_pipeline),
    ("mlp", mlp_pipeline),
    ('sgd', sgd_pipeline),
    #('lgbm', lgbm_pipeline),
]

stacking_clf = StackingClassifier(estimators=l1_models,
                              final_estimator=l2_model,
                              cv=5,
                              stack_method='predict',
                              n_jobs=-1,
                              passthrough=False,
                              verbose=1)
stacking_clf

In [12]:
stacking_clf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   12.3s remaining:   18.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   12.7s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   30.1s remaining:   45.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   30.6s finished


In [13]:
from sklearn.metrics import classification_report
val_preds = stacking_clf.predict(X_train)

print(classification_report(y_train, val_preds))

              precision    recall  f1-score   support

           0       0.63      0.53      0.57      8240
           1       0.59      0.46      0.52     16982
           2       0.75      0.89      0.81     27667

    accuracy                           0.69     52889
   macro avg       0.65      0.62      0.63     52889
weighted avg       0.68      0.69      0.68     52889



In [14]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,sumber,kodekat,kodesubkat,kategori,subkategori,katakunci,tanggal,judul_berita,konten_berita,nama_tokoh,jabatan,organisasi,lokasi,alias,kutipan,sentimen_kutipan
0,b67b9e3d9b8,Detik,A,A2,"Pertanian, Kehutanan, dan Perikanan",Kehutanan dan Penebangan Kayu,bambu,2021-02-16,"Viral Pemasungan ODGJ di Bandung Barat, Begini...",Bandung Barat - Jagat dunia maya dihebohkan de...,"['Nanang', 'Bunyamin', 'Eli', 'Lihat']",['Kepala Desa Sindangkerta'],[],"['Bandung Barat', 'Desa Sindangkerta', 'Kecama...",['rumah sakit'],"['""Dari usia 19 tahun jadi sepertu ini (ganggu...",Negatif
1,b687d3fba37,Detik,A,A1.a,"Pertanian, Kehutanan, dan Perikanan","Pertanian, Peternakan, Perburuan dan Jasa Pert...",Produksi Kedelai,2021-05-21,"7 Makanan untuk Penderita Anemia, yang Harus D...",Jakarta -\n\nPenderita anemia perlu mengasup m...,[],[],[],['Jakarta'],[],[],Netral
2,b68e2851dd8,Antara,A,A3,"Pertanian, Kehutanan, dan Perikanan",Perikanan,hasil tangkap nelayan,2021-05-20,Kapal Patroli Polri gagalkan pengeboman ikan d...,Keempat pelaku mengaku akan melakukan pengebom...,[],"['anggota KP', 'Kepala Sub Bidang Penegakan Hu...","['Mabes Polri', 'Direktorat Polair Polda Papua...","['Provinsi Papua Barat', 'Papua Barat', 'Kabup...",[],['mengatakan bahwa saat melakukan patroli di p...,Negatif
3,b6914e3beeb,Detik,A,A1.c,"Pertanian, Kehutanan, dan Perikanan","Pertanian, Peternakan, Perburuan dan Jasa Pert...",Produksi Tebu,2021-01-25,"Rapat Bareng DPR, Mentan Dicecar soal Kedelai ...",Jakarta -\n\nKomisi IV DPR RI hari ini menggel...,['Sudin'],"['Menteri Pertanian (Mentan)', 'Ketua Komisi I...","['Kementerian Pertanian (Kementan)', 'Kementer...","['Jakarta', 'Jakarta Pusat', 'Australia']","['Impor', 'impor']","['""Rapat ini kami buka dan dinyatakan terbuka ...",Negatif
4,b6916aab106,Antara,Q,Q,Jasa Kesehatan dan Kegiatan Sosial Swasta,Jasa Kesehatan dan Kegiatan Sosial Swasta,wabah penyakit,2021-01-21,Berkas kasus RS UMMI diserahkan tahap I ke Kej...,Berkas perkara RS Ummi sudah tahap satu kemari...,"['Agung', 'Andi Rian Djajadi', 'Rizieq Shihab'...",['Direktur Tindak Pidana Umum Bareskrim Polri ...,"['Direktorat Tindak Pidana Umum', 'Bareskrim P...","['Berkas', 'Jakarta', 'Bogor', 'Jawa Barat']",[],"['""Berkas perkara RS Ummi sudah tahap satu kem...",Negatif


In [15]:
X_test = test[high_card + low_card + text_cols]

In [16]:
y_preds = stacking_clf.predict(X_test)

In [17]:
submission = pd.read_csv('sample_submission.csv')

In [18]:
submission.head()

Unnamed: 0,id,sentimen_berita
0,b67b9e3d9b8,Netral
1,b687d3fba37,Netral
2,b68e2851dd8,Netral
3,b6914e3beeb,Netral
4,b6916aab106,Netral


In [19]:
submission['sentimen_berita'] = y_preds
submission['sentimen_berita'] = submission['sentimen_berita'].map({0:'Negatif', 1:'Netral', 2:'Positif'})

In [20]:
submission.head()

Unnamed: 0,id,sentimen_berita
0,b67b9e3d9b8,Negatif
1,b687d3fba37,Negatif
2,b68e2851dd8,Negatif
3,b6914e3beeb,Negatif
4,b6916aab106,Negatif


In [21]:
# submission.to_csv('submission_dabes1.csv', index=False)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 12.3min remaining: 18.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 12.8min finished
