In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [2]:
# Load the DataFrame stored in df_modeling.pkl (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle("df_modeling.pkl")

In [3]:
# Boolean mask that is True for every row whose label differs from 2.
mask = df["label"].ne(2)

# Keep only the rows selected by the mask.
df_no2 = df.loc[mask]

In [4]:
# Separate the target column ("label") from the remaining columns.
X_no2 = df_no2.drop(columns="label")
Y_no2 = df_no2["label"]

In [5]:
# Restrict the features to the two text columns that are tokenized below.
X_text2 = X_no2.loc[:, ["title", "content"]]

In [6]:
# Display (rows, columns) of the text-only frame as a quick sanity check.
X_text2.shape

(63178, 2)

In [7]:
import warnings

import nltk
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize

# Download the NLTK "punkt" package.
nltk.download("punkt")

# Suppress every warning from here on.
# NOTE(review): this also hides warnings raised by the models trained later
# in the notebook; consider narrowing the filter.
warnings.filterwarnings(action="ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ronny\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
def _split_and_tokenize(texts):
    """Sentence-split every text, then word-tokenize every sentence.

    Returns a pair ``(sentence_lists, token_lists)``:
    ``sentence_lists[i]`` holds the sentences of ``texts[i]`` and
    ``token_lists[i][j]`` holds the tokens of the j-th sentence of ``texts[i]``.
    """
    sentence_lists = [sent_tokenize(text) for text in texts]
    token_lists = [
        [word_tokenize(sentence) for sentence in sentences]
        for sentences in sentence_lists
    ]
    return sentence_lists, token_lists


title_sentences, tokenized_title_sentences = _split_and_tokenize(X_text2["title"])
content_sentences, tokenized_content_sentences = _split_and_tokenize(X_text2["content"])

# Flatten to one list of token lists: all title sentences first, then all content sentences.
tokenized_sentences = []
for document_sentences in tokenized_title_sentences + tokenized_content_sentences:
    tokenized_sentences.extend(document_sentences)

In [27]:
# Train word embeddings on every tokenized title/content sentence.
# min_count=1 is the lowest frequency threshold; vector_size=1000 sets the embedding width.
# NOTE(review): per the gensim docs, multi-worker training is not bit-for-bit
# reproducible; use workers=1 (plus a fixed PYTHONHASHSEED) if exact repeatability matters.
model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=1000,
    min_count=1,
)

In [28]:
def _mean_word_vector(text, keyed_vectors):
    """Return the average embedding of the in-vocabulary tokens of ``text``.

    Parameters
    ----------
    text : str
        Raw text; it is split into tokens with ``word_tokenize``.
    keyed_vectors
        Trained word vectors (``model.wv``) supporting ``in`` and ``[]`` lookup.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``keyed_vectors.vector_size``. When none of the
        tokens is in the vocabulary an all-zero vector is returned. It uses the
        dtype of the embedding matrix so that one empty document does not
        upcast the whole stacked feature matrix when the rows are combined.
    """
    known_tokens = [token for token in word_tokenize(text) if token in keyed_vectors]
    if not known_tokens:
        return np.zeros(keyed_vectors.vector_size, dtype=keyed_vectors.vectors.dtype)
    return np.mean([keyed_vectors[token] for token in known_tokens], axis=0)


# One averaged embedding per row, computed independently for each text column.
title_feature_vectors = [_mean_word_vector(title, model.wv) for title in X_text2["title"]]
content_feature_vectors = [_mean_word_vector(content, model.wv) for content in X_text2["content"]]

In [29]:
# Place the title embedding and the content embedding of each row side by side.
title_matrix = np.asarray(title_feature_vectors)
content_matrix = np.asarray(content_feature_vectors)
X = np.hstack([title_matrix, content_matrix])
print(X.shape)

(63178, 2000)


In [30]:
# Hold out 20% of the rows for testing.
# random_state pins the shuffle so the split (and every metric computed from it)
# is identical after a kernel restart; stratify keeps the label proportions
# the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y_no2,
    test_size=0.2,
    random_state=42,
    stratify=Y_no2,
)

In [31]:
# Wrap the training split in a LightGBM Dataset.
Light = lgb.Dataset(data=X_train, label=y_train)

# Hyperparameters for the LightGBM model.
paramsLight = dict(
    boosting_type='gbdt',
    objective='binary',        # objective for the classification task
    metric='binary_logloss',   # evaluation metric
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

In [32]:
# Train the booster on the training Dataset for 100 boosting rounds.
clf_Light = lgb.train(
    params=paramsLight,
    train_set=Light,
    num_boost_round=100,
)

You can set `force_col_wise=true` to remove the overhead.


In [33]:
# Raw model outputs for the test rows, each rounded to the nearest integer
# to obtain the predicted class.
raw_scores = clf_Light.predict(X_test)
y_pred_Light = list(map(round, raw_scores))

In [35]:
from sklearn.metrics import accuracy_score, classification_report

# Share of test rows whose predicted class equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_Light)
print("Accuracy LightGBM vect:", accuracy)

Accuracy LightGBM vect: 0.8023108578664134


In [36]:
# Per-class precision / recall / F1 for the LightGBM predictions.
lightgbm_report = classification_report(y_test, y_pred_Light)
print(lightgbm_report)

              precision    recall  f1-score   support

         0.0       0.80      0.88      0.84      7247
         1.0       0.81      0.70      0.75      5389

    accuracy                           0.80     12636
   macro avg       0.80      0.79      0.79     12636
weighted avg       0.80      0.80      0.80     12636



In [42]:
# Confusion table: true labels as rows, LightGBM predictions as columns.
pd.crosstab(
    index=y_test,
    columns=y_pred_Light,
    rownames=["real"],
    colnames=["predict"],
)

predict,0,1
real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,6353,894
1.0,1604,3785


In [100]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 200 entropy-criterion trees, built on all cores (n_jobs=-1).
# class_weight gives class 0 three times the weight of class 1.
# random_state fixes the bootstrap samples and feature sub-sampling so the
# fitted forest, and the scores reported below, are reproducible across runs.
clf_r = RandomForestClassifier(
    n_estimators=200,
    criterion="entropy",
    class_weight={0: 3, 1: 1},
    n_jobs=-1,
    random_state=42,
)
clf_r.fit(X_train, y_train)

In [101]:
# Random-forest predictions for the held-out rows.
y_pred = clf_r.predict(X_test)

# Confusion table: true labels as rows, predictions as columns.
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=["real"],
    colnames=["predict"],
)

predict,0.0,1.0
real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,6303,944
1.0,1633,3756


In [102]:
# Accuracy of the random forest on the held-out test split.
clf_r.score(X_test, y_test)

0.7960588793922128

In [104]:
from sklearn.model_selection import GridSearchCV

# Candidate values to search over.
max_depth_range = range(1, 21)
class_weight_options = [None, {0: 1, 1: 2}, {0: 2, 1: 1}]

# Parameter grid: parameter name -> candidate values
# (20 depths x 3 class weightings = 60 combinations).
param_grid = dict(max_depth=max_depth_range, class_weight=class_weight_options)

# 60 combinations x 10 folds = 600 forest fits of 200 trees each.
# The forest is built on all cores (n_jobs=-1, as for clf_r) so the search can
# finish in reasonable time, random_state makes every fit reproducible, and
# verbose=1 reports progress while the search is running.
grid = GridSearchCV(
    RandomForestClassifier(
        criterion="entropy",
        n_estimators=200,
        n_jobs=-1,
        random_state=42,
    ),
    param_grid,
    cv=10,
    scoring="accuracy",
    verbose=1,
)
grid.fit(X_train, y_train)

# View the results as a compact table (ten best combinations by rank) instead
# of dumping the raw cv_results_ dictionary.
cv_summary = (
    pd.DataFrame(grid.cv_results_)
    .loc[:, ["params", "mean_test_score", "std_test_score", "rank_test_score"]]
    .sort_values("rank_test_score")
)
print(cv_summary.head(10).to_string(index=False))

# Examine the best model.
print(grid.best_score_)
print(grid.best_params_)

KeyboardInterrupt: 

In [None]:
# Score the refitted best estimator from the grid search on the held-out rows.
best_forest = grid.best_estimator_
y_pred = best_forest.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

In [95]:
from sklearn.linear_model import LogisticRegression

# Logistic regression in which class 0 carries three times the weight of class 1.
logreg_class_weight = {0: 3, 1: 1}
clf_LogReg = LogisticRegression(class_weight=logreg_class_weight)
clf_LogReg.fit(X_train, y_train)

In [96]:
# Accuracy of the logistic regression on the held-out test split.
clf_LogReg.score(X_test, y_test)

0.7704969927192149

In [97]:
# Logistic-regression predictions for the held-out rows.
y_pred = clf_LogReg.predict(X_test)

# Confusion table: true labels as rows, predictions as columns.
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=["real"],
    colnames=["predict"],
)

predict,0.0,1.0
real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,6898,349
1.0,2551,2838


In [55]:
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_validate, KFold

In [56]:
# 3-fold cross-validation with shuffling.
# A fixed random_state makes every call to cv3.split() yield the same folds, so
# all models evaluated with cv3 are compared on identical partitions and the
# scores are reproducible. Without it each cross_validate call reshuffles.
cv3 = KFold(n_splits=3, shuffle=True, random_state=42)

In [58]:
# Stack the random forest and the logistic regression; the logistic-regression
# estimator is also passed as the final (meta) estimator.
base_learners = [("rf", clf_r), ("lr", clf_LogReg)]
sclf = StackingClassifier(estimators=base_learners, final_estimator=clf_LogReg)

# Cross-validate each model on the training split and report mean / std of accuracy and F1.
candidates = {
    "Random Forest": clf_r,
    "Logistic Regression": clf_LogReg,
    "Stacking Classifier": sclf,
}
for label, clf in candidates.items():
    scores = cross_validate(clf, X_train, y_train, cv=cv3, scoring=["accuracy", "f1"])
    acc_scores = scores["test_accuracy"]
    f1_scores = scores["test_f1"]
    print(label, "accuracy: ", acc_scores.mean(), acc_scores.std(), "f1: ", f1_scores.mean(), f1_scores.std())


Random Forest accuracy:  0.7918959099015884 0.0023634192484273853 f1:  0.7240497597929108 0.003273990457371601
Logistic Regression accuracy:  0.8061415923675442 0.005972955049627519 f1:  0.7617389111875701 0.008213197499922985
Stacking Classifier accuracy:  0.816924542170712 0.00043799552617826177 f1:  0.7738541651306335 0.001167305686198011
