In [75]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_validate, KFold

In [76]:
df= pd.read_pickle("strats_new.pkl")

In [77]:
df.dropna(subset=['label'], axis = 0, inplace=True)

In [78]:
# Create a boolean mask to select rows where the label is not equal to 2
mask = df["label"] != 2

# Use the mask to filter the DataFrame
df_no2 = df[mask]

In [79]:
# Seperating target and data
Y_no2 = df_no2["label"]
X_no2 = df_no2.drop("label", axis=1)

Count Vectorizer

In [6]:

# Create two CountVectorizer objects
vectorizer1 = CountVectorizer()
vectorizer2 = CountVectorizer()

# Seperating content and title(what we want to predict on)
# Fit the vectorizers on the training data
vectorizer1.fit(X_no2["content"])
vectorizer2.fit(X_no2["title"])

# Transform the text data
X_content = vectorizer1.transform(X_no2["content"])
X_title = vectorizer2.transform(X_no2["title"])

# Combine the feature matrices
X_text = hstack([X_content, X_title])

In [7]:
X_text.shape

(63178, 266549)

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X_text, Y_no2, test_size=0.2)

In [18]:
Light = lgb.Dataset(X_train.astype(float), label=y_train.astype(float))

# Set the hyperparameters for the LightGBM model
paramsLight = {
    'boosting_type': 'gbdt',
    'objective': 'binary',  # Set the appropriate objective for your classification task
    'metric': 'binary_logloss',  # Set the appropriate metric for evaluation
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [19]:
clf_Light = lgb.train(paramsLight, Light, num_boost_round=100)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


In [21]:
y_pred_Light = clf_Light.predict(X_test.astype(float))
y_pred_Light = [round(pred) for pred in y_pred_Light]

In [22]:
accuracy = accuracy_score(y_test, y_pred_Light)
print("Accuracy LightGBM vect:", accuracy)

Accuracy LightGBM vect: 0.8185343463121241


In [23]:
print(classification_report(y_test, y_pred_Light))

              precision    recall  f1-score   support

         0.0       0.81      0.91      0.85      7386
         1.0       0.84      0.69      0.76      5250

    accuracy                           0.82     12636
   macro avg       0.82      0.80      0.81     12636
weighted avg       0.82      0.82      0.82     12636



In [24]:
pd.crosstab(y_test, y_pred_Light, rownames=["real"], colnames=["predict"])

predict,0,1
real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,6702,684
1.0,1609,3641


In [25]:

clf_r = RandomForestClassifier(n_jobs=-1, criterion="entropy", class_weight={0: 1, 1: 1}, n_estimators=250)

clf_r.fit(X_train, y_train)

In [26]:
y_pred = clf_r.predict(X_test)

pd.crosstab(y_test, y_pred, rownames=["real"], colnames=["predict"])

predict,0.0,1.0
real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,7162,224
1.0,2475,2775


In [27]:
clf_r.score(X_test, y_test)

0.7864039252928142

In [36]:
from sklearn.model_selection import GridSearchCV

# define the parameter values that should be searched
max_depth_range = range(1, 6)
class_weight_options = [None, {0: 1, 1: 2}, {0: 2, 1: 1}]

# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(max_depth=max_depth_range, class_weight=class_weight_options)

# instantiate and fit the grid
grid = GridSearchCV(RandomForestClassifier(criterion="entropy", n_estimators=200), param_grid, cv=10, scoring="accuracy")
grid.fit(X_train, y_train)

# view the results
print(grid.cv_results_)

# examine the best model
print(grid.best_score_)
print(grid.best_params_)

{'mean_fit_time': array([10.88176446, 20.64178958, 29.77688675, 40.6080337 , 49.98455341,
       10.85015652, 20.60308149, 29.72577655, 40.13111219, 49.51005652,
       10.77073739, 19.34440422, 28.24218476, 38.03789611, 62.83292432]), 'std_fit_time': array([ 0.63774245,  0.44347107,  0.40829983,  0.50577632,  0.60904943,
        0.33622755,  0.89090477,  1.46922183,  1.9053031 ,  1.92335138,
        0.32051858,  0.52072859,  0.44446972,  0.71633439, 45.84062066]), 'mean_score_time': array([2.05883989, 2.26363194, 2.23230011, 2.24993219, 2.15684509,
       2.01505158, 2.1401799 , 2.0903424 , 2.17078445, 2.18057592,
       2.08683109, 2.00134909, 1.98575418, 2.06207812, 1.99994576]), 'std_score_time': array([0.13342382, 0.10395437, 0.06184019, 0.10022314, 0.14764711,
       0.10549649, 0.10731165, 0.12785392, 0.20168025, 0.30804281,
       0.11468207, 0.09106539, 0.0921817 , 0.20919341, 0.06043941]), 'param_class_weight': masked_array(data=[None, None, None, None, None, {0: 1, 1: 2},
  

In [None]:
y_pred = grid.best_estimator_.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

In [28]:

clf_LogReg = LogisticRegression(class_weight={0: 1, 1: 1})
clf_LogReg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
clf_LogReg.score(X_test, y_test)

0.8207502374169041

In [30]:
y_pred = clf_LogReg.predict(X_test)

pd.crosstab(y_test, y_pred, rownames=["real"], colnames=["predict"])

predict,0.0,1.0
real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,6232,1154
1.0,1111,4139


In [6]:
cv3 = KFold(n_splits=3, shuffle=True)

In [32]:
sclf = StackingClassifier([("rf", clf_r), ("lr", clf_LogReg)], final_estimator=clf_LogReg)

for clf, label in zip([clf_r, clf_LogReg, sclf], ["Random Forest", "Logistic Regression", "Stacking Classifier"]):
    scores = cross_validate(clf, X_train, y_train, cv=cv3, scoring=["accuracy", "f1"])
    print(label, "accuracy: ", scores["test_accuracy"].mean(), scores["test_accuracy"].std(), "f1: ", scores["test_f1"].mean(), scores["test_f1"].std())


Random Forest accuracy:  0.7728819002702253 0.0028233459409617947 f1:  0.6499601646453573 0.0050529799889636965


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression accuracy:  0.811879219222185 0.0025210900162312587 f1:  0.7744459362204147 0.0029552951344835655


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Stacking Classifier accuracy:  0.8445452375289868 0.0038168990564818152 f1:  0.8092388110107404 0.005368894670413464


In [39]:
import warnings

warnings.filterwarnings(action="ignore")

TfidfVectorizer

In [67]:
# Create two CountVectorizer objects
vectorizer1 = TfidfVectorizer()
vectorizer2 = TfidfVectorizer()

# Seperating content and title(what we want to predict on)
# Fit the vectorizers on the training data
vectorizer1.fit(X_no2["content"])
vectorizer2.fit(X_no2["title"])

# Transform the text data
X_content = vectorizer1.transform(X_no2["content"])
X_title = vectorizer2.transform(X_no2["title"])

# Combine the feature matrices
X_text = hstack([X_content, X_title])

In [9]:
X_text.shape

(111280, 352812)

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X_text, Y_no2, test_size=0.2)

In [20]:
Light = lgb.Dataset(X_train.astype(float), label=y_train.astype(float))

# Set the hyperparameters for the LightGBM model
paramsLight = {
    'boosting_type': 'gbdt',
    'objective': 'binary',  # Set the appropriate objective for your classification task
    'metric': 'binary_logloss',  # Set the appropriate metric for evaluation
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [21]:
clf_Light = lgb.train(paramsLight, Light, num_boost_round=100)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


In [22]:
y_pred_Light = clf_Light.predict(X_test.astype(float))
y_pred_Light = [round(pred) for pred in y_pred_Light]

In [23]:
accuracy = accuracy_score(y_test, y_pred_Light)
print("Accuracy LightGBM vect:", accuracy)

Accuracy LightGBM vect: 0.8230140186915887


In [24]:
print(classification_report(y_test, y_pred_Light))

              precision    recall  f1-score   support

         0.0       0.80      0.90      0.85     12239
         1.0       0.85      0.73      0.79     10017

    accuracy                           0.82     22256
   macro avg       0.83      0.81      0.82     22256
weighted avg       0.83      0.82      0.82     22256



In [25]:
pd.crosstab(y_test, y_pred_Light, rownames=["real"], colnames=["predict"])

predict,0,1
real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,10990,1249
1.0,2690,7327


In [40]:
clf_LogReg = LogisticRegression(C=1.0, class_weight={0: 1, 1: 1})

clf_LogReg.fit(X_train, y_train)

In [10]:
clf_LogReg.score(X_test, y_test)

0.8563533429187635

In [11]:
y_pred = clf_LogReg.predict(X_test)

pd.crosstab(y_test, y_pred, rownames=["real"], colnames=["predict"])

predict,0.0,1.0
real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,10896,1417
1.0,1780,8163


In [12]:
clf_r = RandomForestClassifier(n_jobs=-1, criterion="entropy", class_weight={0: 1, 1: 1}, n_estimators=300)

clf_r.fit(X_train, y_train)

In [13]:
y_pred = clf_r.predict(X_test)

pd.crosstab(y_test, y_pred, rownames=["real"], colnames=["predict"])

predict,0.0,1.0
real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,11767,546
1.0,3663,6280


In [14]:
clf_r.score(X_test, y_test)

0.8108824586628325

In [6]:
cv3 = KFold(n_splits=3, shuffle=True)

In [35]:
sclf = StackingClassifier([("rf", clf_r), ("lr", clf_LogReg)], final_estimator=clf_LogReg)

for clf, label in zip([clf_r, clf_LogReg, sclf], ["Random Forest", "Logistic Regression", "Stacking Classifier"]):
    scores = cross_validate(clf, X_train, y_train, cv=cv3, scoring=["accuracy", "f1"])
    print(label, "accuracy: ", scores["test_accuracy"].mean(), scores["test_accuracy"].std(), "f1: ", scores["test_f1"].mean(), scores["test_f1"].std())


Random Forest accuracy:  0.7969760654944075 0.002566258429881316 f1:  0.7265889033836458 0.004098553178359794
Logistic Regression accuracy:  0.849287839775017 0.0021945291144262313 f1:  0.8286327566436927 0.0023571169836699235
Stacking Classifier accuracy:  0.8554322622597176 0.0027945600335521578 f1:  0.8352977333253433 0.0024296825662476007


In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier

In [22]:
clf_sgd = SGDClassifier(loss="log_loss", penalty="l2")

In [53]:
clf_sgd.fit(X_train, y_train)

In [54]:
clf_sgd.score(X_test, y_test)

0.829304457225018

In [55]:
y_pred = clf_sgd.predict(X_test)

pd.crosstab(y_test, y_pred, rownames=["real"], colnames=["predict"])

predict,0.0,1.0
real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,10738,1501
1.0,2298,7719


In [57]:
clf_pa = PassiveAggressiveClassifier()

In [58]:
clf_pa.fit(X_train, y_train)

In [60]:
clf_pa.score(X_test, y_test)

0.8231937455068297

In [59]:
y_pred = clf_pa.predict(X_test)

pd.crosstab(y_test, y_pred, rownames=["real"], colnames=["predict"])

predict,0.0,1.0
real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,10253,1986
1.0,1949,8068


In [62]:
sclf = StackingClassifier([("sgd", clf_sgd), ("lr", clf_LogReg)], final_estimator=clf_LogReg)

for clf, label in zip([clf_sgd, clf_LogReg, sclf], ["SGD", "Logistic Regression", "Stacking Classifier"]):
    scores = cross_validate(clf, X_train, y_train, cv=cv3, scoring=["accuracy", "f1"])
    print(label, "accuracy: ", scores["test_accuracy"].mean(), scores["test_accuracy"].std(), "f1: ", scores["test_f1"].mean(), scores["test_f1"].std())


SGD accuracy:  0.8289899779040448 0.0030528223992902868 f1:  0.8026784202427933 0.005297866804890007
Logistic Regression accuracy:  0.848490297742063 0.0008198240320819644 f1:  0.8273662547590405 0.0019512916990468414
Stacking Classifier accuracy:  0.8511075756724903 0.000734561964675563 f1:  0.8318710380796164 0.0008162038003139373


In [63]:
sclf = StackingClassifier([("sgd", clf_sgd), ("rf", clf_r), ("lr", clf_LogReg)], final_estimator=clf_LogReg)

for clf, label in zip([clf_sgd, clf_r, clf_LogReg, sclf], ["SGD", "Random Forest", "Logistic Regression", "Stacking Classifier"]):
    scores = cross_validate(clf, X_train, y_train, cv=cv3, scoring=["accuracy", "f1"])
    print(label, "accuracy: ", scores["test_accuracy"].mean(), scores["test_accuracy"].std(), "f1: ", scores["test_f1"].mean(), scores["test_f1"].std())


SGD accuracy:  0.8285181575384458 0.0005571326507739278 f1:  0.7998675768779052 0.0023185661552484727
Random Forest accuracy:  0.795852850247235 0.0030381447854116385 f1:  0.7254554145740868 0.0029787982111762258
Logistic Regression accuracy:  0.8485240177181762 0.0015426484792590422 f1:  0.8275360179977408 0.001883804041879282
Stacking Classifier accuracy:  0.8622281647975206 0.0009272646980652502 f1:  0.8434776812229298 0.001370958504893409


In [16]:
clf_pa = PassiveAggressiveClassifier(class_weight={0: 1, 1: 11})

In [17]:
clf_pa.fit(X_train, y_train)

In [20]:
clf_pa.score(X_test, y_test)

0.8214414090582315

In [18]:
y_pred = clf_pa.predict(X_test)

pd.crosstab(y_test, y_pred, rownames=["real"], colnames=["predict"])

predict,0.0,1.0
real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,10291,2022
1.0,1895,8048


In [20]:
sclf = StackingClassifier([("pa", clf_pa), ("rf", clf_r), ("lr", clf_LogReg)], final_estimator=clf_LogReg)

for clf, label in zip([clf_pa, clf_r, clf_LogReg, sclf], ["PassivAggressive", "Random Forest", "Logistic Regression", "Stacking Classifier"]):
    scores = cross_validate(clf, X_train, y_train, cv=cv3, scoring=["accuracy", "f1"])
    print(label, "accuracy: ", scores["test_accuracy"].mean(), scores["test_accuracy"].std(), "f1: ", scores["test_f1"].mean(), scores["test_f1"].std())


PassivAggressive accuracy:  0.8153755705759017 0.004098076586133802 f1:  0.7992876026853774 0.0023455397676276223
Random Forest accuracy:  0.7979421195866183 0.0015507307207239945 f1:  0.7288315370934083 0.0024130988422102163
Logistic Regression accuracy:  0.8481982400268824 0.000652151727774622 f1:  0.8276402773355468 0.00015644388571634947
Stacking Classifier accuracy:  0.857611424659054 0.0003818285093331509 f1:  0.8385617152856119 0.0005079005941878797


In [21]:
sclf.fit(X_train, y_train)

In [23]:
sclf.score(X_test, y_test)

0.8639468008626887

In [24]:
y_pred = sclf.predict(X_test)

pd.crosstab(y_test, y_pred, rownames=["real"], colnames=["predict"])

predict,0.0,1.0
real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,11012,1301
1.0,1727,8216


In [25]:
sclf = StackingClassifier([("sgd", clf_sgd), ("rf", clf_r), ("lr", clf_LogReg)], final_estimator=clf_LogReg)

In [26]:
sclf.fit(X_train, y_train)

In [27]:
sclf.score(X_test, y_test)

0.8681703810208483

In [28]:
y_pred = sclf.predict(X_test)

pd.crosstab(y_test, y_pred, rownames=["real"], colnames=["predict"])

predict,0.0,1.0
real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,11065,1248
1.0,1686,8257


In [29]:
sclf = StackingClassifier([("rf", clf_r), ("sgd", clf_sgd), ("lr", clf_LogReg)], final_estimator=clf_LogReg)

In [31]:
sclf.fit(X_train, y_train)

In [32]:
sclf.score(X_test, y_test)

0.8684849029475198

In [33]:
y_pred = sclf.predict(X_test)

pd.crosstab(y_test, y_pred, rownames=["real"], colnames=["predict"])

predict,0.0,1.0
real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,11061,1252
1.0,1675,8268


In [6]:
vectorizer1 = CountVectorizer()
vectorizer2 = CountVectorizer()

vectorizer1.fit(X_no2["content"])
vectorizer2.fit(X_no2["title"])

X_content = vectorizer1.transform(X_no2["content"])
X_title = vectorizer2.transform(X_no2["title"])

X_text = hstack([X_content, X_title])

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_text, Y_no2, test_size=0.2)

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

# define the parameter values that should be searched
loss_options = ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"]
penalty_options = ["l2", "l1", "elasticnet"]
class_weight_options = [None, {0: 1, 1: 2}, {0: 2, 1: 1}]

# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(loss=loss_options, penalty=penalty_options, class_weight=class_weight_options)

# instantiate and fit the grid
grid = GridSearchCV(SGDClassifier(max_iter=1000), param_grid, cv=5, scoring="accuracy")
grid.fit(X_train, y_train)

# view the results
print(grid.cv_results_)

# examine the best model
print(grid.best_score_)
print(grid.best_params_)









{'mean_fit_time': array([  3.96334257, 245.70420299,  14.94858661,   5.53701153,
       287.95354195,  19.64705524,   4.48801751, 229.98812361,
        19.90653095,   2.68335032, 220.05032096,  10.35839682,
         4.2782937 , 241.52311993,  14.61940389,   4.38795428,
       235.88222995,  16.12529488,   4.88708301, 252.58755503,
        26.87317858,   4.40408959, 225.9673542 ,  21.23774867,
         2.40014191, 219.56167245,  10.43870945,   4.32532039,
       223.06977487,  20.66610751,   4.48095946, 193.26106653,
        13.70579324,   6.14914184, 267.43623872,  20.39977789,
         4.9789362 , 228.30916777,  16.99407396,   2.20516634,
       210.90709634,   8.86782994,   5.22545595, 243.63180184,
        18.74309845]), 'std_fit_time': array([ 0.90574522,  4.36022957,  4.14629206,  0.6393986 ,  4.59213184,
        8.05382489,  0.98713638,  3.1409743 ,  3.85258824,  0.8527116 ,
        2.02511916,  2.26266543,  0.77288526,  1.90753416,  6.40505861,
        0.74395517,  1.69579429,  

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# define the parameter values that should be searched
C_range = np.logspace(-3, 3, 7)
penalty_options = ["l1", "l2", "elasticnet", "none"]
class_weight_options = [None, {0: 1, 1: 2}, {0: 2, 1: 1}]
solver_options = ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]

# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(C=C_range, penalty=penalty_options, class_weight=class_weight_options, solver=solver_options)

# instantiate and fit the grid
grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring="accuracy")
grid.fit(X_train, y_train)

# view the results
print(grid.cv_results_)

# examine the best model
print(grid.best_score_)
print(grid.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt



In [9]:
clf_sgd = SGDClassifier(loss="hinge", penalty="elasticnet")

In [14]:
clf_sgd.fit(X_train, y_train)

In [15]:
clf_sgd.score(X_test, y_test)

0.8329439252336449

In [16]:
y_pred = clf_sgd.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.81      0.91      0.86     12146
         1.0       0.88      0.74      0.80     10110

    accuracy                           0.83     22256
   macro avg       0.84      0.82      0.83     22256
weighted avg       0.84      0.83      0.83     22256



In [10]:
clf_r = RandomForestClassifier(n_jobs=-1, criterion="entropy", class_weight={0: 1, 1: 1}, n_estimators=400)

In [17]:
clf_r.fit(X_train, y_train)

In [18]:
clf_r.score(X_test, y_test)

0.8062994248741913

In [20]:
y_pred = clf_r.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.75      0.97      0.84     12146
         1.0       0.94      0.61      0.74     10110

    accuracy                           0.81     22256
   macro avg       0.84      0.79      0.79     22256
weighted avg       0.84      0.81      0.80     22256



In [58]:
clf_LogReg = LogisticRegression(C=0.1, max_iter=1000, penalty="l1", solver="liblinear")

In [59]:
clf_LogReg.fit(X_train, y_train)

In [60]:
clf_LogReg.score(X_test, y_test)

0.8485352264557872

In [61]:
sclf = StackingClassifier([("rf", clf_r), ("sgd", clf_sgd), ("lr", clf_LogReg)], final_estimator=clf_LogReg)
sclf.fit(X_train, y_train)

In [64]:
sclf.score(X_test, y_test)

0.8651150251617541

In [62]:
y_pred = sclf.predict(X_test)

pd.crosstab(y_test, y_pred, rownames=["real"], colnames=["predict"])

predict,0.0,1.0
real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,10538,1608
1.0,1394,8716


In [None]:
print(classification_report(y_test, y_pred))

In [65]:
from joblib import dump, load

In [66]:
dump(sclf, "model.joblib")

['model.joblib']

In [70]:
sclf = StackingClassifier([("rf", clf_r), ("sgd", clf_sgd), ("lr", clf_LogReg)], final_estimator=clf_LogReg)
sclf.fit(X_train, y_train)

In [71]:
sclf.score(X_test, y_test)

0.8515456506110711

In [72]:
y_pred = sclf.predict(X_test)

pd.crosstab(y_test, y_pred, rownames=["real"], colnames=["predict"])

predict,0.0,1.0
real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,10782,1466
1.0,1838,8170


In [73]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.85      0.88      0.87     12248
         1.0       0.85      0.82      0.83     10008

    accuracy                           0.85     22256
   macro avg       0.85      0.85      0.85     22256
weighted avg       0.85      0.85      0.85     22256



In [74]:
dump(sclf, "model_tf.joblib")

['model_tf.joblib']

In [81]:
clf_LogReg = LogisticRegression(C=1.0, class_weight={0: 1, 1: 1})

In [82]:
sclf = StackingClassifier([("sgd", clf_sgd), ("rf", clf_r), ("lr", clf_LogReg)], final_estimator=clf_LogReg)

In [84]:
sclf.fit(X_train, y_train)

In [85]:
y_pred = sclf.predict(X_test)

pd.crosstab(y_test, y_pred, rownames=["real"], colnames=["predict"])

predict,0.0,1.0
real,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,10970,1287
1.0,1754,8245


In [87]:
sclf.score(X_test, y_test)

0.863362688713156

In [86]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.86      0.89      0.88     12257
         1.0       0.86      0.82      0.84      9999

    accuracy                           0.86     22256
   macro avg       0.86      0.86      0.86     22256
weighted avg       0.86      0.86      0.86     22256



In [88]:
dump(sclf, "model_tf.joblib")

['model_tf.joblib']