In [2]:
from helper import *

In [3]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU available")

No GPU available


In [4]:
data = pd.read_csv("data.csv")
print(data.shape)

(5572, 7480)


In [5]:
X = data["text_clean"]
X = pd.Series(map(str, X))
y = data["y"]
print(X.shape, y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(5572,) (5572,)
(4457,) (4457,)
(1115,) (1115,)


In [6]:
MAX_NUM_WORDS = 2000
MAX_SEQUENCE_LENGTH = 100
X_sequence_train , tok = tokenize_text(X_train, MAX_NUM_WORDS, MAX_SEQUENCE_LENGTH)
X_sequence_test = pad_sequences(tok.texts_to_sequences(X_test), maxlen=MAX_SEQUENCE_LENGTH)
print(X_sequence_train.shape, X_sequence_test.shape)

(4457, 100) (1115, 100)


In [7]:
def make_model(name, 
               embedding_dim=128, 
               lstm_units=128, 
               dense_units=64,
               dropout_rate=0.2,
            ):
    model = Sequential(name=name)
    model.add(Embedding(input_dim=MAX_NUM_WORDS, output_dim=embedding_dim))
    model.add(LSTM(units=lstm_units, dropout=dropout_rate, recurrent_dropout=dropout_rate))
    model.add(Dense(dense_units, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.build(input_shape=(None, MAX_SEQUENCE_LENGTH))
    model.summary()
    return model
        
param_grid = {
        'embedding_dim': [50],
        'lstm_units': [48],
        'dense_units': [64],
        'dropout_rate': [0.2],
    }   

r = grid_search(param_grid, make_model, X_sequence_train , y_train , X_sequence_test , y_test)[0]

Grid size: 1


Epoch 1/20
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 60ms/step - accuracy: 0.8205 - loss: 0.5269 - val_accuracy: 0.8857 - val_loss: 0.2663
Epoch 2/20
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 57ms/step - accuracy: 0.9288 - loss: 0.2184 - val_accuracy: 0.9574 - val_loss: 0.1784
Epoch 3/20
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 49ms/step - accuracy: 0.9775 - loss: 0.0998 - val_accuracy: 0.9641 - val_loss: 0.1086
Epoch 4/20
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 48ms/step - accuracy: 0.9872 - loss: 0.0470 - val_accuracy: 0.9686 - val_loss: 0.0873
Epoch 5/20
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.9920 - loss: 0.0296 - val_accuracy: 0.9664 - val_loss: 0.1077
Epoch 6/20
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - accuracy: 0.9967 - loss: 0.0172 - val_accuracy: 0.9664 - val_loss: 0.1042
Epoch 7/20
[1m32/32[0m [32m━━━━

In [8]:
y2 = data["y"]
X2 = data.drop(columns=['class', 'text', 'No_of_Characters', 'No_of_Words', 'No_of_sentence', 'text_clean' ,'y'])
print(X2.shape, y2.shape)

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=TEST_SPLIT, random_state=42)
print(X2_train.shape, y2_train.shape)
print(X2_test.shape, y2_test.shape)

(5572, 7473) (5572,)
(4457, 7473) (4457,)
(1115, 7473) (1115,)


In [9]:
def make_model(
                name,
                regularization,
                solver,
                C
            ):
    model = LogisticRegression(
        penalty=regularization,
        solver=solver,
        C=C,
        max_iter=1000,
        random_state=42,
        verbose=1
    )
    return model

param_grid = {
    'regularization' : ["l2"],
    'solver': ['lbfgs'],
    'C' : [10e5]
}

def train_f(model:LogisticRegression,X_train,y):
    model = model.fit(X_train,y)
    history = []
    return model , history 

r2 = grid_search(param_grid, make_model, X2_train, y2_train , X2_test , y2_test, train_f )[0]

Grid size: 1
Model: Model0
Parameters: {'regularization': 'l2', 'solver': 'lbfgs', 'C': 1000000.0, 'name': 'Model0'}
Metrics: {'confusion_matrix': array([[960,   5],
       [ 18, 132]], dtype=int64), 'classification_report': {'0': {'precision': 0.9815950920245399, 'recall': 0.9948186528497409, 'f1-score': 0.9881626351003603, 'support': 965.0}, '1': {'precision': 0.9635036496350365, 'recall': 0.88, 'f1-score': 0.9198606271777003, 'support': 150.0}, 'accuracy': 0.979372197309417, 'macro avg': {'precision': 0.9725493708297882, 'recall': 0.9374093264248704, 'f1-score': 0.9540116311390303, 'support': 1115.0}, 'weighted avg': {'precision': 0.9791612656941134, 'recall': 0.979372197309417, 'f1-score': 0.9789740241690607, 'support': 1115.0}}, 'roc_auc': 0.9374093264248704, 'fpr': array([0.        , 0.00518135, 1.        ]), 'tpr': array([0.  , 0.88, 1.  ]), 'history': []}


In [13]:
import time


modelL = r[0]
modelR: LogisticRegression = r2[0]
tt = time.time()

res = modelR.predict_proba(X2_train)[:,1]

tR = tt-time.time()
print(tt-time.time())
tt = time.time()

res2 = modelL.predict(X_sequence_train)

tL = tt-time.time()
print(tt-time.time())
actual = np.array(y_train)
print(y_train.shape,res.shape,res2.shape)
print(tL/tR)

-0.08572673797607422
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
-0.9272677898406982
(4457,) (4457,) (4457, 1)
10.816552825088163


In [14]:
def ev(s):
    center = 0.5
    res_ = np.zeros(res.shape)
    t = res > center + s
    t2 =  res > center - s
    j = 0
    for i,b in enumerate(t):
        if b and t2[i]:
            res_[i] = 1 if res2[i,0] > 0.5 else 0
        else:
            res_[i] = 1 if res_[i] > 0.5 else 0
        if res_[i] != actual[i]:
            j = j +1

    return j , res_

for s in [0.05,0.10,0.15,0.2,0.30,0.4,0.45,0.49,0.499999]:
    print(ev(s))

def p2(x,x_seq,r=0.499):
    out = np.zeros((x.shape[0],1))
    p = modelR.predict_proba(x)
    for i,v in enumerate(p):
        if 0.5-r < v[0] < 0.5+r:
            out[i] = 1-modelL.predict(x_seq[1,:].reshape(1,-1))[0,0]
        else:
            out[i] = p[i,0]
    out = 1-np.round(out)
    return out

(35, array([0., 1., 0., ..., 0., 0., 0.]))
(35, array([0., 1., 0., ..., 0., 0., 0.]))
(35, array([0., 1., 0., ..., 0., 0., 0.]))
(35, array([0., 1., 0., ..., 0., 0., 0.]))
(35, array([0., 1., 0., ..., 0., 0., 0.]))
(35, array([0., 1., 0., ..., 0., 0., 0.]))
(35, array([0., 1., 0., ..., 0., 0., 0.]))
(45, array([0., 1., 0., ..., 0., 0., 0.]))
(150, array([0., 1., 0., ..., 0., 0., 0.]))


In [15]:
np.round(modelL.predict(X_sequence_test[1,:].reshape(1,-1)))[0,0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step


0.0

In [16]:
y_pred =  p2(X2_test,X_sequence_test)
cm = confusion_matrix(y_test,y_pred)
print(cm)
classification_report(y_test, y_pred,output_dict=True)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18

{'0': {'precision': 0.964964964964965,
  'recall': 0.9989637305699481,
  'f1-score': 0.9816700610997964,
  'support': 965.0},
 '1': {'precision': 0.9913793103448276,
  'recall': 0.7666666666666667,
  'f1-score': 0.8646616541353384,
  'support': 150.0},
 'accuracy': 0.967713004484305,
 'macro avg': {'precision': 0.9781721376548963,
  'recall': 0.8828151986183075,
  'f1-score': 0.9231658576175674,
  'support': 1115.0},
 'weighted avg': {'precision': 0.9685184643434218,
  'recall': 0.967713004484305,
  'f1-score': 0.9659290198041293,
  'support': 1115.0}}