In [16]:
# import packages
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA,TruncatedSVD

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Input, LSTM, Embedding, SpatialDropout1D, Lambda, Reshape, Conv1D, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow_hub as hub

from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, log_loss

import time

import gc

from numba import cuda

In [17]:
from cuml.naive_bayes import MultinomialNB
from cuml.ensemble import RandomForestClassifier as cuRFC

In [18]:
import cudf, cuml
from cuml.neighbors import KNeighborsClassifier as cuKNeighbors
from cuml.ensemble import RandomForestClassifier as cumlRandomForestClassifier

In [19]:
import cupy as cp
from cuml.svm import SVC

In [20]:
num_sample = 100000
#num_sample = 50000
#num_sample = 100000

In [21]:
results = []

In [22]:
df = pd.read_csv('/kaggle/input/686-project/df_new.csv').sample(num_sample,random_state = 1234)
decode_map = {"negative":0, "positive":1}
df.target = df.target.apply(lambda x: decode_map[x])
# split train and test data
X_train, X_test, y_train, y_test = train_test_split(df['text'].astype(str), df['target'],test_size=0.20,
                                                           random_state=1234,
                                                           stratify = df['target'])

In [23]:
%%time
# consider both unigrams and bigrams in vectorizer
#tokenizer = TfidfVectorizer(min_df = 0.0001, ngram_range = (1,2))
tokenizer = TfidfVectorizer(min_df = 0.001, ngram_range = (1,2))
tokenizer.fit(X_train)
X_train = tokenizer.transform(X_train)
X_test = tokenizer.transform(X_test)
X_train.shape

CPU times: user 3.25 s, sys: 15.9 ms, total: 3.27 s
Wall time: 3.3 s


(80000, 1071)

# KNN

In [24]:
%%time
model = cuKNeighbors()
start = time.time()
model.fit(X_train, y_train)
end = time.time()
start_pred = time.time()
predictions = model.predict(X_test)
end_pred = time.time()
results.append(["KNN",f1_score(y_test,predictions), accuracy_score(y_test,predictions), log_loss(y_test,predictions), end-start, end_pred - start_pred])
print("F_1 Score of Model:",f1_score(y_test,predictions))
print("Accuracy of Model:",accuracy_score(y_test,predictions))
print("Cross-entropy loss of Model:",log_loss(y_test,predictions))

F_1 Score of Model: 0.6757105943152454
Accuracy of Model: 0.6486
Cross-entropy loss of Model: 12.137098018578849
CPU times: user 597 ms, sys: 260 ms, total: 857 ms
Wall time: 858 ms


# Naive Bayes

In [25]:
%%time
model = MultinomialNB()
start = time.time()
model.fit(X_train, y_train)
end = time.time()
start_pred = time.time()
predictions = model.predict(X_test)
end_pred = time.time()
results.append(["Naive Bayes",f1_score(y_test,predictions), accuracy_score(y_test,predictions), log_loss(y_test,predictions), end - start, end_pred - start_pred])
print("F_1 Score of Model:",f1_score(y_test,predictions))
print("Accuracy of Model:",accuracy_score(y_test,predictions))
print("Cross-entropy loss of Model:",log_loss(y_test,predictions))

F_1 Score of Model: 0.7405994816879371
Accuracy of Model: 0.73475
Cross-entropy loss of Model: 9.16152358178643
CPU times: user 48 ms, sys: 3.14 ms, total: 51.2 ms
Wall time: 49.8 ms


# Random Forest

In [26]:
%%time
model = cuRFC()
start = time.time()
model.fit(X_train.toarray(), y_train)
end = time.time()
start_pred = time.time()
predictions = model.predict(X_test.toarray())
end_pred = time.time()
results.append(["Random Forest",f1_score(y_test,predictions), accuracy_score(y_test,predictions), log_loss(y_test,predictions),end - start, end_pred - start_pred])
print("F_1 Score of Model:",f1_score(y_test,predictions))
print("Accuracy of Model:",accuracy_score(y_test,predictions))
print("Cross-entropy loss of Model:",log_loss(y_test,predictions))

  ret_val = func(*args, **kwargs)
Defaulting to CPU-based Prediction. 
To predict on float-64 data, set parameter predict_model = 'CPU'
  ret_val = func(*args, **kwargs)


F_1 Score of Model: 0.7350712726338771
Accuracy of Model: 0.69055
Cross-entropy loss of Model: 10.688213340257738
CPU times: user 6.45 s, sys: 1.67 s, total: 8.13 s
Wall time: 5.82 s


# SVM

In [27]:
%%time
model = SVC()
start = time.time()
model.fit(X_train.toarray(), y_train)
end = time.time()
start_pred = time.time()
predictions = model.predict(X_test.toarray())
end_pred = time.time()
results.append(["SVM",f1_score(y_test,predictions), accuracy_score(y_test,predictions), log_loss(y_test,predictions), end - start, end_pred - start_pred])
print("F_1 Score of Model:",f1_score(y_test,predictions))
print("Accuracy of Model:",accuracy_score(y_test,predictions))
print("Cross-entropy loss of Model:",log_loss(y_test,predictions))

F_1 Score of Model: 0.7524866657056365
Accuracy of Model: 0.74245
Cross-entropy loss of Model: 8.895579121472384
CPU times: user 19.8 s, sys: 773 ms, total: 20.6 s
Wall time: 20.6 s


# Xgboost

In [28]:
%%time
xgb = XGBClassifier(tree_method='gpu_hist', n_estimators = 388, eta = 0.22,subsample = 0.5, colsample_bytree = 0.2)
start = time.time()
xgb.fit(X_train,y_train)
end = time.time()
start_pred = time.time()
predictions = xgb.predict(X_test)
end_pred = time.time()
results.append(["Xgboost",f1_score(y_test,predictions), accuracy_score(y_test,predictions), log_loss(y_test,predictions), end - start, end_pred - start_pred])
print("F_1 Score of Model:",f1_score(y_test,predictions))
print("Accuracy of Model:",accuracy_score(y_test,predictions))
print("Cross-entropy loss of Model:",log_loss(y_test,predictions))

F_1 Score of Model: 0.7495132722351487
Accuracy of Model: 0.73625
Cross-entropy loss of Model: 9.109726971376931
CPU times: user 1.89 s, sys: 13.8 ms, total: 1.91 s
Wall time: 1.67 s


In [29]:
results = np.array(results)
result = pd.DataFrame(np.round(results[:,1:].astype(float),3), index = results[:,0], columns = ['F-1','Accuracy','Log-loss','training time','prediction time'])
print(result)

                 F-1  Accuracy  Log-loss  training time  prediction time
KNN            0.676     0.649    12.137          0.009            0.818
Naive Bayes    0.741     0.735     9.162          0.017            0.008
Random Forest  0.735     0.691    10.688          4.204            1.594
SVM            0.752     0.742     8.896         18.855            1.724
Xgboost        0.750     0.736     9.110          1.458            0.181


In [30]:
break

SyntaxError: 'break' outside loop (668683560.py, line 4)

Due to compational power, we choose not discover the RNN+LSTM, and discover more on BERT on the next notebook

In [None]:
model = keras.Sequential()
model.add(Input(shape = (X_train.shape[1],)))
#model.add(hub_layer)
model.add(tf.keras.layers.Reshape((X_train.shape[1],1)))
model.add(Bidirectional(LSTM(units=32)))
model.add(Dense(16))
model.add(Flatten())
model.add(Dense(1,activation='sigmoid'))
model.summary()

In [None]:
model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [None]:
%%time
start = time.time()
batch_size = 128
model.fit(X_train.toarray(), y_train, epochs = 12, batch_size=batch_size, verbose = 2, shuffle=False)
end = time.time()

In [None]:
#predictions = (np.ones((X_test_sen.shape[0],2))* (model.predict(X_test_sen)>0.5))[:,1]
#y_test_sen = y_test_sen[:,1]
#results.append(["LSTM",f1_score(y_test,predictions), accuracy_score(y_test,predictions), log_loss(y_test,predictions), end - start])
print("F_1 Score of Model:",f1_score(y_test_sen,predictions))
print("Accuracy of Model:",accuracy_score(y_test_sen,predictions))
print("Cross-entropy loss of Model:",log_loss(y_test_sen,predictions))

In [None]:
# results = np.array(results)
# result = pd.DataFrame(np.round(results[:,1:].astype(float),3), index = results[:,0], columns = ['F-1','Accuracy','Log-loss','training time'])
# print(result)

In [None]:
break

In [None]:
df_sentence = pd.read_csv('/kaggle/input/686-project/df_new.csv')
decode_map = {"negative":0, "positive":1}
df_sentence.target = df_sentence.target.apply(lambda x: decode_map[x])
df_sentence = df_sentence.sample(num_sample,random_state = 1234)
label = np.array(pd.get_dummies(df_sentence.target), dtype=int)[:] 
# split train and test data
X_train_sen, X_test_sen, y_train_sen, y_test_sen = train_test_split(df_sentence['text'].astype(str), label,test_size=0.20,
                                                           random_state=1234,
                                                           stratify = label)

In [None]:
print("Training on GPU...") if tf.test.is_gpu_available() else print("Training on CPU...")
embed = "https://tfhub.dev/google/universal-sentence-encoder/4"
#hub_layer = hub.KerasLayer(embed, input_shape=[], dtype=tf.string, trainable=False)
model = keras.Sequential()
model.add(Input(shape = (X_train.shape[1],))
#model.add(hub_layer)
model.add(tf.keras.layers.Reshape((512,1)))
model.add(LSTM(256))
model.add(Dense(128))
model.add(Flatten())
model.add(Dense(2,activation='softmax'))
model.summary()

In [None]:
model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [None]:
%%time
start = time.time()
batch_size = 128
model.fit(X_train_sen, y_train_sen, epochs = 12, batch_size=batch_size, verbose = 2, shuffle=False)
end = time.time()

In [None]:
predictions = (np.ones((X_test_sen.shape[0],2))* (model.predict(X_test_sen)>0.5))[:,1]
y_test_sen = y_test_sen[:,1]
results.append(["LSTM",f1_score(y_test_sen,predictions), accuracy_score(y_test_sen,predictions), log_loss(y_test_sen,predictions),end-start])
print("F_1 Score of Model:",f1_score(y_test_sen,predictions))
print("Accuracy of Model:",accuracy_score(y_test_sen,predictions))
print("Cross-entropy loss of Model:",log_loss(y_test_sen,predictions))

In [None]:
results = np.array(results)
result = pd.DataFrame(np.round(results[:,1:].astype(float),3), index = results[:,0], columns = ['F-1','Accuracy','Log-loss','training time'])
print(result)