# Part I: Data Gathering and Preprocessing

### Importing Libraries

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_hub as hub
# import tensorflow_text as text

2023-11-01 15:11:04.965208: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-01 15:11:05.006317: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-01 15:11:05.006354: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-01 15:11:05.006386: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-01 15:11:05.013589: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-01 15:11:05.014076: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

### Importing scikit-learn classifiers

In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

### Importing Data

In [15]:
data = pd.read_csv("/home/unknown/AI Project/Combined.csv", header=0)
data.head(5)

Unnamed: 0,Text,Category
0,"\nOnce upon a time, there were four friends na...",0
1,"\nSure, here is a story about a character who ...",0
2,"\nSure, here is a story about a character who ...",0
3,"\nSure, here is a story about a person who fin...",0
4,"\nSure, here is a story set in a world where p...",0


# Part II: Training

In [16]:
X = data['Text']
Y = data['Category']
# vector = CountVectorizer()
# counts = vector.fit_transform(data['Text'].values)
# cat = data['Category'].values

### Using various classification models and targetting 'Category'

In [17]:
NB_Model = MultinomialNB()
RFC_Model = RandomForestClassifier()
SVC_Model = SVC(probability=True)
KNC_Model = KNeighborsClassifier()
DTC_Model = DecisionTreeClassifier()

### Checking the accuracy using 90/10 train/test split

In [25]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=5)

In [26]:
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)
RFCtest = RFC_Model.fit(X_train_vectorized, Y_train)

# RFCtest = RandomForestClassifier().fit(X_train,Y_train)
acc_rfc = RFCtest.score(X_test_vectorized, Y_test)
print('The Random Forest Algorithm has an accuracy of', acc_rfc*100)
rf_predictions = RFC_Model.predict(X_test_vectorized)
# 93

The Random Forest Algorithm has an accuracy of 88.14229249011858


### Entering text to predict the category

In [27]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

2023-11-01 15:49:33.290209: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 93763584 exceeds 10% of free system memory.


<h4>Build Model</h4>

In [28]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [29]:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=METRICS)

In [31]:
model.fit(X_train, Y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f86107fc070>

In [32]:
model.evaluate(X_test, Y_test)



[-2.686659574508667,
 0.30039525032043457,
 0.7008928656578064,
 0.940119743347168]

In [38]:
import numpy as np
bert_predictions = model.predict(X_test)
y_pred_classes = np.argmax(bert_predictions, axis=1)
accuracy = accuracy_score(Y_test, y_pred_classes)
print(f'Accuracy: {accuracy * 100}%')
# print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 33.99209486166008%


In [34]:
# rf_predictions = RFC_Model.predict(X_test)

In [35]:
prediction_df = pd.DataFrame({'RF_Predictions': rf_predictions, 'BERT_Predictions': bert_predictions[:, 0]})

In [40]:
# new_text_samples = ["There was a quiet whisper in the wind that scratched against her skin. Amber lamplight gleamed off her eyes. She shone under the moon's gaze, the faint glow brightening her ashen skin, her eyes dull and hollow other than the soft glimmer of the street lamps"]
# new_predictions = model.predict(new_text_samples)

# print(f"ChatGPT --> {(100-(new_predictions[0][0])* 100):.2f} %\nHuman --> {(new_predictions[0][0])*100:.2f} %")

ChatGPT --> 0.00 %
Human --> 100.00 %


In [39]:
from sklearn.linear_model import LogisticRegression
meta_learner = LogisticRegression()
meta_learner.fit(prediction_df, Y_test)

# Make predictions with the meta-learner
meta_predictions = meta_learner.predict(prediction_df)

# Evaluate the ensemble model
ensemble_accuracy = accuracy_score(Y_test, meta_predictions)
print(f'Ensemble Model Accuracy: {ensemble_accuracy * 100}%')
# print(f'Ensemble Model Accuracy: {ensemble_accuracy * 100:.2f}%')

Ensemble Model Accuracy: 88.14229249011858%
