# Part I: Data Gathering and Preprocessing

### Importing Libraries

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

2023-11-17 18:01:20.017154: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-17 18:01:20.441646: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-17 18:01:20.441684: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-17 18:01:20.445582: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-17 18:01:20.743784: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-17 18:01:20.744922: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

### Importing scikit-learn classifiers

In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

### Importing Data

In [11]:
data = pd.read_csv("/home/unknown/AI Project/Combined.csv", header=0)
data.head(5)

Unnamed: 0,Text,Category
0,"\nOnce upon a time, there were four friends na...",0
1,"\nSure, here is a story about a character who ...",0
2,"\nSure, here is a story about a character who ...",0
3,"\nSure, here is a story about a person who fin...",0
4,"\nSure, here is a story set in a world where p...",0


# Part II: Training

In [12]:
X = data['Text']
Y = data['Category']
# vector = CountVectorizer()
# counts = vector.fit_transform(data['Text'].values)
# cat = data['Category'].values

In [13]:
# Convert labels to one-hot encoded format (for multi-class classification)
Y_one_hot = tf.keras.utils.to_categorical(Y, num_classes=3)  

### Using various classification models and targetting 'Category'

In [14]:
# NB_Model = MultinomialNB()
RFC_Model = RandomForestClassifier()
# SVC_Model = SVC(probability=True)
# KNC_Model = KNeighborsClassifier()
# DTC_Model = DecisionTreeClassifier()

### Checking the accuracy using 90/10 train/test split

In [15]:
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=42)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y_one_hot, test_size=0.2, random_state=42)

In [16]:
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)
RFCtest = RFC_Model.fit(X_train_vectorized, Y_train)

# RFCtest = RandomForestClassifier().fit(X_train,Y_train)
acc_rfc = RFCtest.score(X_test_vectorized, Y_test)
print('The Random Forest Algorithm has an accuracy of', acc_rfc*100)
rf_predictions = RFC_Model.predict(X_test_vectorized)
# 93

The Random Forest Algorithm has an accuracy of 81.81818181818183


### Entering text to predict the category

In [17]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

2023-11-17 18:04:45.491847: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 93763584 exceeds 10% of free system memory.


<h4>Build Model</h4>

In [18]:
# Input layer for text
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

# Preprocess text using BERT preprocessing layer
preprocessed_text = bert_preprocess(text_input)

# Encode processed text using BERT encoder
outputs = bert_encoder(preprocessed_text)

# Construct the neural network layers for classification
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
output_layer = tf.keras.layers.Dense(3, activation='softmax', name="output")(l)  # Output layer for 3 classes

# Build and compile the model
model = tf.keras.Model(inputs=[text_input], outputs=[output_layer])

In [19]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])  # Using accuracy as a metric

In [21]:
model.fit(X_train, Y_train, epochs=5, batch_size=32)  # You might need to adjust epochs and batch_size

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7fb08c241150>

In [22]:
# Evaluate the model
results = model.evaluate(X_test, Y_test)
print("Test accuracy:", results[1] * 100)  # results[1] contains accuracy

Test accuracy: 78.65612506866455


In [29]:
import numpy as np
bert_predictions = model.predict(X_test)
y_pred_classes = np.argmax(bert_predictions, axis=1)
Y_test_classes = np.argmax(Y_test, axis=1)
accuracy = accuracy_score(Y_test_classes, y_pred_classes)
print(f'Accuracy: {accuracy * 100}%')
# print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 78.65612648221344%


In [42]:

# Feature engineering: Add additional features (word count, average word length, etc.)

data['Word_Count'] = data['Text'].apply(lambda x: len(str(x).split()))
data['Avg_Word_Length'] = data['Text'].apply(lambda x: np.mean([len(word) for word in str(x).split()]))


X_test_extra_features = data.loc[X_test.index, ['Word_Count', 'Avg_Word_Length']].values

# Reconverting one-hot encoded Y_test back to 1D array of labels
Y_test_labels = np.argmax(Y_test, axis=1)

# Apply stacking ensemble using RFC and GLM predictions along with additional features
final = np.hstack((rf_predictions, bert_predictions, X_test_extra_features))  


from sklearn.linear_model import LogisticRegression
meta_classifier = LogisticRegression(max_iter=10000)  
# meta_classifier = LogisticRegression(solver='liblinear', max_iter=1000)


meta_classifier.fit(final, Y_test_labels)  

ensemble_predictions = meta_classifier.predict(final)

ensemble_accuracy = accuracy_score(Y_test_labels, ensemble_predictions)
print(f'Ensemble Model Accuracy: {ensemble_accuracy * 100}%')
# print(f'Ensemble Model Accuracy: {ensemble_accuracy * 100:.2f}%')

Ensemble Model Accuracy: 94.0711462450593%
