<a href="https://colab.research.google.com/github/Ayobamijeje/curenetics_test/blob/model_training_branch/currenetics_trainedmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Curenetic Sentiment Analysis


#### Load data
#### Data cleaning and visualisation
#### Generate data for data augmentation using transformers and nltk
#### Model pipeline for ML text transformation
#### Train Model - DL and Logistic regression
#### Save best model  

In [None]:
## Libraries

from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.metrics import classification_report,accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from wordcloud import WordCloud


In [None]:
# Load the raw feedback CSV into a DataFrame.
# NOTE(review): the absolute /content/ path presumably points at the Colab upload
# directory; the file must exist there before this cell runs.
data  = pd.read_csv('/content/feedback_data.csv')


In [None]:
# Preview the first 10 rows of the raw feedback table.
data.head(10)

In [None]:
# Print column dtypes and non-null counts.
# Author's observation on the original data: 2 columns, 51 rows, string data, no nulls.

data.info()


In [None]:
# Class balance of the raw data (author's observation: Neg 27, Pos 24)
ax = data['Sentiment'].value_counts().plot(kind='barh')
ax.invert_yaxis()
ax.set_title('Sentiment review')

In [None]:
# Total number of whitespace-separated words across all feedback texts.
# The texts are joined with a space (not a comma) and split on any whitespace, so
# the last word of one feedback is no longer fused with the first word of the next
# (e.g. "slow,Great"). The count therefore differs from the figure obtained with the
# earlier comma-joined version.
lst_text = data['Feedback'].str.cat(sep=' ').split()
print(len(lst_text))


In [None]:
# Stop words make up the majority of the most frequent tokens
top_words = pd.Series(lst_text).value_counts()[:10]
ax = top_words.plot(kind='bar')
ax.set_title('First 10 words')

#### Text Augmentation


In [None]:
import nltk
import random
from nltk.corpus import wordnet
# Download the NLTK resources named 'wordnet' and 'omw-1.4'; the synonym lookup
# below goes through nltk.corpus.wordnet.
# NOTE(review): 'omw-1.4' is presumably required by newer NLTK WordNet readers - confirm.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Function to get synonyms
def get_synonyms(word):
    """Return the WordNet synonyms of ``word`` as a sorted list of lowercase strings.

    Every lemma of every synset WordNet returns for ``word`` is collected,
    underscores in lemma names are replaced by spaces, and the word itself is
    excluded.

    The self-exclusion check is case-insensitive, so a capitalised input such as
    "Great" does not get "great" back as its own synonym. The result is sorted
    because iteration order of a set of strings can change between interpreter
    runs, which would make ``random.choice`` over it irreproducible even with a
    fixed seed.

    Returns an empty list when WordNet has no other lemma for ``word``.
    """
    target = word.lower()
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            name = lemma.name().replace("_", " ").lower()
            if name != target:
                synonyms.add(name)
    return sorted(synonyms)

# Basic synonym replacement
def augment_sentence(sentence, n=1):
    """Build ``n`` synonym-substituted variants of ``sentence`` and join them.

    The sentence is split on whitespace. In each variant, every token for which
    ``get_synonyms`` returns at least one candidate is replaced by a randomly
    chosen candidate; tokens without candidates are kept unchanged.

    Returns a single string: the ``n`` variants joined with one space. With the
    default ``n=1`` this is just one augmented sentence.
    """
    tokens = sentence.split()
    variants = []
    for _ in range(n):
        replaced = []
        for token in tokens:
            candidates = get_synonyms(token)
            replaced.append(random.choice(candidates) if candidates else token)
        variants.append(' '.join(replaced))
    return ' '.join(variants)


In [None]:
# Synonym-augment the first 10 feedback texts and print them above the originals
exp1 = data['Feedback'][:10].apply(augment_sentence)
print(exp1)
print('------------------')
print(data['Feedback'][:10])

In [None]:
from transformers import pipeline

# Text-to-text generation pipeline used for paraphrase-based augmentation.
# Both the model weights and the tokenizer are loaded from the
# "Vamsi/T5_Paraphrase_Paws" checkpoint.
paraphraser = pipeline(
    "text2text-generation",
    model="Vamsi/T5_Paraphrase_Paws",
    tokenizer="Vamsi/T5_Paraphrase_Paws"
)

def augment_sentence02(text, num_return_sequences=1):
    """Paraphrase ``text`` with the module-level ``paraphraser`` pipeline.

    The input is wrapped in the "paraphrase: ... </s>" prompt and sampled
    (top-k 100, top-p 0.90) with a generation ``max_length`` of 30.

    Returns a single string: the ``generated_text`` of every returned sequence,
    joined with one space. With the default ``num_return_sequences=1`` this is
    one paraphrase.
    """
    generation_kwargs = dict(
        max_length=30,
        num_return_sequences=num_return_sequences,
        do_sample=True,
        top_k=100,
        top_p=0.90,
    )
    outputs = paraphraser(f"paraphrase: {text} </s>", **generation_kwargs)
    return ' '.join(item['generated_text'] for item in outputs)


In [None]:
# Paraphrase the first 10 feedback texts and print them above the originals
exp2 = data['Feedback'][:10].apply(augment_sentence02)
print(exp2)
print('------------------')
print(data['Feedback'][:10])


In [None]:
# One WordNet-synonym variant of every feedback text
data['aug_sentence1'] = data['Feedback'].apply(augment_sentence)

# One T5 paraphrase of every feedback text
data['aug_sentence2'] = data['Feedback'].apply(augment_sentence02)


data


In [None]:
# Build one (Feedback, Sentiment) frame per augmentation. rename() returns a new
# frame here; renaming in place on a column slice of `data` raises pandas'
# SettingWithCopyWarning.
data01 = data[['aug_sentence1', 'Sentiment']].rename(columns={'aug_sentence1': 'Feedback'})
data02 = data[['aug_sentence2', 'Sentiment']].rename(columns={'aug_sentence2': 'Feedback'})

# Stack originals, synonym variants and paraphrases in that order. With
# ignore_index=True the result has a fresh 0..N-1 index, and row i, i + len(data)
# and i + 2 * len(data) all stem from the same original feedback.
combine_data = pd.concat(
    [
        data[['Feedback', 'Sentiment']],
        data01[['Feedback', 'Sentiment']],
        data02[['Feedback', 'Sentiment']],
    ],
    ignore_index=True,
    axis=0,
)



In [None]:
# NOTE: the bare `combine_data` expression below is not the last expression of the
# cell, so the notebook does not display it; only the info() summary is printed.
combine_data
combine_data.info()

In [None]:
nltk.download('stopwords')

# English stop-word list used by remove_stopwords().
# The corpus reader is reached through `nltk.corpus.stopwords` instead of the
# imported name `stopwords`, because this assignment rebinds `stopwords` to a plain
# list. Going through the module attribute keeps the cell re-runnable: calling
# `stopwords.words(...)` a second time would fail on the list.
stopwords = nltk.corpus.stopwords.words('english')

In [None]:

# Creating a function to remove stopwords
def remove_stopwords(text, stop_words=None):
    """Lower-case ``text`` and drop every whitespace-separated token that is a stop word.

    Parameters
    ----------
    text : str
        Raw sentence.
    stop_words : iterable of str, optional
        Words to remove. When omitted, the notebook-level ``stopwords`` list is
        used, which is the original behaviour.

    Returns
    -------
    str
        The remaining tokens joined with single spaces ("" if nothing remains).
    """
    if stop_words is None:
        stop_words = stopwords
    # A frozenset gives constant-time membership checks for each token.
    blocked = frozenset(stop_words)
    return ' '.join(token for token in text.lower().split() if token not in blocked)

In [None]:
# FB_WSW = feedback text with stop words removed (lower-cased by remove_stopwords)
combine_data['FB_WSW'] = combine_data['Feedback'].apply(remove_stopwords)

combine_data

In [None]:
# Longest cleaned feedback, measured in characters
combine_data['FB_WSW'].map(len).max()

In [None]:

# Hint about words: one word cloud per sentiment class, built from the
# stop-word-free text.

sentiments = combine_data['Sentiment'].unique()
n_cols = 2
n_rows = -(-len(sentiments) // n_cols)  # ceiling division

# The grid is sized to the number of classes, so no negative hspace is needed to
# pull the panels together.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 3 * n_rows), squeeze=False)

for ax, sentiment in zip(axes.ravel(), sentiments):
    text_data = " ".join(combine_data.loc[combine_data['Sentiment'] == sentiment, 'FB_WSW'])
    wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text_data)
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.axis("off")
    ax.set_title(sentiment)

# Hide grid cells left over when the number of classes is odd
for ax in axes.ravel()[len(sentiments):]:
    ax.axis("off")

fig.tight_layout()

plt.show()

### ML text pipeline - text_vectorization
### The tf TextVectorization layer creates a pipeline that handles text: raw_text → standardized → tokenized → mapped to tf_idf → padded


In [None]:
# Sentiment to number.
# pd.factorize assigns integer codes in order of first appearance: code 0 is the
# first sentiment value found in combine_data. `names` holds the unique sentiment
# values, so names[code] maps a code back to its sentiment string.
combine_data['label'], names = pd.factorize(combine_data['Sentiment'])  # integer label encoding



In [None]:
X = combine_data['FB_WSW']
y = combine_data['label']

# combine_data stacks [originals, synonym variants, paraphrases], each len(data)
# rows long, so position % len(data) identifies the original feedback a row was
# derived from. A plain row-wise split puts near-duplicates of the same feedback on
# both sides, which leaks information and inflates validation scores.
n_source = len(data)
assert len(combine_data) % n_source == 0, "combine_data is not a whole number of copies of data"
groups = np.arange(len(combine_data)) % n_source

# Stratified AND group-aware split: the first of 5 folds is held out (~20%), class
# proportions are kept, and all variants of one feedback stay on the same side.
splitter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=3)
train_idx, val_idx = next(splitter.split(X, y, groups))

X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]


# Combine the split data back into DataFrames for easier handling later
train_dataset = pd.DataFrame({'FB_WSW': X_train, 'label': y_train})
val_dataset = pd.DataFrame({'FB_WSW': X_val, 'label': y_val})


In [None]:
# Write the training split (FB_WSW text and integer label) to the working directory,
# without the pandas index column.
train_dataset.to_csv('train_pandas.csv', index=False)


In [None]:
# Wrap the pandas splits as tf.data datasets of (text, label) pairs
train_dataset_ = tf.data.Dataset.from_tensor_slices((train_dataset['FB_WSW'].values, train_dataset['label'].values))
val_dataset_ = tf.data.Dataset.from_tensor_slices((val_dataset['FB_WSW'].values, val_dataset['label'].values))

# A shuffle buffer smaller than the dataset only mixes neighbouring rows; using the
# full training size gives a uniform shuffle each epoch. The seed makes the order
# reproducible.
train_ds = train_dataset_.shuffle(buffer_size=len(train_dataset), seed=3).batch(16)
# Validation data is only evaluated, so it is batched without shuffling.
val_ds = val_dataset_.batch(16)


In [None]:
# Debug peek (flagged "remove" by the author): shape of one unbatched text tensor
for sample_text, _sample_label in train_dataset_.take(1):
    print(sample_text.shape)

In [None]:
from tensorflow.keras import layers

max_length = 200  # not referenced by the vectorizer below
max_tokens = 15000  # upper bound on the vocabulary size

# Unigram + bigram TF-IDF features.
# TextVectorization lower-cases the text and strips punctuation by default.
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    ngrams=2,
    output_mode="tf_idf",
)

# Learn the vocabulary and IDF weights from the training texts only
text_only_train_ds = train_ds.map(lambda text, label: text)
text_vectorization.adapt(text_only_train_ds)

In [None]:
def _vectorize_batch(text, label):
    """Replace the raw text batch with its TF-IDF matrix; labels pass through."""
    return text_vectorization(text), label


train_int = train_ds.map(_vectorize_batch, num_parallel_calls=4)
val_int = val_ds.map(_vectorize_batch, num_parallel_calls=4)


In [None]:


# Number of terms in the adapted vectorizer's vocabulary; get_model() uses this as
# the width of the network's input layer.
vocab_size = len(text_vectorization.get_vocabulary())
vocab_size

### DL model

In [None]:

def get_model(max_tokens=vocab_size, hidden_dim=3):
    """Build and compile a small dense binary classifier over TF-IDF vectors.

    Parameters
    ----------
    max_tokens : int
        Width of the input vector. Defaults to the vocabulary size at the time
        this function was defined. Previously this argument was accepted but
        ignored in favour of the global ``vocab_size``; it now sets the input shape.
    hidden_dim : int
        Number of units in the single hidden ReLU layer.

    Returns
    -------
    keras.Model
        Input -> Dense(hidden_dim, relu) -> Dropout(0.5) -> Dense(1, sigmoid),
        compiled with rmsprop, binary cross-entropy and the accuracy metric.
    """
    inputs = keras.Input(shape=(max_tokens,))
    hidden = layers.Dense(hidden_dim, activation="relu")(inputs)
    hidden = layers.Dropout(0.5)(hidden)
    outputs = layers.Dense(1, activation="sigmoid")(hidden)
    model = keras.Model(inputs, outputs)

    model.compile(
        optimizer="rmsprop",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

In [None]:
# Build the classifier with the default input width and hidden size, then print
# its layer summary.
model = get_model()
model.summary()

In [None]:
# With save_best_only=True the checkpoint file is only overwritten when the
# monitored quantity improves.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    "/content/Curenetics.keras",
    save_best_only=True,
)
callbacks = [checkpoint_cb]

model.fit(
    train_int,
    validation_data=val_int,
    epochs=40,
    callbacks=callbacks,
)


In [None]:
# Reload the model from the path the ModelCheckpoint callback wrote to during training.
loaded_model = tf.keras.models.load_model('/content/Curenetics.keras')


In [None]:
texts = ['it makes no sense at all ', 'i hate Movies with sad ending', 'page loads slow frustrates every time', 'The form submission failed multiple times', 'found website confusing hard use', ' the display is horrible', 'Great site great site love it love it', 'i love you']

# The model was trained on the stop-word-free column (FB_WSW), so new text goes
# through the same remove_stopwords() step before vectorization.
cleaned_texts = [remove_stopwords(text) for text in texts]

# One batched forward pass; the result has one sigmoid output per input text.
predictions = loaded_model.predict(text_vectorization(tf.constant(cleaned_texts)))

for text, predict in zip(texts, predictions):
    # The sigmoid output is the score for label 1. The label is mapped back through
    # `names` (from pd.factorize) instead of a hard-coded Positive/Negative
    # assumption, which is only right if the first row of the data has the
    # sentiment that was assumed to be label 0.
    label_id = int(predict[0] >= 0.5)
    print(names[label_id])

    print(predict)




In [None]:
# Flatten the batched validation set into per-example (features, label) pairs
unbatched_test_ds = val_int.unbatch()

# Materialise the dataset in a single pass so that features and labels stay aligned
examples = list(unbatched_test_ds)

test_inputs = np.array([features for features, _ in examples])
test_labels = np.array([label for _, label in examples])



In [None]:



# Sigmoid outputs below 0.5 become label 0, everything else label 1
predict = loaded_model.predict(test_inputs)
prediction = np.where(predict.ravel() < 0.5, 0, 1).tolist()


dl_accuracy = accuracy_score(test_labels, prediction)
print("Accuary: ", dl_accuracy)
dl_confusion = confusion_matrix(test_labels, prediction)
print(dl_confusion)



In [None]:
# Confusion matrix of the DL model on the validation examples
cm = confusion_matrix(test_labels, prediction)

# Tick labels: sentiment names of the label codes present in the validation labels
present_classes = names[np.unique(test_labels)]
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=present_classes)
disp.plot(cmap='viridis')
plt.xticks(rotation=45, ha='right')
plt.title('Confusion Matrix')
plt.show()

### Logistic Regression


### tf dataset converted to np for logistic regression

In [None]:
import numpy as np

def dataset_to_numpy(datasets):
    """Collect an iterable of (features, labels) batches into two NumPy arrays.

    Each batch element must expose ``.numpy()``. Feature batches are stacked
    row-wise with ``np.vstack`` and label batches are joined with
    ``np.concatenate``, both in iteration order.

    Returns
    -------
    tuple
        ``(X, y)`` - the stacked feature matrix and the concatenated labels.
    """
    batches = [(batch_x.numpy(), batch_y.numpy()) for batch_x, batch_y in datasets]
    features = np.vstack([batch[0] for batch in batches])
    labels = np.concatenate([batch[1] for batch in batches])
    return features, labels

# Convert the vectorized tf.data splits to NumPy for scikit-learn.
# NOTE: this rebinds X_train / y_train / X_val / y_val, which earlier held the
# pandas objects produced by the train/validation split.
X_train, y_train = dataset_to_numpy(train_int)
X_val, y_val = dataset_to_numpy(val_int)



In [None]:
from sklearn.linear_model import LogisticRegression

# max_iter was 20, well below scikit-learn's default of 100, so the solver was
# likely to stop before converging on these unscaled TF-IDF features (scikit-learn
# reports this as a ConvergenceWarning). A generous cap lets the optimiser finish;
# it stops early once converged.
sl_model = LogisticRegression(max_iter=1000)
sl_model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score


# Predictions on the TRAINING features (used below for the training-set metrics).
# NOTE: this rebinds `prediction`, which earlier held the DL model's validation predictions.
prediction = sl_model.predict(X_train)

In [None]:
# Training-set accuracy and confusion matrix for the logistic regression
train_accuracy = accuracy_score(y_train, prediction)
print("Accuary: ", train_accuracy)
train_confusion = confusion_matrix(y_train, prediction)
print(train_confusion)



In [None]:
# Validation-set accuracy and confusion matrix for the logistic regression
prediction_val = sl_model.predict(X_val)

val_accuracy = accuracy_score(y_val, prediction_val)
print("Accuary: ", val_accuracy)
val_confusion = confusion_matrix(y_val, prediction_val)
print(val_confusion)



In [None]:
# Confusion matrix of the logistic regression on the validation split
cm = confusion_matrix(y_val, prediction_val)

# Tick labels: sentiment names of the label codes present in y_val
present_classes = names[np.unique(y_val)]
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=present_classes)
disp.plot(cmap='viridis')
plt.xticks(rotation=45, ha='right')
plt.title('Confusion Matrix')
plt.show()

In [None]:
import tensorflow as tf
import keras
# Record the TensorFlow and Keras versions of the running environment.
print(f"TensorFlow version: {tf.__version__}")
print(f"Keras version: {keras.__version__}")

## Summary

### Sentiment analysis trained with DL and logistic regression
### Augmented with transformers and nltk
### Logistic regression performed better, likely due to the very small dataset - DL validation accuracy: 80.3%, logistic regression accuracy: 93%
### The DL model shows steady training accuracy around 80-87% and consistent validation accuracy at 77-81%, with gradually decreasing loss. This indicates stable learning and good generalization, though validation accuracy plateaus, suggesting more data may be needed to boost performance.
### Model is used downstream by the application