In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras.utils import to_categorical



In [None]:
# Adjust paths as necessary
# All input locations are gathered in one mapping so they can be edited in a
# single place; each CSV is then read into its own DataFrame.
CSV_PATHS = {
    'original': '/content/preprocessed_twitter_data_small .csv',
    'nemotron': '/content/synthetic_data_nemotron.csv',
    'mixtral': '/content/synthetic_data_mixtral.csv',
    'llama': '/content/synthetic_data_llama.csv',
}

original_data = pd.read_csv(CSV_PATHS['original'])
nemotron_data = pd.read_csv(CSV_PATHS['nemotron'])
mixtral_data = pd.read_csv(CSV_PATHS['mixtral'])
llama_data = pd.read_csv(CSV_PATHS['llama'])


In [None]:
# Configurations of different datasets
# NOTE: `data_configs` is assigned again in the next cell, which replaces
# this mapping when the notebook is run top to bottom.
def _stack_on_original(*extra_frames):
    """Return original_data with the given frames appended row-wise under a fresh index."""
    return pd.concat([original_data, *extra_frames], ignore_index=True)


data_configs = {
    "Original": original_data,
    "Original + Nemotron": _stack_on_original(nemotron_data),
    "Original + Mixtral": _stack_on_original(mixtral_data),
    "Original + Llama": _stack_on_original(llama_data),
    "All Combined": _stack_on_original(nemotron_data, mixtral_data, llama_data),
}


In [None]:
def concat_synthetic_data(original, synthetic):
    """Append synthetic samples to the original samples as extra rows.

    The synthetic frame stores its text under 'synthetic_text' whereas the
    original frame uses 'text'. pd.concat aligns on column names, so without
    renaming the synthetic rows end up with NaN in 'text' (and the original
    rows with NaN in 'synthetic_text').

    Args:
        original: DataFrame holding at least 'text' and 'sentiment'.
        synthetic: DataFrame holding at least 'synthetic_text' and 'sentiment'.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: the rows of `original`
        followed by the synthetic rows, whose text is placed in 'text'.

    Raises:
        KeyError: If `synthetic` lacks 'synthetic_text' or 'sentiment'.
    """
    # Select first, then rename: any other column the synthetic frame may
    # carry is dropped before it could clash with the new 'text' name.
    synthetic_aligned = synthetic[['synthetic_text', 'sentiment']].rename(
        columns={'synthetic_text': 'text'}
    )
    return pd.concat([original, synthetic_aligned], ignore_index=True)

# Configurations of different dataset combinations
# "All Combined" assembles its synthetic parts inline, so each part is renamed
# to the original frame's 'text' column here. pd.concat aligns on column
# names: left as 'synthetic_text', every synthetic row would have NaN in
# 'text', which is the column the later cells read.
data_configs = {
    "Original": original_data,
    "Original + Nemotron": concat_synthetic_data(original_data, nemotron_data),
    "Original + Mixtral": concat_synthetic_data(original_data, mixtral_data),
    "Original + Llama": concat_synthetic_data(original_data, llama_data),
    "All Combined": pd.concat([
        original_data,
        *(
            synthetic[['synthetic_text', 'sentiment']].rename(columns={'synthetic_text': 'text'})
            for synthetic in (nemotron_data, mixtral_data, llama_data)
        ),
    ], ignore_index=True)
}


In [None]:
def split_data(data, num_classes=4):
    """Split a labelled text frame into train/test texts and one-hot targets.

    Args:
        data: DataFrame with a 'text' column (input feature) and a
            'sentiment' column (target). If a 'synthetic_text' column is
            present, it supplies the text for rows whose 'text' is missing.
        num_classes: Width of the one-hot encoding. Defaults to 4.

    Returns:
        X_train, X_test, y_train, y_test from train_test_split, using a
        20% test share and random_state=42.

    Raises:
        ValueError: If the frame contains more distinct sentiment labels
            than `num_classes`.
    """
    # Using 'text' as the input feature and 'sentiment' as the target
    X = data['text']
    if 'synthetic_text' in data.columns:
        # Frames built by concatenating original and synthetic rows keep the
        # synthetic text in its own column; recover it instead of passing NaN
        # on to the tokenizer (which fails with "'float' object has no
        # attribute 'lower'").
        X = X.fillna(data['synthetic_text'])

    # Rows without text or without a label cannot be used. A missing label is
    # especially harmful: pd.Categorical gives it code -1, and one-hot
    # encoding by index would then mark the last class.
    usable = X.notna() & data['sentiment'].notna()
    X = X[usable].astype(str)

    # Convert sentiment labels to integer codes, then one-hot encode
    labels = pd.Categorical(data.loc[usable, 'sentiment'])
    if len(labels.categories) > num_classes:
        raise ValueError(
            f"Found {len(labels.categories)} distinct sentiment labels "
            f"{list(labels.categories)}, expected at most {num_classes}"
        )
    y = to_categorical(labels.codes, num_classes=num_classes)

    return train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary settings for the shared tokenizer, named so they are easy to tune.
VOCAB_SIZE = 5000
OOV_TOKEN = "<OOV>"

# The vocabulary is fitted on the original dataset's text only.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(original_data['text'])

def preprocess_text(X_train, X_test):
    """Turn train/test texts into equal-width, post-padded integer sequences.

    Both sets are encoded with the notebook-level `tokenizer`. The pad width
    is the length of the longest encoded training text, and the same width
    is applied to the test set.

    Returns:
        (X_train_pad, X_test_pad)
    """
    train_ids = tokenizer.texts_to_sequences(X_train)
    test_ids = tokenizer.texts_to_sequences(X_test)

    # Dynamic padding: width is taken from the training split only
    pad_width = max(map(len, train_ids))

    def _pad(id_sequences):
        return pad_sequences(id_sequences, maxlen=pad_width, padding='post')

    return _pad(train_ids), _pad(test_ids)


In [None]:
def create_model(input_shape, vocab_size=5000, embedding_dim=32):
    """Build and compile the 4-class sentiment classifier.

    The model is fed padded sequences of integer token ids. Passing those ids
    straight into Dense layers makes the network treat arbitrary vocabulary
    indices as numeric magnitudes, so the ids are first mapped to learned
    vectors by an Embedding layer and averaged over the sequence (padding
    id 0 is masked out) before the Dense stack.

    Args:
        input_shape: Length of each padded input sequence.
        vocab_size: Number of distinct token ids the Embedding accepts. The
            default matches the notebook tokenizer's num_words=5000.
        embedding_dim: Size of each learned token vector.

    Returns:
        A compiled Sequential model taking (batch, input_shape) integer
        input and producing (batch, 4) softmax probabilities.
    """
    # Imported locally: the notebook's first import cell only brings in Dense
    # and Dropout from tensorflow.keras.layers.
    from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Input

    model = Sequential([
        # An explicit Input layer replaces `input_shape=` on the first Dense
        # layer, which Keras warns about for Sequential models.
        Input(shape=(input_shape,), dtype='int32'),
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True),
        GlobalAveragePooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='sigmoid'),
        Dropout(0.5),
        Dense(4, activation='softmax')  # Use 'softmax' for multi-class sentiment classification
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [None]:
# Train one model per dataset configuration and record its test metrics.
results = {}

for config_name, data in data_configs.items():
    # Re-seed before every configuration so weight initialisation, dropout
    # and batch shuffling start from the same state each time. Without this,
    # differences between configurations are mixed with run-to-run noise and
    # the notebook does not reproduce its own numbers.
    tf.keras.utils.set_random_seed(42)

    # Split data
    X_train, X_test, y_train, y_test = split_data(data)
    # Preprocess text
    X_train_pad, X_test_pad = preprocess_text(X_train, X_test)

    # Create and train the model
    model = create_model(X_train_pad.shape[1])
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    model.fit(X_train_pad, y_train, epochs=10, validation_split=0.2, batch_size=32, callbacks=[early_stopping], verbose=1)

    # Evaluate the model
    y_pred = np.argmax(model.predict(X_test_pad), axis=1)
    y_true = np.argmax(y_test, axis=1)  # Convert one-hot to label indices

    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')  # Weighted F1 for multi-class

    # Save results
    results[config_name] = {'Accuracy': acc, 'F1 Score': f1}
    print(f"{config_name} - Accuracy: {acc}, F1 Score: {f1}")


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 218ms/step - accuracy: 0.2125 - loss: 1.8652 - val_accuracy: 0.1875 - val_loss: 1.5255
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.2010 - loss: 1.9308 - val_accuracy: 0.2188 - val_loss: 1.5534
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.2490 - loss: 1.8210 - val_accuracy: 0.2188 - val_loss: 1.5804
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.2635 - loss: 1.8250 - val_accuracy: 0.1875 - val_loss: 1.5457
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 325ms/step
Original - Accuracy: 0.2, F1 Score: 0.20504532652701987


AttributeError: 'float' object has no attribute 'lower'

In [None]:
# Mount Google Drive (if using Google Drive)
from google.colab import drive
#drive.mount('/content/drive')

# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical  # Import to_categorical
import tensorflow as tf

# Load the Datasets
# Adjust paths as necessary
# All input locations are gathered in one mapping so they can be edited in a
# single place; each CSV is then read into its own DataFrame.
CSV_PATHS = {
    'original': '/content/preprocessed_twitter_data_small .csv',
    'nemotron': '/content/synthetic_data_nemotron.csv',
    'mixtral': '/content/synthetic_data_mixtral.csv',
    'llama': '/content/synthetic_data_llama.csv',
}

original_data = pd.read_csv(CSV_PATHS['original'])
nemotron_data = pd.read_csv(CSV_PATHS['nemotron'])
mixtral_data = pd.read_csv(CSV_PATHS['mixtral'])
llama_data = pd.read_csv(CSV_PATHS['llama'])

# Combine Synthetic Data with Original Data
def concat_synthetic_data(original, synthetic):
    """Append synthetic samples to the original samples as extra rows.

    The synthetic frame stores its text under 'synthetic_text' whereas the
    original frame uses 'text'. pd.concat aligns on column names, so without
    renaming the synthetic rows end up with NaN in 'text' (and the original
    rows with NaN in 'synthetic_text').

    Args:
        original: DataFrame holding at least 'text' and 'sentiment'.
        synthetic: DataFrame holding at least 'synthetic_text' and 'sentiment'.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: the rows of `original`
        followed by the synthetic rows, whose text is placed in 'text'.

    Raises:
        KeyError: If `synthetic` lacks 'synthetic_text' or 'sentiment'.
    """
    # Select first, then rename: any other column the synthetic frame may
    # carry is dropped before it could clash with the new 'text' name.
    synthetic_aligned = synthetic[['synthetic_text', 'sentiment']].rename(
        columns={'synthetic_text': 'text'}
    )
    return pd.concat([original, synthetic_aligned], ignore_index=True)

# Configurations of different dataset combinations
# "All Combined" assembles its synthetic parts inline, so each part is renamed
# to the original frame's 'text' column here. pd.concat aligns on column
# names: left as 'synthetic_text', every synthetic row would have NaN in
# 'text', which is the column split_data reads.
data_configs = {
    "Original": original_data,
    "Original + Nemotron": concat_synthetic_data(original_data, nemotron_data),
    "Original + Mixtral": concat_synthetic_data(original_data, mixtral_data),
    "Original + Llama": concat_synthetic_data(original_data, llama_data),
    "All Combined": pd.concat([
        original_data,
        *(
            synthetic[['synthetic_text', 'sentiment']].rename(columns={'synthetic_text': 'text'})
            for synthetic in (nemotron_data, mixtral_data, llama_data)
        ),
    ], ignore_index=True)
}

# Define Train-Test Split Function
def split_data(data, num_classes=4):
    """Split a labelled text frame into train/test texts and one-hot targets.

    Args:
        data: DataFrame with a 'text' column (input feature) and a
            'sentiment' column (target). If a 'synthetic_text' column is
            present, it supplies the text for rows whose 'text' is missing.
        num_classes: Width of the one-hot encoding. Defaults to 4.

    Returns:
        X_train, X_test, y_train, y_test from train_test_split, using a
        20% test share and random_state=42.

    Raises:
        ValueError: If the frame contains more distinct sentiment labels
            than `num_classes`.
    """
    # Using 'text' as the input feature and 'sentiment' as the target
    X = data['text']
    if 'synthetic_text' in data.columns:
        # Frames built by concatenating original and synthetic rows keep the
        # synthetic text in its own column. Filling 'text' with '' would turn
        # every synthetic sample into an empty string, so take the text from
        # 'synthetic_text' instead.
        X = X.fillna(data['synthetic_text'])

    # Rows that still have no text, or have no label, are dropped rather than
    # trained on. A missing label is especially harmful: pd.Categorical gives
    # it code -1, and one-hot encoding by index would then mark the last class.
    usable = X.notna() & data['sentiment'].notna()
    X = X[usable].astype(str)

    # Convert sentiment labels to integer codes, then one-hot encode
    labels = pd.Categorical(data.loc[usable, 'sentiment'])
    if len(labels.categories) > num_classes:
        raise ValueError(
            f"Found {len(labels.categories)} distinct sentiment labels "
            f"{list(labels.categories)}, expected at most {num_classes}"
        )
    y = to_categorical(labels.codes, num_classes=num_classes)

    return train_test_split(X, y, test_size=0.2, random_state=42)

# Text Preprocessing (Tokenization)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary settings for the shared tokenizer, named so they are easy to tune.
VOCAB_SIZE = 5000
OOV_TOKEN = "<OOV>"

# The vocabulary is fitted on the original dataset's text only; missing
# entries are replaced by empty strings before fitting.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN)
original_texts = original_data['text'].fillna('')
tokenizer.fit_on_texts(original_texts)

def preprocess_text(X_train, X_test):
    """Turn train/test texts into equal-width, post-padded integer sequences.

    Both sets are encoded with the notebook-level `tokenizer`. The pad width
    is the length of the longest encoded training text, and the same width
    is applied to the test set.

    Returns:
        (X_train_pad, X_test_pad)
    """
    train_ids = tokenizer.texts_to_sequences(X_train)
    test_ids = tokenizer.texts_to_sequences(X_test)

    # Dynamic padding: width is taken from the training split only
    pad_width = max(map(len, train_ids))

    def _pad(id_sequences):
        return pad_sequences(id_sequences, maxlen=pad_width, padding='post')

    return _pad(train_ids), _pad(test_ids)

# Define Model Architecture
def create_model(input_shape, vocab_size=5000, embedding_dim=32):
    """Build and compile the 4-class sentiment classifier.

    The model is fed padded sequences of integer token ids. Passing those ids
    straight into Dense layers makes the network treat arbitrary vocabulary
    indices as numeric magnitudes, so the ids are first mapped to learned
    vectors by an Embedding layer and averaged over the sequence (padding
    id 0 is masked out) before the Dense stack.

    Args:
        input_shape: Length of each padded input sequence.
        vocab_size: Number of distinct token ids the Embedding accepts. The
            default matches the notebook tokenizer's num_words=5000.
        embedding_dim: Size of each learned token vector.

    Returns:
        A compiled Sequential model taking (batch, input_shape) integer
        input and producing (batch, 4) softmax probabilities.
    """
    # Imported locally: the notebook's import block does not bring these two
    # layers into scope.
    from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D

    model = Sequential([
        Input(shape=(input_shape,), dtype='int32'),
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True),
        GlobalAveragePooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dense(4, activation='softmax')  # Output layer for 4-class classification
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Train and Evaluate on Each Dataset Configuration
results = {}

for config_name, data in data_configs.items():
    # Re-seed before every configuration so weight initialisation, dropout
    # and batch shuffling start from the same state each time. Without this,
    # differences between configurations are mixed with run-to-run noise and
    # the notebook does not reproduce its own numbers.
    tf.keras.utils.set_random_seed(42)

    # Split data
    X_train, X_test, y_train, y_test = split_data(data)
    # Preprocess text
    X_train_pad, X_test_pad = preprocess_text(X_train, X_test)

    # Create and train the model
    model = create_model(X_train_pad.shape[1])
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    model.fit(X_train_pad, y_train, epochs=15, validation_split=0.2, batch_size=32, callbacks=[early_stopping], verbose=1)

    # Evaluate the model
    y_pred = np.argmax(model.predict(X_test_pad), axis=1)
    y_true = np.argmax(y_test, axis=1)  # Convert one-hot to label indices

    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')  # Weighted F1 for multi-class

    # Save results
    results[config_name] = {'Accuracy': acc, 'F1 Score': f1}
    print(f"{config_name} - Accuracy: {acc}, F1 Score: {f1}")

# Display Results
# One row per dataset configuration, one column per metric, for easier comparison
results_df = pd.DataFrame.from_dict(results, orient='index')
print(results_df)


Epoch 1/15
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 209ms/step - accuracy: 0.2646 - loss: 179.6979 - val_accuracy: 0.3125 - val_loss: 85.0058
Epoch 2/15
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1677 - loss: 173.9027 - val_accuracy: 0.3125 - val_loss: 81.2959
Epoch 3/15
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.2948 - loss: 136.1296 - val_accuracy: 0.3125 - val_loss: 77.4042
Epoch 4/15
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.2073 - loss: 126.4144 - val_accuracy: 0.3125 - val_loss: 70.7206
Epoch 5/15
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.2917 - loss: 97.8422 - val_accuracy: 0.3125 - val_loss: 60.9189
Epoch 6/15
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.3656 - loss: 90.2444 - val_accuracy: 0.3125 - val_loss: 53.6192
Epoch 7/15
[1m4/4[0m [32m━