In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive. The dataset
# and model paths used in the rest of this notebook live under
# /content/drive/MyDrive, so this cell has to run before any of them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import tensorflow
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.optimizers import Adam
import json

# Load the preprocessed dataset from the mounted Google Drive.
file_path = r'/content/drive/MyDrive/FINAL/preprocessed_dataset.csv'
df = pd.read_csv(file_path)

# Coerce every text entry to str before tokenizing. pandas reads empty CSV
# fields back as float NaN, and the Keras Tokenizer expects strings, so a
# single missing value would otherwise abort fit_on_texts. Missing entries
# become the literal token 'nan'.
df['text'] = df['text'].astype(str)

# Encode the target labels as integers (the column is overwritten in place).
# NOTE(review): the sigmoid output + binary_crossentropy below assume exactly
# two distinct target values - confirm against the dataset.
label_encoder = LabelEncoder()
df['target'] = label_encoder.fit_transform(df['target'])

# Split the dataset into training and testing sets (80/20, fixed seed).
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['target'], test_size=0.2, random_state=42
)

# Tokenize the texts. The vocabulary is learned from the training split only;
# words unseen during fitting map to the '<OOV>' token.
max_words = 1000000  # Upper bound on vocabulary size; adjust to the dataset
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)

# Save the tokenizer so the same word index can be reused later.
# NOTE(review): to_json() already returns a JSON string, so json.dumps() adds
# a second layer of JSON encoding. Kept as is so that anything already reading
# tokenizer.json keeps working: a reader has to json.load() the file and hand
# the resulting string to tokenizer_from_json().
# The relative path means the file lands in the current working directory,
# not on Drive.
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

# Convert texts to sequences of word indices.
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad / truncate every sequence at its end to a fixed length.
max_sequence_length = 50  # Adjust based on your dataset
train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Build the model: Embedding -> bidirectional LSTM -> single sigmoid unit.
# NOTE(review): the embedding table has max_words rows regardless of how many
# distinct words the tokenizer actually saw.
embedding_dim = 16
model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length),
    Bidirectional(LSTM(64)),
    Dense(1, activation='sigmoid')
])

# Compile the model for binary classification.
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model. The held-out test split doubles as the validation set, so
# the reported validation accuracy is measured on the test data.
batch_size = 32  # Adjust based on your resources
epochs = 10  # Adjust based on your requirements
model.fit(train_padded, train_labels, validation_data=(test_padded, test_labels), batch_size=batch_size, epochs=epochs)

In [None]:
# Write whatever `model` currently refers to as an HDF5 (.h5) file on Drive.
# NOTE(review): the file name says CNN, but the training cell directly above
# builds `model` as Embedding -> Bidirectional(LSTM) -> Dense. The name
# `model` is also rebound by other training cells in this notebook, so run
# this immediately after the intended training cell.
model.save('/content/drive/MyDrive/FINAL/pichi/CNN_MODEL_OPTIMIZED.h5')

In [None]:
# FNN

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.optimizers import Adam

# Load the preprocessed dataset from the mounted Google Drive.
file_path = r'/content/drive/MyDrive/FINAL/preprocessed_dataset.csv'
df = pd.read_csv(file_path)

# Coerce every text entry to str before tokenizing. pandas reads empty CSV
# fields back as float NaN, and the Keras Tokenizer expects strings, so a
# single missing value would otherwise abort fit_on_texts. Missing entries
# become the literal token 'nan'.
df['text'] = df['text'].astype(str)

# Encode the target labels as integers (the column is overwritten in place).
# NOTE(review): the sigmoid output + binary_crossentropy below assume exactly
# two distinct target values - confirm against the dataset.
label_encoder = LabelEncoder()
df['target'] = label_encoder.fit_transform(df['target'])

# Split the dataset into training and testing sets (80/20, fixed seed).
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['target'], test_size=0.2, random_state=42
)

# Tokenize the texts. The vocabulary is learned from the training split only;
# words unseen during fitting map to the '<OOV>' token.
max_words = 1000000  # Upper bound on vocabulary size; adjust to the dataset
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)

# Convert texts to sequences of word indices.
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad / truncate every sequence at its end to a fixed length.
max_sequence_length = 50  # Adjust based on your dataset
train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Build the Feedforward Neural Network (FNN) model:
# Embedding -> Flatten -> Dense(64, relu) -> single sigmoid unit.
# NOTE(review): the embedding table has max_words rows regardless of how many
# distinct words the tokenizer actually saw.
embedding_dim = 16
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model for binary classification.
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model. The held-out test split doubles as the validation set, so
# the reported validation accuracy is measured on the test data.
batch_size = 32  # Adjust based on your resources
epochs = 10  # Adjust based on your requirements
model.fit(train_padded, train_labels, validation_data=(test_padded, test_labels), batch_size=batch_size, epochs=epochs)


In [None]:
# Write whatever `model` currently refers to as an HDF5 (.h5) file on Drive.
# NOTE(review): `model` is rebound by several training cells in this notebook
# (FNN and bidirectional-LSTM variants); the saved file only contains the FNN
# if the FNN training cell was the last of them to run.
model.save('/content/drive/MyDrive/FINAL/pichi/FNN_MODEL_OPTIMISED.h5')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.optimizers import Adam

# Load the preprocessed dataset from the mounted Google Drive.
file_path = r'/content/drive/MyDrive/FINAL/preprocessed_dataset.csv'
df = pd.read_csv(file_path)

# Coerce every text entry to str before tokenizing. pandas reads empty CSV
# fields back as float NaN, and the Keras Tokenizer expects strings, so a
# single missing value would otherwise abort fit_on_texts. Missing entries
# become the literal token 'nan'.
df['text'] = df['text'].astype(str)

# Encode the target labels as integers (the column is overwritten in place).
# NOTE(review): the sigmoid output + binary_crossentropy below assume exactly
# two distinct target values - confirm against the dataset.
label_encoder = LabelEncoder()
df['target'] = label_encoder.fit_transform(df['target'])

# Split the dataset into training and testing sets (80/20, fixed seed).
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['target'], test_size=0.2, random_state=42
)

# Tokenize the texts. The vocabulary is learned from the training split only;
# words unseen during fitting map to the '<OOV>' token.
max_words = 1000000  # Upper bound on vocabulary size; adjust to the dataset
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)

# Convert texts to sequences of word indices.
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad / truncate every sequence at its end to a fixed length.
max_sequence_length = 50  # Adjust based on your dataset
train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Build the LSTM model: Embedding -> bidirectional LSTM -> single sigmoid unit.
# NOTE(review): the embedding table has max_words rows regardless of how many
# distinct words the tokenizer actually saw.
embedding_dim = 16
model_lstm = Sequential()
model_lstm.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length))
model_lstm.add(Bidirectional(LSTM(64)))
model_lstm.add(Dense(1, activation='sigmoid'))

# Compile the model for binary classification.
model_lstm.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model. The held-out test split doubles as the validation set, so
# the reported validation accuracy is measured on the test data.
batch_size = 32  # Adjust based on your resources
epochs = 10  # Adjust based on your requirements
model_lstm.fit(train_padded, train_labels, validation_data=(test_padded, test_labels), batch_size=batch_size, epochs=epochs)


In [None]:
# Count the rows whose target equals 1. Uses the `df` left behind by whichever
# training cell ran last, where `target` has been replaced by LabelEncoder
# codes. Vectorized comparison instead of a Python loop over the Series;
# int() keeps the displayed result a plain Python integer.
x = df['target']
z = int((x == 1).sum())

z

In [None]:
# Write the trained `model_lstm` (the bidirectional LSTM built in the LSTM
# training cell) to Drive as an HDF5 (.h5) file.
model_lstm.save('/content/drive/MyDrive/FINAL/pichi/LSTM_MODEL_OPTIMISED.h5')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.optimizers import Adam

# Load the preprocessed dataset from the mounted Google Drive.
file_path = r'/content/drive/MyDrive/FINAL/preprocessed_dataset.csv'
df = pd.read_csv(file_path)

# Coerce every text entry to str before tokenizing. pandas reads empty CSV
# fields back as float NaN, and the Keras Tokenizer expects strings, so a
# single missing value would otherwise abort fit_on_texts. Missing entries
# become the literal token 'nan'.
df['text'] = df['text'].astype(str)

# Encode the target labels as integers (the column is overwritten in place).
# NOTE(review): the sigmoid output + binary_crossentropy below assume exactly
# two distinct target values - confirm against the dataset.
label_encoder = LabelEncoder()
df['target'] = label_encoder.fit_transform(df['target'])

# Split the dataset into training and testing sets (80/20, fixed seed).
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['target'], test_size=0.2, random_state=42
)

# Tokenize the texts. The vocabulary is learned from the training split only;
# words unseen during fitting map to the '<OOV>' token.
max_words = 1000000 # Upper bound on vocabulary size; adjust to the dataset
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)

# Convert texts to sequences of word indices.
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad / truncate every sequence at its end to a fixed length.
max_sequence_length = 50  # Adjust based on your dataset
train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Build the GRU model: Embedding -> GRU(64) -> single sigmoid unit.
# NOTE(review): the embedding table has max_words rows regardless of how many
# distinct words the tokenizer actually saw.
embedding_dim = 16
model_gru = Sequential()
model_gru.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length))
model_gru.add(GRU(64))
model_gru.add(Dense(1, activation='sigmoid'))

# Compile the model for binary classification.
model_gru.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model. The held-out test split doubles as the validation set, so
# the reported validation accuracy is measured on the test data.
batch_size = 32  # Adjust based on your resources
epochs = 10  # Adjust based on your requirements
model_gru.fit(train_padded, train_labels, validation_data=(test_padded, test_labels), batch_size=batch_size, epochs=epochs)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7fbe1815b910>

In [None]:
# Write the trained `model_gru` (built in the GRU training cell) to Drive as
# an HDF5 (.h5) file.
model_gru.save('/content/drive/MyDrive/FINAL/pichi/GRU_MODEL_OPTIMISED.h5')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam

# Load the preprocessed dataset from the mounted Google Drive.
file_path = r'/content/drive/MyDrive/FINAL/preprocessed_dataset.csv'
df = pd.read_csv(file_path)

# Coerce every text entry to str before tokenizing. pandas reads empty CSV
# fields back as float NaN, and the Keras Tokenizer expects strings, so a
# single missing value would otherwise abort fit_on_texts. Missing entries
# become the literal token 'nan'.
df['text'] = df['text'].astype(str)

# Encode the target labels as integers (the column is overwritten in place).
# NOTE(review): the sigmoid output + binary_crossentropy below assume exactly
# two distinct target values - confirm against the dataset.
label_encoder = LabelEncoder()
df['target'] = label_encoder.fit_transform(df['target'])

# Split the dataset into training and testing sets (80/20, fixed seed).
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['target'], test_size=0.2, random_state=42
)

# Tokenize the texts. The vocabulary is learned from the training split only;
# words unseen during fitting map to the '<OOV>' token.
max_words = 1000000  # Upper bound on vocabulary size; adjust to the dataset
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)

# Convert texts to sequences of word indices.
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad / truncate every sequence at its end to a fixed length.
max_sequence_length = 50  # Adjust based on your dataset
train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Build the RNN model: Embedding -> SimpleRNN(64) -> single sigmoid unit.
# NOTE(review): the embedding table has max_words rows regardless of how many
# distinct words the tokenizer actually saw.
embedding_dim = 16
model_rnn = Sequential()
model_rnn.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length))
model_rnn.add(SimpleRNN(64))
model_rnn.add(Dense(1, activation='sigmoid'))

# Compile the model for binary classification.
model_rnn.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model. The held-out test split doubles as the validation set, so
# the reported validation accuracy is measured on the test data.
batch_size = 32  # Adjust based on your resources
epochs = 10  # Adjust based on your requirements
model_rnn.fit(train_padded, train_labels, validation_data=(test_padded, test_labels), batch_size=batch_size, epochs=epochs)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7fbe10186830>

In [None]:
# Write the trained `model_rnn` (built in the SimpleRNN training cell) to
# Drive as an HDF5 (.h5) file.
# NOTE(review): this goes to FINAL/ directly, whereas the FNN/LSTM/GRU saves
# in this notebook target FINAL/pichi/ - confirm the location is intentional.
model_rnn.save('/content/drive/MyDrive/FINAL/RNN_MODEL_OPTIMISED.h5')

  saving_api.save_model(


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam

# Load the preprocessed dataset from the mounted Google Drive. Drive must be
# mounted first (see the drive.mount cell); otherwise read_csv raises
# FileNotFoundError for this path.
file_path = r'/content/drive/MyDrive/FINAL/preprocessed_dataset.csv'
df = pd.read_csv(file_path)

# Coerce every text entry to str before tokenizing. pandas reads empty CSV
# fields back as float NaN, and the Keras Tokenizer expects strings, so a
# single missing value would otherwise abort fit_on_texts. Missing entries
# become the literal token 'nan'.
df['text'] = df['text'].astype(str)

# Encode the target labels as integers (the column is overwritten in place).
# NOTE(review): the sigmoid output + binary_crossentropy below assume exactly
# two distinct target values - confirm against the dataset.
label_encoder = LabelEncoder()
df['target'] = label_encoder.fit_transform(df['target'])

# Split the dataset into training and testing sets (80/20, fixed seed).
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['target'], test_size=0.2, random_state=42
)

# Tokenize the texts. The vocabulary is learned from the training split only;
# words unseen during fitting map to the '<OOV>' token.
max_words = 1000000  # Upper bound on vocabulary size; adjust to the dataset
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)

# Convert texts to sequences of word indices.
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad / truncate every sequence at its end to a fixed length.
max_sequence_length = 50  # Adjust based on your dataset
train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Build the Bidirectional RNN model:
# Embedding -> Bidirectional(SimpleRNN(64)) -> single sigmoid unit.
# NOTE(review): the embedding table has max_words rows regardless of how many
# distinct words the tokenizer actually saw.
embedding_dim = 16
model_bidirectional_rnn = Sequential()
model_bidirectional_rnn.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length))
model_bidirectional_rnn.add(Bidirectional(SimpleRNN(64)))
model_bidirectional_rnn.add(Dense(1, activation='sigmoid'))

# Compile the model for binary classification.
model_bidirectional_rnn.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model. The held-out test split doubles as the validation set, so
# the reported validation accuracy is measured on the test data.
batch_size = 32  # Adjust based on your resources
epochs = 10  # Adjust based on your requirements
model_bidirectional_rnn.fit(train_padded, train_labels, validation_data=(test_padded, test_labels), batch_size=batch_size, epochs=epochs)


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/FINAL/preprocessed_dataset.csv'

In [None]:
# Write the trained `model_bidirectional_rnn` (built in the bidirectional
# SimpleRNN training cell) to Drive as an HDF5 (.h5) file.
model_bidirectional_rnn.save('/content/drive/MyDrive/FINAL/BI_RNN_MODEL_OPTIMISED.h5')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.optimizers import Adam

# Load the preprocessed dataset from the mounted Google Drive.
file_path = r'/content/drive/MyDrive/FINAL/preprocessed_dataset.csv'
df = pd.read_csv(file_path)

# Convert the 'text' column to strings so the tokenizer never sees a float
# NaN (pandas reads empty CSV fields back as NaN). Missing entries become the
# literal token 'nan'.
df['text'] = df['text'].astype(str)

# Shuffle the rows. The full dataset is kept - nothing is subsampled here.
# The shuffle is seeded: an unseeded shuffle would hand train_test_split a
# different row order on every run and so undo the reproducibility that its
# random_state=42 is meant to provide.
df = shuffle(df, random_state=42)

# Encode the target labels as integers (the column is overwritten in place).
# NOTE(review): the sigmoid output + binary_crossentropy below assume exactly
# two distinct target values - confirm against the dataset.
label_encoder = LabelEncoder()
df['target'] = label_encoder.fit_transform(df['target'])

# Split the dataset into training and testing sets (80/20, fixed seed).
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['target'], test_size=0.2, random_state=42
)

# Tokenize the texts. The vocabulary is learned from the training split only;
# words unseen during fitting map to the '<OOV>' token.
max_words = 1000000  # Upper bound on vocabulary size; adjust to the dataset
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)

# Convert texts to sequences of word indices.
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad / truncate every sequence at its end to a fixed length.
max_sequence_length = 50  # Adjust based on your dataset
train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Build the model: Embedding -> bidirectional LSTM -> single sigmoid unit.
# NOTE(review): the embedding table has max_words rows regardless of how many
# distinct words the tokenizer actually saw.
embedding_dim = 16
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(1, activation='sigmoid'))

# Compile the model for binary classification.
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model. The held-out test split doubles as the validation set, so
# the reported validation accuracy is measured on the test data.
batch_size = 32  # Adjust based on your resources
epochs = 10  # Adjust based on your requirements
model.fit(train_padded, train_labels, validation_data=(test_padded, test_labels), batch_size=batch_size, epochs=epochs)

# Write the trained model to Drive as an HDF5 (.h5) file.
# NOTE(review): the file name says CNN, but the network built above is a
# bidirectional LSTM - confirm the intended name.
model.save('/content/drive/MyDrive/FINAL/CNN_POWERFUL.h5')
