<a href="https://colab.research.google.com/github/ColleyMo/AI-projects/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.utils import set_random_seed

# Seed the Python, NumPy and TensorFlow RNGs once, up front, so that weight
# initialisation and batch shuffling (and therefore the reported accuracy) are
# repeatable on Restart & Run All, as far as the backend's ops are deterministic.
SEED = 42
set_random_seed(SEED)

# Load the dataset
data = pd.read_csv('dataset.csv')

# Separate features (text messages) and labels (spam/ham)
X = data['text']
y = data['text_type']

# Encode the string labels as integers for the single-unit sigmoid output
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for evaluation; the split uses the same seed as the model
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=SEED)

# Build the vocabulary from the training split only, so the held-out text
# does not influence which words receive an index
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Convert text data to sequences of word indices
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad every sequence (at the end) to the longest training sequence so the
# model receives fixed-width input; classify_text pads new text the same way
max_length = max(len(seq) for seq in X_train_seq)
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Build CNN model
# +1 because word indices start at 1; index 0 is what pad_sequences fills with
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
# One sigmoid unit: the output is the probability of the class encoded as 1
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE: validation_data is the same hold-out that is passed to evaluate() below,
# so the per-epoch validation metrics and the final test accuracy come from the
# same rows; there is no separate validation split.
model.fit(X_train_padded, y_train, epochs=5, batch_size=32, validation_data=(X_test_padded, y_test))

# Evaluate the model on the held-out split
loss, accuracy = model.evaluate(X_test_padded, y_test)
print("Test Accuracy:", accuracy)

# Save the trained model and tokenizer; classify_text reloads both files
model.save('cnn_spam_classifier.h5')
joblib.dump(tokenizer, 'tokenizer.pkl')

# Function to classify new text
def classify_text(input_text):
    """Classify one text message as 'spam' or 'ham' with the saved CNN.

    The tokenizer ('tokenizer.pkl') and the model ('cnn_spam_classifier.h5')
    are reloaded from the working directory on every call.

    Args:
        input_text: The message to classify.

    Returns:
        'spam' if the model's sigmoid output is above 0.5, otherwise 'ham'.
        This assumes the label encoded as 1 during training was 'spam'
        (NOTE(review): holds when the labels are exactly 'ham'/'spam' --
        confirm against the dataset).
    """
    # NOTE: joblib.load unpickles the file, which can run arbitrary code.
    # Only load a tokenizer.pkl that was written by this notebook.
    tokenizer = joblib.load('tokenizer.pkl')

    # Load the trained model
    model = load_model('cnn_spam_classifier.h5')

    # Pad to the sequence length the saved model was built with rather than
    # trusting the notebook-global `max_length`, which is undefined in a fresh
    # kernel and may no longer match the model on disk. The global is used only
    # as a fallback when the model does not expose a fixed length.
    saved_shape = getattr(model, 'input_shape', None)
    fixed_length = saved_shape[1] if saved_shape is not None and len(saved_shape) > 1 else None
    sequence_length = fixed_length if isinstance(fixed_length, int) else max_length

    # Tokenize and pad the input text exactly as the training data was prepared
    input_seq = tokenizer.texts_to_sequences([input_text])
    input_padded = pad_sequences(input_seq, maxlen=sequence_length, padding='post')

    # Predict label for the input text
    prediction = model.predict(input_padded)

    # One input row and one output unit: take the single scalar explicitly
    # instead of relying on the truth value of a 1-element array.
    spam_probability = float(prediction[0][0])
    return 'spam' if spam_probability > 0.5 else 'ham'


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.9570024609565735


  saving_api.save_model(


In [21]:
# Example usage: prompt for one message on stdin, classify it with
# classify_text, and print the predicted label. This cell blocks until the
# user types a message, so it needs an interactive session.
input_text = input("Enter a text message: ")
result = classify_text(input_text)
print("Predicted label:", result)

Enter a text message: Your flight booking from New York to Los Angeles has been confirmed. Check your email for the e-ticket details
Predicted label: spam
