<a href="https://colab.research.google.com/github/Amadi-99/CNN_SMS_Classification/blob/main/CNN_SMS_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SMS Message Classification Using CNN
Convolutional Neural Network(CNN) is used for developed sms message classification model

---



In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, jaccard_score

# Step 1: Load the dataset
df = pd.read_csv('https://raw.githubusercontent.com/Amadi-99/smsDataSet/main/DataSet.csv', index_col=0, encoding='latin-1')

# Step 2: Preprocess the text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', 'number', text)  # Replace digits with 'number'
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    return text

df['Text'] = df['Text'].apply(preprocess_text)

# Step 3: Label Encoding
label_encoder = LabelEncoder()
df['Tags_encoded'] = label_encoder.fit_transform(df['Tags'])
num_classes = len(label_encoder.classes_)

# Step 4: Train-Validation-Test Split
X_train, X_val_test, y_train, y_val_test = train_test_split(df['Text'], df['Tags_encoded'], test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Step 5: Tokenization
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Step 6: Sequence Padding
max_sequence_length = 100
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(X_test)

X_train = pad_sequences(X_train, maxlen=max_sequence_length)
X_val = pad_sequences(X_val, maxlen=max_sequence_length)
X_test = pad_sequences(X_test, maxlen=max_sequence_length)

# Step 7: Handling Class Imbalance
ros = RandomOverSampler(random_state=42)
X_train, y_train = ros.fit_resample(X_train, y_train)

# Step 8: Model Creation and Training
def create_model():
    embedding_dim = 100
    filters = 128
    kernel_size = 3
    hidden_dims = 64
    dropout_rate = 0.5

    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=max_sequence_length))
    model.add(Conv1D(filters, kernel_size, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(dropout_rate))
    model.add(Dense(hidden_dims, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Step 9: Model Training
batch_size = 64
epochs = 10

# Create a wrapper for the Keras model
estimator = KerasClassifier(build_fn=create_model, epochs=epochs, batch_size=batch_size, verbose=0)


# Perform cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = cross_val_score(estimator, X_train, y_train, cv=kfold)


# Print the cross-validation results
print("Cross-Validation Accuracy: %.2f%% (+/- %.2f%%)" % (results.mean() * 100, results.std() * 100))

# Fit the model
model = create_model()
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val))

# Evaluate the model on the validation data
_, accuracy = model.evaluate(X_val, y_val)

# New text for prediction
new_text = "<#> Shadowfax Id is 152870765 6gW4yAjEoWG"

# Preprocess the new text
preprocessed_text = preprocess_text(new_text)
encoded_text = tokenizer.texts_to_sequences([preprocessed_text])
padded_text = pad_sequences(encoded_text, maxlen=max_sequence_length)

# Get the predicted category
predicted_category = model.predict(padded_text)
predicted_category = label_encoder.inverse_transform(np.argmax(predicted_category, axis=-1))

# Print the predicted category
print("Predicted Category:", predicted_category)

# Obtain the predicted labels for the test set
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)

# Obtain the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
accuracy_percentage = accuracy * 100
print("Accuracy: {:.2f}%".format(accuracy_percentage))

# Calculate precision
precision = precision_score(y_test, y_pred, average='weighted')
precision_percentage = precision * 100
print("Precision: {:.2f}%".format(precision_percentage))

# Calculate recall
recall = recall_score(y_test, y_pred, average='weighted')
recall_percentage = recall * 100
print("Recall: {:.2f}%".format(recall_percentage))

# Calculate F1 score
f1 = f1_score(y_test, y_pred, average='weighted')
f1_percentage = f1 * 100
print("F1 Score: {:.2f}%".format(f1_percentage))

# Calculate Jaccard score
jaccard = jaccard_score(y_test, y_pred, average='weighted')
jaccard_percentage = jaccard * 100
print("Jaccard Score: {:.2f}%".format(jaccard_percentage))



  estimator = KerasClassifier(build_fn=create_model, epochs=epochs, batch_size=batch_size, verbose=0)


Cross-Validation Accuracy: 96.67% (+/- 0.22%)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Predicted Category: ["['authentication','transactional']"]
Confusion Matrix:
[[ 2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  2  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  6  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  1  0  0  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  1  0  4  0  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  1  1  0  0  0  0 11  1  0  0  0  1  0  5]
 [ 0  0  0  0  0  0  0  0  2 12  0  0  2  0  0  1]
 [ 0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 88  0  2  0  0]
 [ 0  0  0  0  0  0  0  0  3  0  0  0  0  2  0  0]
 [ 0  0  0  0  0  0  0  0  1  1  0  0  4 20  1  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  4  0  0 12  0]
 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
