<a href="https://colab.research.google.com/github/Amadi-99/SMS_Classification_ML_Models/blob/main/SMS_Classification_ML_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Machine Learning Classifiers**

**Classifiers for multilable classification**

*  LinearSVC (Support Vector Classifier)

*  SGDClassifier (Stochastic Gradient Descent Classifier)

*  LogisticRegression

*  MultinomialNB (Multinomial Naive Bayes)


In [3]:
import pandas as pd
import numpy as np
import ast
import warnings

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

warnings.filterwarnings("ignore")

# Read the dataset from the provided URL
df = pd.read_csv('https://raw.githubusercontent.com/Amadi-99/smsDataSet/main/smsData.csv',sep=';', index_col=0, encoding='latin-1')

# Drop rows with missing values
df.dropna(inplace=True)

# Preprocess the 'Text' column
df['Text'] = df['Text'].str.strip()  # Remove leading and trailing whitespaces
df['Text'] = df['Text'].str.replace('[^\w\s]', '', regex=True).str.lower()  # Remove punctuation and convert to lowercase
df['Text'] = df['Text'].str.split()  # Tokenize text into individual words

# Define stopwords to be removed
stopwords = set(['a', 'an', 'the', 'and', 'or', 'if', 'of', 'to', 'in', 'is', 'you', 'that', 'it', 'for', 'with', 'on', 'was', 'as', 'at', 'this', 'my', 'be', 'by', 'not', 'from', 'are', 'have', 'your', 'they', 'which', 'we', 'but', 'their', 'can', 'all', 'he', 'she', 'there', 'been', 'what', 'do', 'so', 'out', 'up', 'just', 'about', 'me', 'him', 'her', 'his', 'hers', 'something', 'more', 'some', 'how', 'has', 'would', 'could', 'should', 'did', 'were', 'its', 'than', 'been'])
df['Text'] = df['Text'].apply(lambda words: [word for word in words if word not in stopwords])  # Remove stopwords
df['Text'] = df['Text'].apply(lambda words: ' '.join(words))  # Join the processed words back into sentences

# Convert the 'Tags' column from string representation to list
df['Tags'] = df['Tags'].apply(lambda x: ast.literal_eval(x))

# Transform the multi-label tags into binary labels
y = df['Tags']
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(y)

# Apply TF-IDF vectorization on the 'Text' column
tfidf = TfidfVectorizer(analyzer='word', max_features=10000, ngram_range=(1,3), stop_words='english')
X = tfidf.fit_transform(df['Text'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Define the classifiers to be used
classifiers = [
    SGDClassifier(),
    LogisticRegression(solver='lbfgs'),
    LinearSVC(),
    MultinomialNB()
]

# Define a Jaccard score function to evaluate the performance
def j_score(y_true, y_pred):
    jaccard = np.minimum(y_true, y_pred).sum(axis=1) / np.maximum(y_true, y_pred).sum(axis=1)
    return jaccard.mean() * 100

# Define a function to print the scores and confusion matrix
def print_score(y_pred, clf):
    print("Clf: ", clf.__class__.__name__)
    jaccard_score = j_score(y_test, y_pred)

    # Print the Jaccard score as a percentage
    print('Jaccard score/Index: {:.2f}%'.format(jaccard_score))


    # Compute the confusion matrix
    conf_matrix = confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))

    # Print the confusion matrix
    print("Confusion Matrix:")
    print(conf_matrix)

    # Calculate accuracy, precision, recall, and F1 score
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')

    # Print the performance metrics
    print("Accuracy: {:.2f}%".format(accuracy * 100))
    print("Precision: {:.2f}%".format(precision * 100))
    print("Recall: {:.2f}%".format(recall * 100))
    print("F1 Score: {:.2f}%".format(f1 * 100))
    print('----')

# Perform cross-validation and evaluate each model
for classifier in classifiers:
    clf = OneVsRestClassifier(classifier)
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("Classifier: ", clf.__class__.__name__)
    print("Cross-Validation Accuracy:  %.2f%% (+/- %.2f%%)" % (scores.mean() * 100, scores.std() * 100))
    clf.fit(X_train, y_train)

    # Evaluate the model on the test set
    y_pred = clf.predict(X_test)
    print_score(y_pred, classifier)

    # Example input for prediction
    x = ['<#> Shadowfax Id is 152870765 6gW4yAjEoWG']

    # Transform the input using the same TF-IDF vectorizer
    xt = tfidf.transform(x)

    # Make predictions on the input text
    predicted_labels = multilabel.inverse_transform(clf.predict(xt))

    # Print the predicted labels
    print(predicted_labels)
    print('----')

Classifier:  OneVsRestClassifier
Cross-Validation Accuracy:  41.00% (+/- 17.37%)
Clf:  SGDClassifier
Jaccard score/Index: 80.00%
Confusion Matrix:
[[ 4  0  0  0]
 [ 0 10  0  0]
 [ 1  0  0  0]
 [ 2  0  0  3]]
Accuracy: 80.00%
Precision: 100.00%
Recall: 76.19%
F1 Score: 86.49%
----
[('otp',)]
----
Classifier:  OneVsRestClassifier
Cross-Validation Accuracy:  1.25% (+/- 2.50%)
Clf:  LogisticRegression
Jaccard score/Index: 15.00%
Confusion Matrix:
[[4 0 0 0]
 [7 3 0 0]
 [1 0 0 0]
 [5 0 0 0]]
Accuracy: 15.00%
Precision: 100.00%
Recall: 14.29%
F1 Score: 25.00%
----
[()]
----
Classifier:  OneVsRestClassifier
Cross-Validation Accuracy:  15.75% (+/- 7.89%)
Clf:  LinearSVC
Jaccard score/Index: 70.00%
Confusion Matrix:
[[ 4  0  0  0]
 [ 0 10  0  0]
 [ 1  0  0  0]
 [ 3  0  0  2]]
Accuracy: 70.00%
Precision: 100.00%
Recall: 66.67%
F1 Score: 80.00%
----
[('otp',)]
----
Classifier:  OneVsRestClassifier
Cross-Validation Accuracy:  11.83% (+/- 4.96%)
Clf:  MultinomialNB
Jaccard score/Index: 50.00%
Confu