In [1]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report,ConfusionMatrixDisplay,precision_score, recall_score, f1_score, roc_auc_score,roc_curve

In [2]:
messages=pd.read_csv("SMSSpamCollection.txt",sep='\t',names=['labels','message'])

In [3]:
y=pd.get_dummies(messages['labels'],dtype=int)
y=y.iloc[:,1].values
y

array([0, 0, 1, ..., 0, 0, 0])

In [4]:
corpus=[]
wnl=WordNetLemmatizer()
for i in range(len(messages)):
    review=re.sub('a-zA-Z',' ',messages['message'][i])
    review=review.lower()
    review=review.split()
    review=[wnl.lemmatize(word,pos='v') for word in review if word not in set(stopwords.words('english'))]
    review=' '.join(review)
    corpus.append(review)

In [5]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(corpus,y,random_state=42,test_size=0.3)

In [6]:
tfidf=TfidfVectorizer(max_features=2500,ngram_range=(1,2))

x_train=tfidf.fit_transform(x_train).toarray()
x_test=tfidf.transform(x_test).toarray()

In [7]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((3900, 2500), (1672, 2500), (3900,), (1672,))

In [8]:
models={
    "Logisitic Regression":LogisticRegression(),
    "Random forest classifier":RandomForestClassifier(),
    "Decision tree":DecisionTreeClassifier(),
    "gradient boost":GradientBoostingClassifier(),
    "Support vector machine":SVC(),
    "knearest neighbours":KNeighborsClassifier(),
#    "Naive bias":naive_bayes()
}   
def report(models):

    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(x_train, y_train) # Train model

        # Make predictions
        y_train_pred = model.predict(x_train)
        y_test_pred = model.predict(x_test)

        # Training set performance
        model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
        model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
        model_train_precision = precision_score(y_train, y_train_pred) # Calculate Precision
        model_train_recall = recall_score(y_train, y_train_pred) # Calculate Recall
#        model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)
        model_train_confusion_matrix=confusion_matrix(y_train, y_train_pred)


        # Test set performance
        model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
        model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
        model_test_precision = precision_score(y_test, y_test_pred) # Calculate Precision
        model_test_recall = recall_score(y_test, y_test_pred) # Calculate Recall
#        model_test_rocauc_score = roc_auc_score(y_test, y_test_pred) #Calculate Roc
        model_test_confusion_matrix=confusion_matrix(y_test, y_test_pred)


        print(list(models.keys())[i])
        
        print('Model performance for Training set')
        print("- Accuracy: {:.4f}".format(model_train_accuracy))
        print('- F1 score: {:.4f}'.format(model_train_f1))
        
        print('- Precision: {:.4f}'.format(model_train_precision))
        print('- Recall: {:.4f}'.format(model_train_recall))
#        print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))
        print('- confusion matrix:\n{}'.format(model_train_confusion_matrix))

        
        
        print('----------------------------------')
        
        print('Model performance for Test set')
        print('- Accuracy: {:.4f}'.format(model_test_accuracy))
        print('- F1 score: {:.4f}'.format(model_test_f1))
        print('- Precision: {:.4f}'.format(model_test_precision))
        print('- Recall: {:.4f}'.format(model_test_recall))
#        print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))
        print('- confusion matrix:\n{}'.format(model_test_confusion_matrix))
        
        print('='*35)
        print('\n')


In [9]:
report(models)

Logisitic Regression
Model performance for Training set
- Accuracy: 0.9767
- F1 score: 0.9758
- Precision: 0.9954
- Recall: 0.8298
- confusion matrix:
[[3375    2]
 [  89  434]]
----------------------------------
Model performance for Test set
- Accuracy: 0.9737
- F1 score: 0.9725
- Precision: 0.9945
- Recall: 0.8080
- confusion matrix:
[[1447    1]
 [  43  181]]


Random forest classifier
Model performance for Training set
- Accuracy: 0.9992
- F1 score: 0.9992
- Precision: 1.0000
- Recall: 0.9943
- confusion matrix:
[[3377    0]
 [   3  520]]
----------------------------------
Model performance for Test set
- Accuracy: 0.9815
- F1 score: 0.9809
- Precision: 1.0000
- Recall: 0.8616
- confusion matrix:
[[1448    0]
 [  31  193]]


Decision tree
Model performance for Training set
- Accuracy: 0.9992
- F1 score: 0.9992
- Precision: 1.0000
- Recall: 0.9943
- confusion matrix:
[[3377    0]
 [   3  520]]
----------------------------------
Model performance for Test set
- Accuracy: 0.9653
- F1