In [1]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


In [2]:
def read_file(file_name):
    """Load a tab-separated text file into a two-column DataFrame.

    Column names are supplied explicitly, so the file is read without a
    header row: the first column becomes ``label`` and the second
    ``message``.

    Parameters
    ----------
    file_name : str or path-like
        Location of the file, forwarded to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``label`` and ``message``.
    """
    column_names = ['label', 'message']
    return pd.read_csv(file_name, sep='\t', names=column_names)


In [3]:
def pre_process(messages):
    """Normalise every message into a space-separated string of word stems.

    Each message is lower-cased, every character that is not an ASCII
    letter is replaced by a space, English stop words are dropped and the
    remaining tokens are reduced to their Porter stem.

    Parameters
    ----------
    messages : pandas.DataFrame
        Frame with a ``message`` column holding the raw text.

    Returns
    -------
    list of str
        One cleaned string per row, in row order. Tokens are separated by
        exactly one space, with no leading or trailing whitespace.
    """
    stemmer = PorterStemmer()
    # A set makes the per-token stop-word lookup constant time instead of
    # scanning the whole stop-word list for every token.
    stop_words = set(stopwords.words('english'))

    # list of strings (cleaned messages)
    corpus = []

    # Iterate over the values rather than indexing by position, so the
    # function also works when the frame's index is not 0..n-1 (for example
    # after filtering or shuffling rows).
    for raw_message in messages['message']:
        # cleaning the message
        letters_only = re.sub('[^a-zA-Z]', ' ', raw_message.lower())

        # split() without arguments collapses runs of whitespace, so no
        # empty tokens reach the stemmer or the joined output.
        tokens = letters_only.split()

        # removing stop words, then stemming
        stems = [stemmer.stem(token) for token in tokens if token not in stop_words]

        corpus.append(' '.join(stems))

    return corpus

In [4]:
def create_bag_of_words(corpus, messages, max_features=5000):
    """Build the bag-of-words feature matrix and the binary target vector.

    Parameters
    ----------
    corpus : list of str
        Cleaned messages, one string per row of ``messages``.
    messages : pandas.DataFrame
        Frame with a ``label`` column.
    max_features : int, optional
        Vocabulary size cap forwarded to ``CountVectorizer`` (default 5000).

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is the dense term-count array and ``y`` is a
        0/1 array taken from the second one-hot column of ``label``
        (for labels such as ``ham``/``spam`` that is the ``spam`` indicator).

    Raises
    ------
    IndexError
        If ``label`` yields fewer than two indicator columns.
    """
    vectorizer = CountVectorizer(max_features=max_features)
    x = vectorizer.fit_transform(corpus).toarray()

    # Pin the dtype: pandas >= 2.0 returns boolean indicator columns by
    # default, which would turn the target (and the printed report) into
    # True/False instead of 0/1.
    label_indicators = pd.get_dummies(messages['label'], dtype='uint8')
    y = label_indicators.iloc[:, 1].values

    return (x, y)

In [5]:
def train_model(x_train, x_test, y_train, y_test):
    """Fit a multinomial naive Bayes classifier on the training split.

    Parameters
    ----------
    x_train, y_train
        Training features and targets passed to ``MultinomialNB.fit``.
    x_test, y_test
        Accepted as part of the signature but not used here.

    Returns
    -------
    The value returned by ``MultinomialNB().fit(x_train, y_train)``.
    """
    return MultinomialNB().fit(x_train, y_train)

In [6]:
def main(file_name='data', test_size=0.20, random_state=0):
    """Run the whole pipeline and print the evaluation on the held-out split.

    Steps: read the file, clean the messages, build the bag-of-words
    features, split into train/test, fit the classifier, then print the
    confusion matrix and the classification report for the test split.

    Parameters
    ----------
    file_name : str, optional
        Path of the tab-separated input file (default ``'data'``).
    test_size : float, optional
        Value forwarded to ``train_test_split`` (default 0.20).
    random_state : int, optional
        Seed forwarded to ``train_test_split`` so the split is repeatable
        (default 0).

    Returns
    -------
    None
        Results are printed, not returned.
    """
    messages = read_file(file_name)
    corpus = pre_process(messages)
    x, y = create_bag_of_words(corpus, messages)

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    final_model = train_model(x_train, x_test, y_train, y_test)
    pred = final_model.predict(x_test)

    print(confusion_matrix(y_test, pred))
    print(classification_report(y_test, pred))

In [7]:
# Run the pipeline; the confusion matrix and classification report are printed below.
main()

[[946   9]
 [  8 152]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       955
           1       0.94      0.95      0.95       160

    accuracy                           0.98      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.98      0.98      0.98      1115

