<a href="https://colab.research.google.com/github/alihussainia/AI-Makerspace/blob/test/pycaret-nlp/email_spam_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# installing packages

In [None]:
!pip install --upgrade pycaret

In [None]:
# Download the spaCy small English model (en_core_web_sm).
!python -m spacy download en_core_web_sm
# Download the corpora used by TextBlob.
!python -m textblob.download_corpora

# importing packages

In [None]:
import pandas as pd
# Wildcard import: the helpers called later in this notebook (setup,
# create_model, plot_model, tune_model, save_model) are not imported anywhere
# else, so they are presumably provided by this line.
from pycaret.nlp import *
from pycaret.utils import enable_colab
# Turn on PyCaret's Colab mode; the cell output confirms "Colab mode enabled."
enable_colab()

Colab mode enabled.


# preparing dataset

In [None]:
# Read the e-mail messages from disk, then renumber the rows from zero.
dataset = pd.read_csv('messages.csv')
dataset = dataset.reset_index(drop=True)

In [None]:
# Missing values per column (isnull is pandas' alias of isna).
dataset.isnull().sum()

subject    62
message     0
label       0
dtype: int64

In [None]:
# Impute missing subjects with the most frequent subject line.
# Only the 'subject' column is filled: a frame-wide fillna would also write
# subject text into any other column that happened to contain NaN (e.g. label).
most_common_subject = dataset['subject'].mode().iloc[0]
dataset['subject'] = dataset['subject'].fillna(most_common_subject)

In [None]:
# Lower-case both text columns (message first, then subject).
for text_col in ('message', 'subject'):
    dataset[text_col] = dataset[text_col].str.lower()

In [None]:
# Work on an independent (deep) copy so `dataset` itself stays untouched.
df = dataset.copy(deep=True)

In [None]:
# Combine subject and message into a single text column.
# The space separator matters: plain concatenation fused the last word of the
# subject with the first word of the message into one bogus token.
df['sub_mssg'] = df['subject'] + ' ' + df['message']

In [None]:
# 'sub_mssg' now carries both texts, so remove the two source columns in one call.
df.drop(columns=['subject', 'message'], inplace=True)

# setting up pycaret

In [None]:
# Initialise the first PyCaret NLP experiment on the combined text column,
# with experiment logging turned on under the name 'email_1'.
exp_1 = setup(
    data=df,
    target='sub_mssg',
    session_id=123,
    log_experiment=True,
    experiment_name='email_1',
)

Description,Value
session_id,123
Documents,2893
Vocab Size,24656
Custom Stopwords,False


# creating models

- List of available models:
  * 'lda' - Latent Dirichlet Allocation         
  * 'lsi' - Latent Semantic Indexing           
  * 'hdp' - Hierarchical Dirichlet Process
  * 'rp' - Random Projections
  * 'nmf' - Non-Negative Matrix Factorization

In [None]:
# Build a 6-topic 'lda' (Latent Dirichlet Allocation) model with multi_core enabled.
lda = create_model(
    'lda',
    num_topics=6,
    multi_core=True,
)

In [None]:
# Show the 'topic_distribution' plot for the first LDA model.
plot_model(lda, plot='topic_distribution')

In [None]:
# custom function for words shared by several topics
def topics(model=None):
  """Return the words listed among the top terms of more than one topic.

  The notebook feeds the result back into ``setup`` as custom stopwords.

  Args:
    model: Any object whose ``show_topics()`` returns ``(topic_id, terms)``
      pairs, where ``terms`` is formatted like
      ``'0.012*"free" + 0.010*"money"'``. Defaults to the notebook-level
      ``lda`` model, so a bare ``topics()`` call keeps working as before.

  Returns:
    Alphabetically sorted list of the repeated words, without duplicates.
  """
  # Local import keeps this cell self-contained and unaffected by the
  # wildcard import at the top of the notebook.
  from collections import Counter

  if model is None:
    model = lda  # notebook global created by create_model('lda', ...)

  words = []
  for topic in model.show_topics():
    # topic[1] is the formatted term string; break it into weight*"word" terms.
    for term in topic[1].split(' + '):
      # Keep what follows '*' and strip the quotes on both sides. The earlier
      # split on '" + ' plus a fixed 7-character slice left a trailing '"' on
      # the last word of every topic, so that word never matched its other
      # occurrences and leaked into the result with the quote attached.
      word = term.partition('*')[2].strip().strip('"')
      if word:
        words.append(word)

  # One counting pass instead of calling list.count() for every element.
  counts = Counter(words)
  return sorted(word for word, seen in counts.items() if seen > 1)

In [None]:
# Use the words returned by topics() as the custom stopword list.
stop = topics()

In [None]:
# Second experiment: same data and session_id as before, but with the
# repeated topic words passed in as custom stopwords.
exp_2 = setup(
    data=df,
    target='sub_mssg',
    session_id=123,
    log_experiment=True,
    experiment_name='email_2_cs',
    custom_stopwords=stop,
)

Description,Value
session_id,123
Documents,2893
Vocab Size,24539
Custom Stopwords,True


In [None]:
# Rebuild the 6-topic LDA model on the experiment that uses custom stopwords.
lda2 = create_model(
    'lda',
    num_topics=6,
    multi_core=True,
)

In [None]:
# Show the 'topic_distribution' plot for the second LDA model.
plot_model(lda2, plot='topic_distribution')

In [None]:
# Show the 'frequency' plot of the second model for 'Topic 1'.
plot_model(
    lda2,
    plot='frequency',
    topic_num='Topic 1',
)

In [None]:
# Tune the 'lda' topic model against the supervised 'label' column.
tuned_classification = tune_model(
    model='lda',
    multi_core=True,
    supervised_target='label',
)

IntProgress(value=0, description='Processing: ', max=25)

Output()

Best Model: Latent Dirichlet Allocation | # Topics: 4 | Accuracy : 0.9761


In [None]:
# Save the tuned model under the name 'tuned_classification_Model_V1'.
save_model(
    tuned_classification,
    'tuned_classification_Model_V1',
)