In [1]:
pip install fasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 KB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2
  Using cached pybind11-2.10.3-py3-none-any.whl (222 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp38-cp38-linux_x86_64.whl size=4397678 sha256=22a10eb8d11254bee7c28572865c7257082b5e86f59b91a8cfee3e9b5bfdcd35
  Stored in directory: /root/.cache/pip/wheels/93/61/2a/c54711a91c418ba06ba195b1d78ff24fcaad8592f2a694ac94
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.10.3


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.model_selection import train_test_split
import fasttext

<b>Dataset

In [None]:
# Read the Dataset
dataset = pd.read_csv('spamMassages.csv')

# Display the Data
dataset.head(2)

In [None]:
# Display the Shape of the Dataset
print('The Shape of the Dataset is : {}'.format(dataset.shape))

In [None]:
# Find out the Missing Values in the Dataset
nan_values = dataset.isnull().sum()
print(nan_values)

In [None]:
# Find out the Duplicated Data from Dataset
dupData = dataset.duplicated().sum()
print('Total Duplicated Row in the Dataset is : {}'.format(dupData))

In [None]:
# Drop the Duplicated Data from Dataset
dataset.drop_duplicates(inplace = True)

# Display the shape After drop the Duplicated Data
print('Shape of the Dataset is : {}'.format(dataset.shape))

In [None]:
# Count the each class label
classLabel = dataset['Category'].value_counts()
print(classLabel)

<b>Text Preprocessing

In [9]:
# Load the Spacy Pre-Trained Model
nlp = spacy.load('en_core_web_sm')

In [10]:
def textPreprocessing(text):
    doc = nlp(text)
    filterTokens = []
    
    for token in doc:
        if token.like_num or token.is_stop or token.is_punct:
            continue
        filterTokens.append(token.lemma_.lower())
    return (' '.join(filterTokens))

In [11]:
# Now map the textPreprocessing funtion on Message Column
dataset['cleanMessage'] = dataset['Message'].map(textPreprocessing)

In [12]:
def labelPreprocessing(text):
    return ('__label__' + str(text))

In [13]:
# Now map the labelPreprocessing funtion on Category Column
dataset['labelCategory'] = dataset['Category'].map(labelPreprocessing)

In [14]:
# Now Join the labelCategory and cleanMessage
dataset['cleanData'] = dataset['labelCategory'] + ' ' + dataset['cleanMessage']

<b>Split the Data into Training & Testing

In [15]:
# Split the Data into Training & Testing
train , test = train_test_split(dataset , test_size = 0.25)

# Display the Shape of the Train & Test Data
print('The Shape of the Train is : {}'.format(train.shape))
print('The Shape of the Test  is : {}'.format(test.shape))

The Shape of the Train is : (3867, 5)
The Shape of the Test  is : (1290, 5)


<b>Implementation FastText Model

In [16]:
train.to_csv('trainMessage.txt' ,  columns = ['cleanData'] , header = None , index = False)
test.to_csv('testMessage.txt'   ,  columns = ['cleanData'] , header = None , index = False)

In [17]:
# Now Train the FastText Model
model = fasttext.train_supervised('trainMessage.txt')
model.test('testMessage.txt')

(1281, 0.9804839968774395, 0.9804839968774395)

<b>Let's find out some prediction Results

In [18]:
model.predict('sure night menu know noon menu')

(('__label__ham',), array([0.99678051]))

In [19]:
model.predict('look building coat want sick hurry home wear coat gym')

(('__label__ham',), array([0.99227649]))

In [20]:
model.predict('lose £ help')

(('__label__spam',), array([0.99570006]))

In [21]:
model.predict('today voda number end select receive £ reward match quote claim code standard rate apply')

(('__label__spam',), array([0.9876157]))

<b>Fasttext Important Function

In [None]:
# find out Similar Words
model.get_nearest_neighbors('receive')

In [None]:
# Get the Vector of Spacific word
receiveVec = model.get_word_vector('receive')
print(receiveVec)

In [24]:
# Display the Shape of the Vector
print('The shape of the Word Vector is : {}'.format(receiveVec.shape))

The shape of the Word Vector is : (100,)


In [None]:
# Find the Word base on 2 Word Relation
model.get_analogies('massage' , 'human' , 'call')