Importing Libaries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Importing Dataset

In [3]:
sms_data = pd.read_csv('/content/sms_spam.csv')

In [4]:
print(sms_data)

      type                                               text
0      ham  Hope you are having a good week. Just checking in
1      ham                            K..give back my thanks.
2      ham        Am also doing in cbe only. But have to pay.
3     spam  complimentary 4 STAR Ibiza Holiday or £10,000 ...
4     spam  okmail: Dear Dave this is your final notice to...
...    ...                                                ...
5554   ham  You are a great role model. You are giving so ...
5555   ham  Awesome, I remember the last time we got someb...
5556  spam  If you don't, your prize will go to another cu...
5557  spam  SMS. ac JSco: Energy is high, but u may not kn...
5558   ham                    Shall call now dear having food

[5559 rows x 2 columns]


In [5]:
sms_data.head()

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or £10,000 ..."
4,spam,okmail: Dear Dave this is your final notice to...


In [6]:
sms_data.isnull().sum()

type    0
text    0
dtype: int64

In [9]:
sms_data.head()

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or £10,000 ..."
4,spam,okmail: Dear Dave this is your final notice to...


In [10]:
sms_data.isna().sum()

type    0
text    0
dtype: int64

Renaming Columns

In [11]:
sms_data.columns = ['Category', 'Message']

In [12]:
sms_data.head()

Unnamed: 0,Category,Message
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or £10,000 ..."
4,spam,okmail: Dear Dave this is your final notice to...


In [13]:
sms_data.shape

(5559, 2)

In [14]:
sms_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5559 entries, 0 to 5558
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5559 non-null   object
 1   Message   5559 non-null   object
dtypes: object(2)
memory usage: 87.0+ KB


In [15]:
sms_data['Category'].value_counts()

Category
ham     4812
spam     747
Name: count, dtype: int64

Label Encoding

In [16]:
# Label spam sms as 0 ; ham sms as 1
sms_data.loc[sms_data['Category'] == 'spam', 'Category',] = 0
sms_data.loc[sms_data['Category'] == 'ham', 'Category',] = 1

In [17]:
sms_data.head()

Unnamed: 0,Category,Message
0,1,Hope you are having a good week. Just checking in
1,1,K..give back my thanks.
2,1,Am also doing in cbe only. But have to pay.
3,0,"complimentary 4 STAR Ibiza Holiday or £10,000 ..."
4,0,okmail: Dear Dave this is your final notice to...


In [18]:
#Separating the data as texts and label
X = sms_data['Message']
Y = sms_data['Category']

In [19]:
print(X)

0       Hope you are having a good week. Just checking in
1                                 K..give back my thanks.
2             Am also doing in cbe only. But have to pay.
3       complimentary 4 STAR Ibiza Holiday or £10,000 ...
4       okmail: Dear Dave this is your final notice to...
                              ...                        
5554    You are a great role model. You are giving so ...
5555    Awesome, I remember the last time we got someb...
5556    If you don't, your prize will go to another cu...
5557    SMS. ac JSco: Energy is high, but u may not kn...
5558                      Shall call now dear having food
Name: Message, Length: 5559, dtype: object


In [20]:
print(Y)

0       1
1       1
2       1
3       0
4       0
       ..
5554    1
5555    1
5556    0
5557    0
5558    1
Name: Category, Length: 5559, dtype: object


In [21]:
#Splitting the data into training data and testing data
X_train, X_test,Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 2)

In [22]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5559,)
(4447,)
(1112,)


Feature Extraction

In [23]:
feature_extraction  = TfidfVectorizer(min_df = 1, stop_words = 'english', lowercase = True)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

#Convert Y_train and Y_test data as integers
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [24]:
print(X_train_features)

  (0, 4623)	0.823455395308547
  (0, 3331)	0.5673810112589641
  (1, 3903)	0.4357265957313381
  (1, 1451)	0.7681743219414209
  (1, 5601)	0.4690954539133984
  (2, 3206)	0.624094058509506
  (2, 3399)	0.41894076689433823
  (2, 6949)	0.27801616343296237
  (2, 3425)	0.4750464218326553
  (2, 5601)	0.3633636607327767
  (3, 7445)	0.18819706279852946
  (3, 4606)	0.10279369639694426
  (3, 5403)	0.161119468392088
  (3, 1778)	0.1732151718174184
  (3, 3020)	0.12291269372081944
  (3, 6918)	0.1556874829897335
  (3, 2632)	0.3289026548071519
  (3, 4621)	0.15823328083630736
  (3, 2465)	0.18819706279852946
  (3, 4277)	0.16445132740357596
  (3, 1896)	0.18819706279852946
  (3, 7054)	0.18819706279852946
  (3, 3913)	0.18819706279852946
  (3, 6612)	0.11819529570773202
  (3, 5698)	0.18819706279852946
  :	:
  (4443, 2392)	0.525976588252023
  (4443, 6567)	0.3515562006830387
  (4443, 6114)	0.36254092215253014
  (4444, 234)	0.37395022238866327
  (4444, 2130)	0.35653633973394605
  (4444, 2687)	0.34418099336415503
  (

In [25]:
print(X_test_features)

  (0, 7364)	0.17363011260734035
  (0, 7183)	0.21818193369124278
  (0, 7122)	0.1481273365578714
  (0, 6949)	0.13108782173708264
  (0, 6874)	0.18105277300124878
  (0, 6719)	0.29426753350904916
  (0, 6718)	0.21818193369124278
  (0, 6717)	0.2016369569866806
  (0, 6567)	0.1511698980562992
  (0, 5611)	0.25192863192974024
  (0, 5540)	0.15792464852488608
  (0, 5403)	0.25192863192974024
  (0, 4829)	0.2708416413045962
  (0, 4799)	0.23100732376725258
  (0, 4606)	0.16072964715389373
  (0, 4599)	0.216448404047274
  (0, 2943)	0.29426753350904916
  (0, 2833)	0.23987429625172696
  (0, 1309)	0.192187989973568
  (0, 1014)	0.20389088659036247
  (0, 479)	0.28056426502350806
  (1, 7197)	0.2186500092858096
  (1, 6147)	0.22259703961362995
  (1, 5700)	0.19919689190095893
  (1, 4876)	0.234711838651194
  :	:
  (1107, 4403)	0.5089095577906074
  (1107, 3904)	0.21223006816490234
  (1107, 2888)	0.2969023608815326
  (1107, 2539)	0.2589904865096283
  (1107, 2150)	0.2756180403000198
  (1107, 1857)	0.22657441036926962


Training the model

In [26]:
model = LogisticRegression()

In [27]:
model.fit(X_train_features, Y_train)

Evaluating The trained Model

In [28]:
#prediction on training data
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [29]:
print("Accuracy on training data : ", accuracy_on_training_data)

Accuracy on training data :  0.9685181020912975


In [30]:
#prediction on testing data
prediction_on_testing_data = model.predict(X_test_features)
accuracy_on_testing_data = accuracy_score(Y_test, prediction_on_testing_data)

In [31]:
print('Accuracy on test data: ', accuracy_on_testing_data)

Accuracy on test data:  0.9577338129496403


Building a Predictive System

In [32]:
input_sms = ["Just forced myself to eat a slice. I'm really not hungry tho. This sucks. Mark is getting worried. He knows I'm sick when I turn down pizza. Lol"]

#Convert text to feature vectors
input_sms_feature = feature_extraction.transform(input_sms)

#Makin Prediction
prediction = model.predict(input_sms_feature)
print(prediction)

if prediction[0] == 1:
  print('Ham SMS')

else:
  print('Spam SMS')

[1]
Ham SMS
