In [1]:
#Import libraries needed
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics

In [2]:
# Load the SMS data; the file is parsed as tab-delimited.
df = pd.read_csv("spam.csv", sep="\t")

In [3]:
# Peek at the first rows to confirm the file parsed into the expected columns.
df.head()

Unnamed: 0,label,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Per-column summary; every column is object dtype, so this reports count/unique/top/freq.
df.describe()

Unnamed: 0,label,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [5]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   label       5572 non-null   object
 1   message     5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [6]:
# Missing-value count per column
df.isna().sum()

label            0
message          0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [7]:
# Class distribution of the target column
df["label"].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [8]:
# Features are the raw message text; the target is the ham/spam label.
X, Y = df['message'], df['label']

In [9]:
# Hold out 20% of the messages for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [10]:
# Bag-of-words vectorizer with default settings
count_vext = CountVectorizer()

In [11]:
# First five messages
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: message, dtype: object

In [12]:
# Learn the vocabulary from the training messages and build their document-term
# count matrix in a single pass (separate fit + transform tokenized the corpus twice).
X_train_counts = count_vext.fit_transform(X_train)

In [13]:
# Display the sparse document-term matrix (rows = training messages, columns = vocabulary terms).
X_train_counts

<4457x7777 sparse matrix of type '<class 'numpy.int64'>'
	with 59132 stored elements in Compressed Sparse Row format>

In [14]:
# Number of training messages; should match the row count of X_train_counts.
X_train.shape

(4457,)

In [15]:
# TfidfTransformer rescales a matrix of raw term counts (such as the CountVectorizer
# output) into tf-idf weights; it does not compute the counts itself.
tfidf_transformer=TfidfTransformer()

In [16]:
# Learn idf weights from the training counts and convert them to tf-idf in one call.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [17]:
# The tf-idf matrix keeps the same (documents, terms) shape as the count matrix.
X_train_tfidf.shape

(4457, 7777)

In [18]:
# TfidfVectorizer tokenizes raw documents and produces tf-idf features directly,
# combining the CountVectorizer and TfidfTransformer steps used above.
vectorizer=TfidfVectorizer()

In [19]:
# Fit on the raw training messages; note this rebinds X_train_tfidf, replacing the
# matrix produced by tfidf_transformer in the earlier cell.
X_train_tfidf=vectorizer.fit_transform(X_train)

In [20]:
# Linear support-vector classifier with default hyperparameters.
clf=LinearSVC()

In [21]:
# Train the standalone classifier on the tf-idf features of the training split.
clf.fit(X_train_tfidf,Y_train)

In [22]:
# Bundle vectorization and classification so raw text can be passed straight
# to fit/predict.
text_clf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)

In [23]:
# Fit the whole pipeline on the raw training messages and their labels.
text_clf.fit(X_train, Y_train)

In [24]:
# Predict labels for the held-out messages.
predictions=text_clf.predict(X_test)

In [25]:
# Rows are true labels, columns are predicted labels (sorted label order: ham, spam).
print(confusion_matrix(Y_test, predictions))

[[960   0]
 [ 12 143]]


In [26]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(Y_test, predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       960
        spam       1.00      0.92      0.96       155

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [27]:
# Overall accuracy on the test split.
metrics.accuracy_score(Y_test,predictions)

0.989237668161435

In [28]:
# Spot-check on an ordinary greeting (predict takes an iterable of documents, hence the list).
text_clf.predict(["Hello, my Name is Kratos"])

array(['ham'], dtype=object)

In [29]:
# Spot-check on a promotional-style message.
text_clf.predict(["Free 999Gb of data with Globe Mobile Phone"])

array(['spam'], dtype=object)

In [31]:
# Save the dataset to the working directory. A relative path keeps the cell
# runnable on any machine (the input 'spam.csv' is likewise read from the
# working directory); index=False leaves the row index out of the file.
# NOTE(review): df appears to be the unmodified input frame here -- the model's
# predictions are not part of what gets written. Confirm that is intended.
df.to_csv('cambri_spam_output.csv', index=False)