In [19]:
import coremltools as cml

In [1]:
#Sklearn and numpy
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [3]:
# Reading in and parsing data
# Each usable line is expected to have the form "<label>\t<message>"; the
# trailing newline is left on the message, as before.
sms_data = []
# The context manager guarantees the file handle is closed, even if parsing
# raises part-way through the file.
with open('SMSSpamCollection.txt', 'r') as raw_data:
    for line in raw_data:
        # Split on the first tab only: a tab inside the message text must not
        # create a third field, otherwise the rows become ragged and cannot be
        # turned into a 2-column array later.
        split_line = line.split("\t", 1)
        if len(split_line) != 2:
            # Blank or malformed line with no label/message pair; skip it so
            # every stored row has exactly two fields.
            continue
        sms_data.append(split_line)

In [7]:
# Turn the parsed rows into an array so whole columns can be sliced out:
# column 0 is used as the target (y), column 1 as the input text (X).
sms_data = np.array(sms_data)
y, X = sms_data[:, 0], sms_data[:, 1]

# Hold out 15% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=22,
)

# Show the training inputs followed by the training labels, one per line.
print(X_train, y_train, sep="\n")

['Chinatown got porridge, claypot rice, yam cake, fishhead beehoon... Either we eat cheap den go cafe n tok or go nydc or somethin...\n'
 'Really? I crashed out cuddled on my sofa.\n'
 "Lol they don't know about my awesome phone. I could click delete right now if I want.\n"
 ...
 'Thank You for calling.Forgot to say Happy Onam to you Sirji.I am fine here and remembered you when i met an insurance person.Meet You in Qatar Insha Allah.Rakhesh, ex Tata AIG who joined TISSCO,Tayseer.\n'
 'Then anything special?\n'
 "Only 2% students solved this CAT question in 'xam... 5+3+2= &lt;#&gt;  9+2+4= &lt;#&gt;  8+6+3= &lt;#&gt;  then 7+2+5=????? Tell me the answer if u r brilliant...1thing.i got d answr.\n"]
['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']


In [8]:
# Building Pipelines
# Six candidates: each of three classifiers (multinomial naive Bayes, linear
# SVM, random forest) is paired once with raw term counts (CountVectorizer)
# and once with TF-IDF weights (TfidfVectorizer).
pipeline_1 = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])
pipeline_2 = Pipeline([('vect', TfidfVectorizer()), ('clf', MultinomialNB())])
pipeline_3 = Pipeline([('vect', CountVectorizer()), ('clf', LinearSVC())])
pipeline_4 = Pipeline([('vect', TfidfVectorizer()), ('clf', LinearSVC())])
# Random forests are stochastic (bootstrap samples and feature sub-sampling),
# so they are seeded to make the reported scores reproducible across runs.
# 22 is the same seed value the train/test split uses.
pipeline_5 = Pipeline([('vect', CountVectorizer()), ('clf', RandomForestClassifier(random_state=22))])
pipeline_6 = Pipeline([('vect', TfidfVectorizer()), ('clf', RandomForestClassifier(random_state=22))])
# Kept in this order so the evaluation output can be matched back to a pipeline
# by position.
pipelines = [pipeline_1, pipeline_2, pipeline_3, pipeline_4, pipeline_5, pipeline_6]

In [9]:
# Train every candidate on the training split and print its per-class
# precision/recall/F1 on the held-out split, in the order of `pipelines`.
class_names = ["ham", "spam"]
for pipeline in pipelines:
    # Pipeline.fit returns the fitted pipeline, so prediction can be chained.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)
    report = classification_report(y_test, y_pred, target_names=class_names)
    print(report)

             precision    recall  f1-score   support

        ham       0.99      0.99      0.99       730
       spam       0.96      0.91      0.93       107

avg / total       0.98      0.98      0.98       837

             precision    recall  f1-score   support

        ham       0.96      1.00      0.98       730
       spam       1.00      0.69      0.82       107

avg / total       0.96      0.96      0.96       837

             precision    recall  f1-score   support

        ham       0.98      1.00      0.99       730
       spam       0.99      0.89      0.94       107

avg / total       0.98      0.98      0.98       837

             precision    recall  f1-score   support

        ham       0.99      1.00      0.99       730
       spam       0.98      0.90      0.94       107

avg / total       0.98      0.98      0.98       837

             precision    recall  f1-score   support

        ham       0.96      1.00      0.98       730
       spam       1.00      0.75 

In [16]:
# Creating and saving an .mlmodel file and a list of words
# Fit TF-IDF on the complete message set (X), not just the training split.
vectorizer = TfidfVectorizer()
vectorized = vectorizer.fit_transform(X)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); fall back to the old name so the cell also
# runs on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Preview the first two vocabulary entries as a quick sanity check.
for feature in feature_names[:2]:
    print(feature)

# Write the vocabulary one word per line, in the order the vectorizer reports
# it. The context manager closes the file even if a write fails.
# NOTE(review): presumably this file is shipped with the .mlmodel so a client
# can rebuild the feature vector in the same order; confirm with the consumer.
with open('words_ordered.txt', 'w') as words:
    for feature in feature_names:
        words.write(feature + '\n')

00
000


In [20]:
# Fit a linear SVM on the TF-IDF matrix (`vectorized`) and labels (`y`);
# fit() returns the estimator itself, so construction and training are chained.
model = LinearSVC().fit(vectorized, y)

# Convert the fitted estimator to Core ML with an input feature named
# "message" and an output feature named 'label', then write it to disk.
coreml_model = cml.converters.sklearn.convert(
    model,
    input_features="message",
    output_feature_names='label',
)
coreml_model.save('MessageClassifier.mlmodel')