# SMS Spam classification

Run the following cell to load the data and the English-language spaCy model.

In [1]:
import pandas as pd
import spacy
import numpy as np

# Read the labelled SMS corpus; each row holds a Category label and the Message text.
data = pd.read_csv("spam_data.csv")

# Large English spaCy model used to produce the document vectors.
nlp = spacy.load("en_core_web_lg")

# Turn every message into its spaCy document vector and collect them in one array.
# NOTE(review): disable_pipes() receives no pipe names here — confirm which
# components (if any) it switches off in the installed spaCy version.
with nlp.disable_pipes():
    doc_vectors = np.array(
        [nlp(message).vector for message in data["Message"]]
    )

# Preview the first rows as a sanity check on the load.
print(data.head(5))

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [2]:
from sklearn.model_selection import train_test_split

# Features are the document vectors; targets are the Category labels.
X = doc_vectors
y = data.Category

# Hold out 30% of the messages for testing. The split is shuffled with a fixed
# seed so it is repeatable, and stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=388,
    shuffle=True,
    stratify=y,
)

# Renumber the training labels from 0 so they can be indexed by position,
# in step with the rows of X_train.
y_train = y_train.reset_index(drop=True)

print(f"y list shape: {y.shape} \ny_train list shape: {y_train.shape}")

y list shape: (5572,) 
y_train list shape: (3900,)


In [None]:
# Importing models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Model evaluation
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# Hyper-parameter search
from sklearn.model_selection import GridSearchCV
from tune_sklearn import TuneGridSearchCV

# Estimator to tune. It is handed to the search unfitted: the search fits its
# own copies for every candidate, so fitting it here first would be wasted work.
log_reg = LogisticRegression()

print("Starting grid search")

# Hyper-parameter grid for logistic regression.
# The grid is split in two because not every solver supports every penalty:
# 'newton-cg' and 'lbfgs' only accept the L2 penalty, whereas 'liblinear'
# handles both L1 and L2. A single cross-product of solvers x penalties would
# generate invalid solver/penalty pairs whose fits fail.
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2', 'l1']
c_values = [100, 10, 1.0, 0.1, 0.01]
parameter_space = [
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['l2'], 'C': c_values},
    {'solver': ['liblinear'], 'penalty': penalty, 'C': c_values},
]

# 3-fold cross-validated grid search, fitted on the training split only.
clf = TuneGridSearchCV(log_reg, parameter_space, cv=3)
clf.fit(X_train, y_train)



# Report the mean cross-validated score (with a two-standard-deviation band)
# for every parameter combination the search evaluated.
search_results = clf.cv_results_
means = search_results['mean_test_score']
stds = search_results['std_test_score']
for params, mean, std in zip(search_results['params'], means, stds):
    print(f"{mean:.3f} (+/-{std * 2:.3f}) for {params!r}")

# Best parameter set
print('Best parameters found:\n', clf.best_params_)

# Score the fitted search object on the held-out test split.
model = clf
print(f"------------Model: {model}------------")
print(f"Model score: {model.score(X_test,y_test)*100:.3f}%")

# Evaluate the configuration the search selected, not the search object itself:
# fitting or cross-validating the search object would re-run the whole grid
# search inside every fold and overwrite the fitted search scored above.
best_model = model.best_estimator_

# One 10-fold stratified splitter shared by every metric below.
skf = StratifiedKFold(n_splits=10)
skf_splits = skf.get_n_splits(X_train, y_train)

# Per-fold score on the training split (the estimator's default scorer).
# cross_val_score fits a fresh copy of the estimator for each fold, so nothing
# fitted earlier is mutated, and it selects rows by position, so it does not
# depend on how y_train happens to be indexed.
skf_scores = cross_val_score(best_model, X_train, y_train, cv=skf)

# F1 scores cross-validated over the full data set (X, y).
cv_scores_macro = cross_val_score(best_model, X, y, cv=skf, scoring='f1_macro')
cv_scores_micro = cross_val_score(best_model, X, y, cv=skf, scoring='f1_micro')

print(f"Cross-validation f1-macro score: {np.mean(cv_scores_macro) * 100:.3f}%")
print(f"Cross-validation f1-micro score: {np.mean(cv_scores_micro) * 100:.3f}%")
print(f"Stratified k-fold score: {np.mean(skf_scores) * 100:.3f}%\n")

# Only some estimators (e.g. MLPClassifier) record a training loss curve;
# logistic regression does not, so the plot is skipped when the attribute is
# missing instead of raising. pandas plotting is used because this notebook
# never imports matplotlib.pyplot.
if hasattr(best_model, "loss_curve_"):
    ax = pd.Series(best_model.loss_curve_).plot(title="Training loss curve")
    ax.set(xlabel="Iteration", ylabel="Loss")


Starting grid search


The actor or task with ID ffffffffffffffff6b98451801000000 cannot be scheduled right now. It requires {CPU: 1.000000} for placement, but this node only has remaining {node:192.168.0.12: 1.000000}, {GPU: 1.000000}, {object_store_memory: 1.513672 GiB}, {memory: 4.492188 GiB}. In total there are 0 pending tasks and 4 pending actors on this node. This is likely due to all cluster resources being claimed by actors. To resolve the issue, consider creating fewer actors or increase the resources available to this Ray cluster. You can ignore this message if this Ray cluster is expected to auto-scale.
