In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import xgboost as xgb
from sklearn.metrics import roc_curve, auc, balanced_accuracy_score,recall_score, confusion_matrix

In [2]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv', encoding='latin-1')

In [4]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
df.columns = ['label', 'text']

In [5]:
df["label_int"] = df["label"].map({'ham':0, 'spam':1})

In [6]:
df

Unnamed: 0,label,text,label_int
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will Ì_ b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [7]:
text = list(df['text'])

In [9]:
df_stop_words = pd.DataFrame({"Words":stopwords.words('english')})

In [11]:
df_stop_words.to_excel('stop_words.xlsx', index = False)

In [12]:
lemmatizer = WordNetLemmatizer()

corpus = []
for i in range(len(text)):
    r = re.sub('[^a-zA-Z]', ' ', text[i])
    r = r.lower()
    r = r.split()
    r = [word for word in r if word not in stopwords.words('english')]
    r = [lemmatizer.lemmatize(word) for word in r]
    r = ' '.join(r)
    corpus.append(r)

In [13]:
df['text_clean'] = corpus
df.head()

Unnamed: 0,label,text,label_int,text_clean
0,ham,"Go until jurong point, crazy.. Available only ...",0,go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,0,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry wkly comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,0,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,nah think go usf life around though


In [14]:
# Create Feature and Label sets
X = df['text_clean']
y = df['label_int']

# train test split (80% train - 20% test)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print('Training Data :', X_train.shape)
print('Testing Data : ', X_test.shape)

Training Data : (3900,)
Testing Data :  (1672,)


In [15]:
tfidf_vect = TfidfVectorizer(analyzer='word')
xtrain_tfidf =  tfidf_vect.fit_transform(X_train)
xtest_tfidf =  tfidf_vect.transform(X_test)

In [16]:
xtrain_tfidf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [27]:
xtest_tfidf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
xtrain_tfidf.shape

(3900, 5850)

## Regresion Logística

In [18]:
model = LogisticRegression()
model.fit(xtrain_tfidf, y_train)

In [19]:
predictions = model.predict(xtest_tfidf)

In [20]:
df_confusion = pd.DataFrame(confusion_matrix(y_test, predictions), index=['ham','spam'], columns=['ham_pred','spam_pred'])
df_confusion

Unnamed: 0,ham_pred,spam_pred
ham,1449,4
spam,51,168


In [21]:
balanced_acc = balanced_accuracy_score(y_test, predictions)
sensitivity = recall_score(y_test, predictions)
print("Training Score:", model.score(xtrain_tfidf, y_train))
print("Testing Score:", model.score(xtest_tfidf, y_test))
print("Ham/Spam Prediction Score on Test Data:", balanced_acc)
print("Sensitivity (Recall):", sensitivity)

Training Score: 0.97
Testing Score: 0.9671052631578947
Ham/Spam Prediction Score on Test Data: 0.8821851813442193
Sensitivity (Recall): 0.7671232876712328


## Maquina de Vectores de Soporte

In [22]:
model = LinearSVC()
model.fit(xtrain_tfidf, y_train)

In [23]:
predictions = model.predict(xtest_tfidf)

In [24]:
df_confusion = pd.DataFrame(confusion_matrix(y_test, predictions), index=['ham','spam'], columns=['ham_pred','spam_pred'])
df_confusion

Unnamed: 0,ham_pred,spam_pred
ham,1451,2
spam,23,196


In [25]:
balanced_acc = balanced_accuracy_score(y_test, predictions)
sensitivity = recall_score(y_test, predictions)
print("Training Score:", model.score(xtrain_tfidf, y_train))
print("Testing Score:", model.score(xtest_tfidf, y_test))
print("Ham/Spam Prediction Score on Test Data:", balanced_acc)
print("Sensitivity (Recall):", sensitivity)

Training Score: 0.9992307692307693
Testing Score: 0.9850478468899522
Ham/Spam Prediction Score on Test Data: 0.9468003532291873
Sensitivity (Recall): 0.8949771689497716


In [28]:
import joblib

In [29]:
joblib.dump(model, 'modelo_svm.pkl')

['modelo_svm.pkl']

## XGBoost

In [None]:
dtrain = xgb.DMatrix(xtrain_tfidf, label=y_train)
dtest = xgb.DMatrix(xtest_tfidf, label=y_test)

In [None]:
params = {
    "objective": "binary:logistic",
    "max_depth": 6,
    "eta": 0.1,
    "eval_metric": "logloss"
}

In [None]:
model = xgb.train(params, dtrain, num_boost_round=100)

In [None]:
predictions = model.predict(dtest)
predictions = [1 if p > 0.5 else 0 for p in predictions]

In [None]:
balanced_acc = balanced_accuracy_score(y_test, predictions)
sensitivity = recall_score(y_test, predictions)
print("Ham/Spam Prediction Score on Test Data:", balanced_acc)
print("Sensitivity (Recall):", sensitivity)

Ham/Spam Prediction Score on Test Data: 0.911380228887135
Sensitivity (Recall): 0.8303571428571429
