In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Reading and exploring data

In [2]:
data = pd.read_csv('data/train_spam.csv')

In [3]:
data.head()

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...


In [4]:
data.shape

(16278, 2)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16278 entries, 0 to 16277
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text_type  16278 non-null  object
 1   text       16278 non-null  object
dtypes: object(2)
memory usage: 254.5+ KB


In [6]:
data.text_type.value_counts()

text_type
ham     11469
spam     4809
Name: count, dtype: int64

Target value is imbalanced

In [7]:
data.isna().sum()

text_type    0
text         0
dtype: int64

No missing values

### Feature engineering

In [8]:
data['label'] = data['text_type'].apply(lambda x: 0 if x == 'ham' else 1)

In [10]:
data.head()

Unnamed: 0,text_type,text,label,text_length
0,ham,make sure alex knows his birthday is over in f...,0,86
1,ham,a resume for john lavorato thanks vince i will...,0,520
2,spam,plzz visit my website moviesgodml to get all m...,1,126
3,spam,urgent your mobile number has been awarded wit...,1,139
4,ham,overview of hr associates analyst project per ...,0,733


## Preprocessing

Lets convert data to vector format using `bag-of-words` approach. First, lets write a function that will split meassge into words and delete very common words.

In [16]:
import string
from nltk.corpus import stopwords

def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    STOPWORDS = stopwords.words('english') + ['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure']
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return ' '.join([word for word in nopunc.split() if word.lower() not in STOPWORDS])

In [17]:
data['clean_text'] = data['text'].apply(text_process)

In [18]:
data.sample(5)

Unnamed: 0,text_type,text,label,text_length,clean_text
11348,ham,url url date not supplied the controversial cl...,0,142,url url date supplied controversial claim coul...
13874,ham,ca for henwood engagement sandeep it probably ...,0,701,ca henwood engagement sandeep probably makes s...
3125,spam,⚠️instagram lifetime followers⚠️ ✔️500 folower...,1,596,⚠️instagram lifetime followers⚠️ ✔️500 folower...
4605,ham,listo 319 isranir rice edu demianen rice edu t...,0,672,listo 319 isranir rice edu demianen rice edu t...
8465,ham,you stayin out of trouble strangersaw dave the...,0,155,stayin trouble strangersaw dave day hes sorte...


Lets split data into training and testing sets and read train_spam data before vectorizing

In [19]:
from sklearn.model_selection import train_test_split

X, y = data['clean_text'], data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [20]:
test_data = pd.read_csv('data/test_spam.csv')
test_data['clean_text'] = test_data['text'].apply(text_process)
X_subm = test_data['clean_text']
X_subm

0       j jim whitehead ejw cse ucsc edu writes j open...
1       original message bitbitch magnesium net people...
2       java managers vince durasoft taught java class...
3                               youtuber name saiman says
4       underpriced issue high return equity oil gas a...
                              ...                        
4065    husband wifetum meri zindagi hoorwifeor kyatel...
4066    baylor enron case study cindy yes shall co aut...
4067                                   boring compared tp
4068    hellogorgeous hows fone charge lst nitw wen te...
4069    energy conference mark really swamped would li...
Name: clean_text, Length: 4070, dtype: object

In [29]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
X_for_fit = pd.concat([X, X_subm])
vect = CountVectorizer()
vect.fit(pd.concat([X, X_subm]))
X_for_fit_dtm = vect.transform(X_for_fit)
X_train_dtm = vect.transform(X_train)
X_test_dtm = vect.transform(X_test)
X_subm_dtm = vect.transform(X_subm)

tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_for_fit_dtm)
tfidf_transformer.transform(X_train_dtm)
tfidf_transformer.transform(X_test_dtm)
tfidf_transformer.transform(X_subm_dtm)

<4070x60650 sparse matrix of type '<class 'numpy.float64'>'
	with 113896 stored elements in Compressed Sparse Row format>

## Building model

In [30]:
from sklearn.metrics import classification_report, roc_auc_score

### Logistic Regression

In [38]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
lr_model.fit(X_train_dtm, y_train)
y_pred = lr_model.predict(X_test_dtm)
print(classification_report(y_pred, y_test))
print(f"ROC AUC: {roc_auc_score(y_pred, y_test)}")

              precision    recall  f1-score   support

           0       0.98      0.95      0.96      2336
           1       0.88      0.94      0.91       920

    accuracy                           0.95      3256
   macro avg       0.93      0.95      0.94      3256
weighted avg       0.95      0.95      0.95      3256

ROC AUC: 0.9480959648600358


### Naive Bayes

In [39]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X_train_dtm, y_train)
y_pred = nb_model.predict(X_test_dtm)

print(classification_report(y_pred, y_test))
print(f"ROC AUC: {roc_auc_score(y_pred, y_test)}")

              precision    recall  f1-score   support

           0       0.90      0.98      0.94      2098
           1       0.95      0.80      0.87      1158

    accuracy                           0.91      3256
   macro avg       0.92      0.89      0.90      3256
weighted avg       0.92      0.91      0.91      3256

ROC AUC: 0.8898317502811296


### SVM

In [40]:
from sklearn.svm import SVC

svm_model = SVC()
svm_model.fit(X_train_dtm, y_train)
y_pred = svm_model.predict(X_test_dtm)

print(classification_report(y_pred, y_test))
print(f"ROC AUC: {roc_auc_score(y_pred, y_test)}")

              precision    recall  f1-score   support

           0       0.99      0.93      0.96      2425
           1       0.82      0.97      0.89       831

    accuracy                           0.94      3256
   macro avg       0.90      0.95      0.92      3256
weighted avg       0.94      0.94      0.94      3256

ROC AUC: 0.947053481707544


### Random Forest

In [41]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train_dtm, y_train)
y_pred = rf_model.predict(X_test_dtm)

print(classification_report(y_pred, y_test))
print(f"ROC AUC: {roc_auc_score(y_pred, y_test)}")

              precision    recall  f1-score   support

           0       0.99      0.91      0.95      2478
           1       0.77      0.98      0.86       778

    accuracy                           0.93      3256
   macro avg       0.88      0.94      0.91      3256
weighted avg       0.94      0.93      0.93      3256

ROC AUC: 0.9436376877446983


Surprisingly, logistic regression, being simple model, showed best roc auc score

## Submission

In [45]:
X_dtm = vect.transform(X)
tfidf_transformer.transform(X_dtm)


lr_model = LogisticRegression()
lr_model.fit(X_dtm, y)
y_pred = lr_model.predict(X_subm_dtm)


In [85]:
submission = pd.concat([test_data.drop('clean_text', axis=1), pd.Series(y_pred)], axis=1)
submission = submission.rename({0: 'score'}, axis=1)
submission = submission.set_index('score')
submission

Unnamed: 0_level_0,text
score,Unnamed: 1_level_1
0,j jim whitehead ejw cse ucsc edu writes j you ...
0,original message from bitbitch magnesium net p...
0,java for managers vince durasoft who just taug...
0,there is a youtuber name saiman says
1,underpriced issue with high return on equity t...
...,...
0,husband to wifetum meri zindagi hoorwifeor kya...
0,baylor enron case study cindy yes i shall co a...
0,boring as compared to tp
0,hellogorgeous hows u my fone was on charge lst...


In [86]:
submission.to_csv('submission.csv')