## Making our first model
Classifying whether a text message is spam or not (ham)

In [None]:
import pandas as pd

In [None]:
# Read the labelled messages; the encoding is passed explicitly as latin-1.
df = pd.read_csv("spam.csv", encoding="latin-1")
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Summary statistics (count / unique / top / freq) of the data, one row per category.
df.groupby(by="Category").describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [None]:
# Spam-to-ham ratio, derived from the data instead of hard-coded counts so the
# figure stays correct if the dataset changes.
# Note: this is a ratio (spam / ham), not the share of spam in the whole dataset.
category_counts = df['Category'].value_counts()
float(category_counts['spam'] / category_counts['ham'])

0.15481865284974095

**About 13% of the messages are spam and 87% are ham (747 spam vs. 4825 ham, i.e. a spam-to-ham ratio of about 0.15). This means the classes are imbalanced.**

In [None]:
# Keep only the rows labelled as spam.
spam_mask = df['Category'].eq('spam')
df_spam = df.loc[spam_mask]
df_spam.shape

(747, 2)

In [None]:
# Keep only the rows labelled as ham.
ham_mask = df['Category'].eq('ham')
df_ham = df.loc[ham_mask]
df_ham.shape

(4825, 2)

In [None]:
# Downsample ham to the size of the spam class so both classes are balanced.
# A fixed random_state makes the sample (and therefore every downstream result)
# reproducible across kernel restarts; 2023 matches the seed value used for the
# train/test split later in the notebook.
df_ham_downsampled = df_ham.sample(n=df_spam.shape[0], random_state=2023)
df_ham_downsampled.shape

(747, 2)

In [None]:
# Stack the downsampled ham rows on top of the spam rows.
balanced_parts = [df_ham_downsampled, df_spam]
df_balanced = pd.concat(balanced_parts)
df_balanced.shape

(1494, 2)

In [None]:
# Count the rows per category in the balanced frame.
category_column = df_balanced['Category']
category_column.value_counts()

ham     747
spam    747
Name: Category, dtype: int64

## Changing the names of the attributes

In [None]:
# Rename the columns to the names used in the rest of the notebook.
column_renames = {'Category': 'target', 'Message': 'text'}
df_balanced = df_balanced.rename(columns=column_renames)
df_balanced.head()

Unnamed: 0,target,text
4031,ham,"Cool, I'll text you in a few"
105,ham,Umma my life and vava umma love you lot dear
2948,ham,Leave it. U will always be ignorant.
2508,ham,Yup...
3847,ham,Pls go there today &lt;#&gt; . I dont want an...


In [None]:
# Encode the labels numerically: ham -> 0, spam -> 1.
label_to_id = {'ham': 0, 'spam': 1}
df_balanced['target'] = df_balanced['target'].map(label_to_id)
df_balanced.head(5)

Unnamed: 0,target,text
4031,0,"Cool, I'll text you in a few"
105,0,Umma my life and vava umma love you lot dear
2948,0,Leave it. U will always be ignorant.
2508,0,Yup...
3847,0,Pls go there today &lt;#&gt; . I dont want an...


In [None]:
# Show the text of the first row of the balanced frame.
print(df_balanced['text'].iloc[0])

Cool, I'll text you in a few


## Data preprocessing

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
stop_words = set(stopwords.words('english'))


def clean_text(text):
    """Normalise one message for the bag-of-words model.

    Steps, applied in this order:
      1. remove punctuation (every character that is neither a word character
         nor whitespace),
      2. convert to lowercase,
      3. drop English stopwords,
      4. remove digits,
      5. drop words of two characters or fewer.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned message, with the remaining words separated by single spaces.

    Notes
    -----
    Punctuation is stripped before the stopword filter, so a contraction such
    as "I'll" becomes "ill" and survives the filter. The order is kept as is
    so the results of the notebook do not change.
    """
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    # The text is already lowercase here, so words are compared directly.
    text = ' '.join(word for word in text.split() if word not in stop_words)
    text = re.sub(r'\d+', '', text)
    # Digit removal can leave very short fragments behind; filter them last.
    return ' '.join(word for word in text.split() if len(word) > 2)


# One pass over the column instead of five separate apply() calls.
df_balanced['text'] = df_balanced['text'].apply(clean_text)

In [None]:
# Preview the first five rows of the balanced frame.
df_balanced.head(n=5)

Unnamed: 0,target,text
4031,0,cool ill text
105,0,umma life vava umma love lot dear
2948,0,leave always ignorant
2508,0,yup
3847,0,pls today ltgt dont want excuses


## Building a Machine learning model

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
x = df_balanced['text']
y = df_balanced['target']
ran_state = 2023
t_size = 0.2

# Split the raw text BEFORE vectorising. Fitting the vectoriser on the full
# corpus would let words that occur only in the test set enter the vocabulary,
# i.e. information from the test set would leak into the features.
text_train, text_test, y_train, y_test = train_test_split(
    x, y, test_size=t_size, random_state=ran_state, stratify=y
)

# Learn the vocabulary from the training split only, then reuse it for the test split.
cv = CountVectorizer()
x_train = cv.fit_transform(text_train)
x_test = cv.transform(text_test)

## 1) Using Logistic Regression

In [None]:
# 1. Fit a logistic-regression classifier on the training split
#    (fit() returns the estimator itself, so it can be chained).
lr_model = LogisticRegression().fit(x_train, y_train)

# 2. Predict the labels of the test split
y_lr_model_predict = lr_model.predict(x_test)

In [None]:
# 3. Report the test-set metrics of the logistic-regression model
print('Logistic Regression:')
lr_metrics = (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 score:', f1_score),
)
for metric_label, metric_fn in lr_metrics:
    print(metric_label, metric_fn(y_test, y_lr_model_predict))

Logistic Regression:
Accuracy: 0.959866220735786
Precision: 1.0
Recall: 0.92
F1 score: 0.9583333333333334


## 2) Using Linear SVM

In [None]:
# Train a linear SVM on the training split.
# random_state is fixed (same seed as the train/test split) so that repeated
# runs of the notebook give the same fitted model.
svm_model = LinearSVC(random_state=ran_state)
svm_model.fit(x_train, y_train)

# Predict the labels of the test split.
y_svm_model_predict = svm_model.predict(x_test)

In [None]:
# Report the test-set metrics of the linear SVM
print('Linear SVM:')
svm_metrics = {
    'Accuracy:': accuracy_score,
    'Precision:': precision_score,
    'Recall:': recall_score,
    'F1 score:': f1_score,
}
for metric_label, metric_fn in svm_metrics.items():
    print(metric_label, metric_fn(y_test, y_svm_model_predict))

Linear SVM:
Accuracy: 0.959866220735786
Precision: 1.0
Recall: 0.92
F1 score: 0.9583333333333334


## Use model on real-life data

In [None]:
# 1. Serialize the best model
import joblib

In [None]:
import os

model_file_name = "svm_best_model.joblib"
# The path is built by plain string concatenation (here and where the model is
# loaded again), so the folder name must end with a separator. Without it the
# file was written as "Gradiosvm_best_model.joblib" in the working directory
# instead of inside the "Gradio" folder.
model_folder = 'Gradio/'
# joblib.dump does not create missing directories.
os.makedirs(model_folder, exist_ok=True)

# NOTE(review): only the classifier is saved. The fitted CountVectorizer `cv`
# is needed to turn raw text into features, so this file alone is not enough
# to classify text outside this notebook.
joblib.dump(svm_model, model_folder + model_file_name)

['Gradiosvm_best_model.joblib']

In [None]:
# 2. Load the model
# joblib.load takes the path directly and manages the file itself; the previous
# open(...) call left a file handle that was never closed.
loaded_svm_model = joblib.load(model_folder + model_file_name)


def _clean_for_inference(text):
    """Apply to one input string the same cleaning the training text went through.

    Mirrors the preprocessing section of this notebook: remove punctuation,
    lowercase, drop stopwords, remove digits, drop words of two characters or
    fewer. Relies on `re` and `stop_words` defined in that section.
    """
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    text = ' '.join(word for word in text.split() if word not in stop_words)
    text = re.sub(r'\d+', '', text)
    return ' '.join(word for word in text.split() if len(word) > 2)


def make_prediction(input_text):
    """Classify a single message as ham or spam.

    Parameters
    ----------
    input_text : str
        The raw message to classify.

    Returns
    -------
    str
        "This message is a : Ham ✅" when the model predicts class 0,
        otherwise "This message is a : Spam 🚨".
    """
    # Clean the input exactly like the training data; otherwise the tokens the
    # vectoriser sees at inference differ from the ones it was fitted on
    # (e.g. "don't" would be split differently than the training-time "dont").
    cleaned_text = _clean_for_inference(input_text)

    features = cv.transform([cleaned_text])
    prediction = loaded_svm_model.predict(features)

    info = "Ham ✅" if prediction[0] == 0 else "Spam 🚨"
    return "This message is a : {}".format(info)


text_example = "Thank you for your message. You have won $1000000000 send your bank information asap!!!"

print(make_prediction(text_example))

This message is a : Spam 🚨


In [None]:
import gradio as gr

In [None]:
headline = "Text classification Spam-ham"

# Wrap make_prediction in a text-in / text-out Gradio interface.
iface = gr.Interface(
    fn=make_prediction,
    inputs="text",
    outputs="text",
    title=headline,
)
# share=True asks Gradio for a public URL in addition to the local one.
iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://96ce61340acfb9dfe9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [None]:
# NOTE(review): `predict_categories` is not defined in any cell visible in this
# notebook, so on a fresh kernel this cell would raise NameError. It looks like
# a leftover from another notebook — confirm where the function is supposed to
# come from, or remove the cell.
# This cell also rebinds `headline` and `iface` from the previous interface.
headline = "Text classification fields"

iface = gr.Interface(fn=predict_categories, inputs="text", outputs="text", title = headline)
iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
