In [1]:
#import data
import pandas as pd

#EDA
import seaborn as sns
import matplotlib.pyplot as plt


#clean data
from nltk.stem import SnowballStemmer
import string
import re
import nltk

#vectorization
from sklearn.feature_extraction.text import CountVectorizer


#model 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

#model accuracy
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import roc_curve, auc, roc_auc_score



# 1.) Import data

In [None]:
# Read the tab-separated file that the model is trained on.
DATA_PATH = 'SMSSpamCollection.txt'

data = pd.read_csv(DATA_PATH, sep='\t')
data.head()


# 2.) EDA

In [None]:
# Count how many messages carry each label and plot the class balance.
label_counts = data['label'].value_counts()

# Recent seaborn releases accept only `data` positionally, so the category
# and height vectors must be passed as x= / y= keywords.
fig, ax = plt.subplots()
sns.barplot(x=label_counts.index, y=label_counts.values, ax=ax)
ax.set(title='Messages per label', xlabel='label', ylabel='count')
plt.show()

#### Non-spam texts (0) outnumber spam texts (1) in the given dataset.
#### The dataset is imbalanced.

In [None]:
# Plot the distribution of words per message for spam and non-spam texts.
data['word_count'] = data['text'].apply(lambda msg: len(str(msg).split()))

# This cell runs before the cell that maps 'ham'/'spam' to 0/1, so on a fresh
# kernel the labels are still strings. Accept both encodings; comparing only
# against 0/1 would select no rows and draw two empty histograms.
labels = data['label']
is_spam = (labels == 1) | (labels == 'spam')
is_ham = (labels == 0) | (labels == 'ham')

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

ax1.hist(data.loc[is_spam, 'word_count'], color='red')
ax1.set_title('SPAM text')
ax1.set_xlabel('words per text')
ax1.set_ylabel('number of texts')

ax2.hist(data.loc[is_ham, 'word_count'], color='green')
ax2.set_title('Non-SPAM texts')
ax2.set_xlabel('words per text')

fig.suptitle('Words per texts')
plt.show()

#### Non-spam texts (0) are longer than spam texts (1).

# 3.) Data Cleaning

In [None]:
# Encode the label column as binary: ham -> 0, spam -> 1.
# Already-encoded values (0/1) map to themselves, so re-running this cell
# does not turn every label into NaN.
label_codes = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
encoded_labels = data['label'].map(label_codes)

# Fail loudly on anything that is neither ham/spam nor 0/1 instead of
# silently carrying NaN labels into training.
if encoded_labels.isna().any():
    unknown_labels = data.loc[encoded_labels.isna(), 'label'].unique().tolist()
    raise ValueError('Unexpected label values: {}'.format(unknown_labels))

data['label'] = encoded_labels.astype(int)
data.head()


# Stopword set and stemmer are built on first use and then reused: loading
# them again for every message is needlessly slow.
_CLEANING_RESOURCES = {}


def cleaning(text):
    """Normalise one message for bag-of-words vectorisation.

    Steps: lowercase, strip ASCII punctuation, split into word tokens, drop
    English stopwords, stem with the Snowball stemmer, re-join with spaces.

    Parameters
    ----------
    text : str
        Raw message. Non-string values are converted with ``str()``.

    Returns
    -------
    str
        Space-separated stemmed tokens (empty string if nothing is left).

    Raises
    ------
    LookupError
        Raised by NLTK when the ``stopwords`` corpus is not installed
        (install it once with ``nltk.download('stopwords')``).
    """
    if not _CLEANING_RESOURCES:
        # Stopwords are looked up first; if the corpus is missing the cache
        # stays empty and the next call retries.
        _CLEANING_RESOURCES['stopwords'] = frozenset(
            nltk.corpus.stopwords.words('english')
        )
        _CLEANING_RESOURCES['stemmer'] = SnowballStemmer('english')
    stopwords = _CLEANING_RESOURCES['stopwords']
    stemmer = _CLEANING_RESOURCES['stemmer']

    text = str(text).lower()
    text = "".join(ch for ch in text if ch not in string.punctuation)

    # r'\W+' splits on runs of non-word characters. The pattern 'W+' matches
    # a literal capital W, which never occurs in lowercased text, so the
    # message would stay one single token. Empty tokens from leading or
    # trailing separators are dropped.
    tokens = [tok for tok in re.split(r'\W+', text) if tok]

    # The stopword list holds unstemmed forms, so filter before stemming;
    # filter once more afterwards to also drop words whose stem is a stopword.
    kept = []
    for tok in tokens:
        if tok in stopwords:
            continue
        stem = stemmer.stem(tok)
        if stem not in stopwords:
            kept.append(stem)
    return " ".join(kept)
    


# Replace each message in the text column with its cleaned form.
data['text'] = data['text'].apply(cleaning)
data.head()


# 4.) Word to Number

In [None]:
# Features are the cleaned messages, targets are the labels.
X = data['text'].values
y = data['label'].values

# Hold out 20% of the messages for evaluation. The classes are imbalanced,
# so stratify to keep the same spam/non-spam ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# A single vectorizer, fitted on the training split only so that no
# vocabulary leaks in from the test split. (A throwaway vectorizer fitted on
# the full dataset with min_df=0 used to precede this; it was discarded
# unused, built a dense matrix for nothing, and min_df=0 is rejected by
# recent scikit-learn releases.)
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# 5.) Model

In [None]:
# Random forest: fit on the training split, then score the held-out split.
classifier = RandomForestClassifier(n_estimators=1000, random_state=42).fit(X_train, y_train)

# Hard class predictions plus the probability in column 1 of predict_proba
# (the second class in classifier.classes_).
y_predict = classifier.predict(X_test)
y_prob = classifier.predict_proba(X_test)[:, 1]

# Per-class precision/recall/F1 followed by the raw confusion matrix.
print(classification_report(y_test, y_predict))
print('Confusion Matrix:', confusion_matrix(y_test, y_predict))

# ROC curve from the predicted probabilities and the area under it.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

# 6.) New String

In [None]:
# Message to classify.
testing = "##type your SMS HERE"

# Apply the same pre-processing used for the training data. The vectorizer
# expects an iterable of documents, so the cleaned message goes in a list.
# A separate name keeps `testing` a plain string if the lines below are re-run.
testing_clean = [cleaning(testing)]

# Convert the words to counts with the vectorizer fitted on the training split
# (a CountVectorizer, i.e. raw term counts rather than tf-idf).
X_vector = vectorizer.transform(testing_clean)

# Predict with the trained model. Dedicated names are used so the test-set
# y_predict / y_prob from the evaluation cell are not overwritten.
sms_predict = classifier.predict(X_vector)
sms_prob = classifier.predict_proba(X_vector)[:, 1]

print(sms_predict)
print(sms_prob)