# LIBRARIES

In [1]:
import numpy as np
from keras.models import Sequential
from keras import layers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
import pandas as pd

Using TensorFlow backend.


# READ CSV

In [29]:
# Drop NaN because they are in the target variable.
# We have to predict unlabeled so I saved them in another df

df=pd.read_csv("sms.csv",encoding='cp1252').drop("No",axis=1)
unlabeled=df[['RESULT','SMS']].fillna('unlabeled')
unlabeled=unlabeled[unlabeled['RESULT']=='unlabeled']
df=df.dropna().reset_index(drop=True)

df

Unnamed: 0,RESULT,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [3]:
print('classes: '+str(df['RESULT'].unique()))
print('%ham: '+str(len(df[df['RESULT']=='ham'])/len(df)))
print('%spam: '+str(len(df[df['RESULT']=='spam'])/len(df)))

# Dataset is unbalanced

classes: ['ham' 'spam']
%ham: 0.8664259927797834
%spam: 0.13357400722021662


# TEXT PREPROCESSING

In [4]:
# This is a class for cleaning and vectorizing textual data. I have helped 
# the owner to insert new things in my last job.

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
#import spacy


class text_preprocessing():
    """ This class the preprocessing of a documents (text field) that are contained into a pandas DataFrame.
        It's possible to perform the following step:
            1. Text standardization with data cleaning (removal of special characters, numbers, link etc.).
            2. Removal of stopwords.
            3. Lemmatization.

    Args:
    -----
        lemmatization (boolean) >>> Set it to True if you want to perform the lemmatization step.
                                    Defalt is equal to False.
        standardize (boolean)   >>> Set it to True if you want to perform the standardization step.
                                    Defalt is equal to True.
        chr_to_remove (list)    >>> List of strings (regex) that represent the vector of special char or string that
                                    you want to remove.
        chr_to_keep (regex)    >>> regex that represent the char that you want to keep. By default this method return 
                                   only the letters of english alphabet. If you want to keep another special character 
                                   you can specify this by setting this field. 
        language  (string)      >>> Optional. Language of the text that you want to analyze.
                                    Default is 'en' (English).
    """
     
    def __init__(self, lemmatization = False, 
                standardize = True,
                stopwords = True,
                chr_to_remove = [r"http\S+", r"http", r"@\S+", r"@", r""],
                chr_to_keep = r"[^A-Za-z]",
                language = 'en'):
        
        self.lemmatization = lemmatization
        self.stopwords = stopwords
        self.standardize = standardize
        self.chr_to_remove = chr_to_remove
        self.chr_to_keep = chr_to_keep
        self.language = language

    def is_null(self, text):
        return text.isspace()

    def standardize_text(self, df, text_field):

        for regexp in self.chr_to_remove:
            df[text_field] = df[text_field].str.replace(regexp, "")

        df[text_field] = df[text_field].str.replace(self.chr_to_keep, " ") #we not consider numbers 
        df[text_field] = df[text_field].str.lower()
        return df

    def remove_stopwords(self, text, stopwords):
        clearlist = [word for word in text if word not in stopwords]
        return clearlist
    '''
    def lemmatizer(self, text):
        if self.language == 'it':
            nlp = spacy.load('it_core_news_sm')
        else:
            nlp = spacy.load(self.language)
        sent = []
        doc = nlp(" ".join(text))
        for word in doc:
            sent.append(word.lemma_)
        return sent
    '''
    def fit(self, data_df, field):
        """
        Args:
        -----
            data_df (pandas.DataFrame) >>> dataframe that contains the documents and the text field to process.
            field (string) >>> name of the field (column) that contain the text to process.

        Returns:
        --------
            pandas.DataFrame that are the copy of the original dataframe plus a column that contain the clean tokens
            ("tokens") and (if computed) another field with the lemma of these tokens ("lemma").
        """

        # Drop fields which contain only space char
        print("Data cleaning...")
        df = data_df[~data_df[field].apply(self.is_null)]

        # Standardization
        if self.standardize:
            print("Standardization...")
            df = self.standardize_text(df, field)

        # Token extraction
        tokenizer = RegexpTokenizer(r'\w+')
        print("Tokenization...")
        df["tokens"] = df[field].apply(tokenizer.tokenize)

        # Stopwords
        if self.stopwords:
            if self.language == 'en':
                stoplist = stopwords.words('english')
            elif self.language == 'it':
                stoplist = stopwords.words('italian')
            else:
                raise Exception("Invalid language")
            print("Removing stopwords...")
            df["tokens"] = df["tokens"].apply(self.remove_stopwords, stopwords=stoplist)

        # Lemmatization
        if self.lemmatization:
            print("Lemmatization...")
            df["lemma"] = df["tokens"].apply(self.lemmatizer)
        print("Finish")

        return df


class vectorize_data():

    """ This class compute the vectorization of a list of text tokens.
    Args:
    -----
        method (string) >>> The metric used to transform the feature data. The choices are "binary", "tf" or "tf-idf".
                            Default is "tf-idf".

    """

    def __init__(self, method='tf-idf'):
        self.method = method

    def fit(self, train_data, test_data):
        """
        Args:
        -----
            train_data (pandas.Series) >>> column of the training set dataframe that contains the tokens to process.
            test_data (pandas.Series) >>> column of the test set dataframe that contains the tokens to process.

        Returns:
        --------
            a tuple contains the two feature matrix (training data and test data).
        """

        if self.method == 'binary':
            count_vectorizer = TfidfVectorizer(binary=True, use_idf=False, norm=None, max_features=60000)
        elif self.method == 'tf':
            count_vectorizer = TfidfVectorizer(use_idf=False, max_features=60000)
        elif self.method == 'tf-idf':
            count_vectorizer = TfidfVectorizer(max_features=60000)
        else:
            raise Exception("Invalid method. Use: binary, tf or tf-idf")

        transformed_train_data = count_vectorizer.fit_transform(train_data.apply(str))
        transformed_test_data = count_vectorizer.transform(test_data.apply(str))

        return transformed_train_data, transformed_test_data, count_vectorizer


In [5]:
# Use class to clean and tokenize sentences 

var=text_preprocessing()
df=var.fit(df,'SMS')

# I removed stopwords because they aren't usefull for the model (for
# example "the", "in", "a"... there are many but they usually don't
# give usefull informations if sentence is spam or no spam.)

# Also I standardize sentences with remove special character, number
# character, traform word to lower cases.

# Tokenization is for trasform text to List.

Data cleaning...
Standardization...
Tokenization...
Removing stopwords...
Finish


In [6]:
# function for convert a list to words to a Sentence
def detokenizer(x): return " ".join(x)

df['SMS_R']=df['tokens'].apply(detokenizer) 

# For Neural Network I need a numeric target variable

def class_to_number(x):
    if x=='ham':
        return 0
    else:
        return 1
    
df['RESULT']=df['RESULT'].apply(class_to_number)


df



Unnamed: 0,RESULT,SMS,tokens,SMS_R
0,0,go until jurong point crazy available only ...,"[go, jurong, point, crazy, available, bugis, n...",go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]",ok lar joking wif u oni
2,1,free entry in a wkly comp to win fa cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,...",free entry wkly comp win fa cup final tkts st ...
3,0,u dun say so early hor u c already then say,"[u, dun, say, early, hor, u, c, already, say]",u dun say early hor u c already say
4,0,nah i don t think he goes to usf he lives aro...,"[nah, think, goes, usf, lives, around, though]",nah think goes usf lives around though
5,1,freemsg hey there darling it s been week s n...,"[freemsg, hey, darling, week, word, back, like...",freemsg hey darling week word back like fun st...
6,0,even my brother is not like to speak with me ...,"[even, brother, like, speak, treat, like, aids...",even brother like speak treat like aids patent
7,0,as per your request melle melle oru minnamin...,"[per, request, melle, melle, oru, minnaminungi...",per request melle melle oru minnaminunginte nu...
8,1,winner as a valued network customer you have...,"[winner, valued, network, customer, selected, ...",winner valued network customer selected receiv...
9,1,had your mobile months or more u r entitle...,"[mobile, months, u, r, entitled, update, lates...",mobile months u r entitled update latest colou...


# DEEP LEARNING

In [7]:

# take sentences and target variable
sentences = df['SMS_R'].values
y = df['RESULT'].values


# I used a CountVectorizer that trasform for each word in sentence "s"
# to couple (word, #word_in_s). These vectorizer is necessary because
# for neural network we need a numberic input. There are other
# techniques for this, like term frequeny - Inverse document Frequency
# that is better because consider firstly relative frequency instead
# of absolute frequency and after put a weight inverse proportional
# of frequency of a word.

vectorizer = CountVectorizer()
vectorizer.fit(sentences)

accuracy_list=[]        #cross validation
for k in range(0,10):
    print(k)
    
    # split dataset of sentences in train and test
    
    sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences, y, test_size=0.25, random_state=1000) 
    X_train = vectorizer.transform(sentences_train)
    X_test  = vectorizer.transform(sentences_test)




    
    # Use a simple Deep Learning Architecture with a "relu" activation
    # function in first layer and a activation "sigmoid" for last.
    # I used DropOut for evade overfitting.
    
    input_dim = X_train.shape[1]  # Number of features
    model = Sequential()
    model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
    model.add(layers.Dropout(0.60))
    model.add(layers.Dense(1, activation='sigmoid'))

    #optimizer adam and a loss function for a classification proble
    
    model.compile(loss='binary_crossentropy', 
                  optimizer='adam', 
                   metrics=['accuracy'])
    model.summary()

    #100 epoches
    
    history = model.fit(X_train, y_train,
                        epochs=100,
                         verbose=False,
                         validation_data=(X_test, y_test),
                         batch_size=10)

    loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
    print("Training Accuracy: {:.4f}".format(accuracy))
    loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
    print("Testing Accuracy:  {:.4f}".format(accuracy))
    accuracy_list.append(accuracy)
    

0
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 10)                75650     
_________________________________________________________________
dropout_1 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 75,661
Trainable params: 75,661
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.
Training Accuracy: 0.9995
Testing Accuracy:  0.9841
1
_________________________________________________________________
Layer (type)                 Output Shape    

Training Accuracy: 0.9995
Testing Accuracy:  0.9834


In [8]:
# cross validation accuracy

print("cv_accuracy: "+str(np.array(accuracy_list).mean())+" +- "+str(np.array(accuracy_list).std()))


cv_accuracy: 0.9826714801874401 +- 0.0014440433212996263


# SAVE CSV

In [30]:
var=text_preprocessing()
unlabeled=var.fit(unlabeled,'SMS')
unlabeled['SMS_R']=unlabeled['tokens'].apply(detokenizer)
sentences = unlabeled['SMS_R'].values
y = unlabeled['RESULT'].values
X= vectorizer.transform(sentences)

unlabeled['RESULT_pred']=model.predict(X).astype(int)


Data cleaning...
Standardization...
Tokenization...
Removing stopwords...
Finish


In [37]:
unlabeled=pd.DataFrame(unlabeled['RESULT_pred'])
def number_to_class(x):
    if x==0:
        return 'ham'
    else:
        return 'spam'
    
unlabeled['RESULT']=unlabeled['RESULT_pred'].apply(number_to_class)
unlabeled['RESULT'].to_csv('NLP.csv')





95      spam
124      ham
347      ham
567      ham
598     spam
733      ham
940      ham
1099     ham
1154     ham
1349     ham
1531     ham
1724     ham
1920     ham
1965     ham
2186     ham
2413    spam
2435     ham
2820     ham
3107     ham
3407     ham
3637     ham
3799     ham
3824     ham
3841     ham
4047    spam
4211     ham
4240     ham
4529     ham
5050     ham
5236     ham
5365    spam
5547    spam
Name: RESULT, dtype: object