In [7]:
import pandas as pd
import csv
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import sklearn
from sklearn.model_selection import train_test_split 
from scipy import sparse

In [8]:
# Load the tab-separated corpus; the names= list supplies the two column labels.
# QUOTE_NONE makes the parser treat quote characters as ordinary text.
messages = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    names=["label", "message"],
)

In [9]:
def split_into_lemmas(message):
    """Tokenize a message into lowercase lemmas.

    Parameters
    ----------
    message : str or bytes
        Raw message text. ``bytes`` input is decoded as UTF-8.

    Returns
    -------
    list
        The ``lemma`` of every entry in ``TextBlob(text).words``, in order,
        where ``text`` is the lower-cased message.
    """
    # The previous ``str.format(message, 'utf-8')`` used the message itself as
    # a format template, so any text containing ``{`` or ``}`` either raised
    # or was silently rewritten. Decode only when real bytes are supplied and
    # otherwise leave the text untouched.
    if isinstance(message, bytes):
        message = message.decode('utf-8')
    words = TextBlob(message.lower()).words
    return [word.lemma for word in words]

In [None]:
# Build a vectorizer that uses the lemma tokenizer as its analyzer, then learn
# the vocabulary from every message in the corpus.
bow_transformer = CountVectorizer(analyzer=split_into_lemmas)
bow_transformer = bow_transformer.fit(messages['message'])

In [11]:
# Bag-of-words counts for every message.
messages_bow = bow_transformer.transform(messages['message'])

# Fit the TF-IDF weighting on those counts and apply it to the same matrix.
# NOTE(review): the IDF statistics are learned from the full corpus; if the
# rows are later partitioned into train/held-out sets, the held-out rows have
# already influenced these weights -- confirm this is intended.
tfidf_transformer = TfidfTransformer()
messages_tfidf = tfidf_transformer.fit_transform(messages_bow)

In [12]:
# Hold out 20% of the TF-IDF rows (and their labels) as the test set.
# random_state pins the shuffle so Restart & Run All reproduces the same
# partition instead of a different one on every run.
messages_train, messages_test, y_train, y_test = train_test_split(
    messages_tfidf,
    messages['label'],
    test_size=0.2,
    random_state=42,
)

In [13]:
# Carve a validation set (20%) out of the training rows.
# random_state pins the shuffle so the train/validation partition is the same
# on every run.
message_train, messages_valid, classify_train, classify_valid = train_test_split(
    messages_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Densify the sparse training features into a DataFrame.
train_dataset = pd.DataFrame(message_train.todense())
# Discard the labels' original row index so they line up positionally with the
# freshly numbered feature rows when concatenated column-wise.
classify_df = classify_train.to_frame().reset_index(drop=True)
train_data = pd.concat([train_dataset, classify_df], axis=1)

In [15]:
# Densify the sparse test features into a DataFrame.
test_dataset = pd.DataFrame(messages_test.todense())
# Discard the labels' original row index so they line up positionally with the
# freshly numbered feature rows when concatenated column-wise.
test_classify = y_test.to_frame().reset_index(drop=True)
test_data = pd.concat([test_dataset, test_classify], axis=1)

In [16]:
# Densify the sparse validation features into a DataFrame.
valid_dataset = pd.DataFrame(messages_valid.todense())
# Discard the labels' original row index so they line up positionally with the
# freshly numbered feature rows when concatenated column-wise.
valid_classify = classify_valid.to_frame().reset_index(drop=True)
valid_data = pd.concat([valid_dataset, valid_classify], axis=1)

In [17]:
# Write each split to a CSV file in the current working directory.
for frame, path in (
    (train_data, 'train.csv'),
    (test_data, 'test.csv'),
    (valid_data, 'validation.csv'),
):
    frame.to_csv(path)