In [19]:
documents = {
    'news': 'Codeup announced last thursday that they just launched a new data science program. It is 18 weeks long.',
    'description': 'Codeup\'s data science program teaches hands on skills using Python and pandas.',
    'context': 'Codeup\'s data science program was created in response to a percieved lack of data science talent, and growing demand.'
}

In [16]:
import pandas as pd


tf = pd.Series(documents['context'].split()).value_counts(normalize=True)
tf

data         0.105263
science      0.105263
in           0.052632
and          0.052632
was          0.052632
to           0.052632
talent,      0.052632
a            0.052632
created      0.052632
percieved    0.052632
growing      0.052632
Codeup's     0.052632
of           0.052632
program      0.052632
response     0.052632
demand.      0.052632
lack         0.052632
dtype: float64

In [45]:
from math import log
import re

entire_text = ' '.join(list(documents.values()))

words = re.sub(r'[^\w\s]', '', entire_text.lower()).split()

unique_words = list(set(words))

for word in unique_words:
    n_documents_this_word_appears_in = 0
    for doc in documents.values():
        if word in doc.lower():
            n_documents_this_word_appears_in += 1
    document_frequency = n_documents_this_word_appears_in
    idf = log(len(documents) / (document_frequency + .1))
    print(word, idf)

hands 1.0033021088637848
lack 1.0033021088637848
that 1.0033021088637848
data -0.032789822822990956
new 1.0033021088637848
was 1.0033021088637848
announced 1.0033021088637848
long 1.0033021088637848
just 1.0033021088637848
and 0.3566749439387324
is 1.0033021088637848
last 1.0033021088637848
on -0.032789822822990956
percieved 1.0033021088637848
growing 1.0033021088637848
talent 1.0033021088637848
18 1.0033021088637848
to 1.0033021088637848
it 1.0033021088637848
a -0.032789822822990956
demand 1.0033021088637848
teaches 1.0033021088637848
of 1.0033021088637848
skills 1.0033021088637848
response 1.0033021088637848
codeups 3.4011973816621555
they 1.0033021088637848
weeks 1.0033021088637848
codeup -0.032789822822990956
python 1.0033021088637848
launched 1.0033021088637848
thursday 1.0033021088637848
created 1.0033021088637848
science -0.032789822822990956
pandas 1.0033021088637848
using 1.0033021088637848
program -0.032789822822990956
in 0.3566749439387324


In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf.fit(documents.values())
sparse_matrix = tfidf.transform(documents.values())
tfidf_values = sparse_matrix.todense()
sparse_matrix

<3x36 sparse matrix of type '<class 'numpy.float64'>'
	with 45 stored elements in Compressed Sparse Row format>

In [88]:
sparse_matrix.todense()

matrix([[0.26356607, 0.        , 0.26356607, 0.15566636, 0.        ,
         0.15566636, 0.        , 0.        , 0.        , 0.        ,
         0.26356607, 0.26356607, 0.26356607, 0.        , 0.26356607,
         0.26356607, 0.26356607, 0.26356607, 0.        , 0.        ,
         0.        , 0.        , 0.15566636, 0.        , 0.        ,
         0.15566636, 0.        , 0.        , 0.        , 0.26356607,
         0.26356607, 0.26356607, 0.        , 0.        , 0.        ,
         0.26356607],
        [0.        , 0.25387968, 0.        , 0.19716022, 0.        ,
         0.19716022, 0.        , 0.        , 0.33382127, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.33382127,
         0.33382127, 0.        , 0.19716022, 0.33382127, 0.        ,
         0.19716022, 0.33382127, 0.        , 0.33382127, 0.        ,
         0.        , 0.        , 0.        , 0.33382127, 0.        ,
         0. 

In [59]:
pd.DataFrame(tfidf_values, columns=tfidf.get_feature_names())

Unnamed: 0,18,and,announced,codeup,created,data,demand,growing,hands,in,...,skills,talent,teaches,that,they,thursday,to,using,was,weeks
0,0.263566,0.0,0.263566,0.155666,0.0,0.155666,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.263566,0.263566,0.263566,0.0,0.0,0.0,0.263566
1,0.0,0.25388,0.0,0.19716,0.0,0.19716,0.0,0.0,0.333821,0.0,...,0.333821,0.0,0.333821,0.0,0.0,0.0,0.0,0.333821,0.0,0.0
2,0.0,0.195932,0.0,0.152159,0.257627,0.304317,0.257627,0.257627,0.0,0.257627,...,0.0,0.257627,0.0,0.0,0.0,0.0,0.257627,0.0,0.257627,0.0


In [67]:
from sklearn.model_selection import train_test_split

spam = pd.read_csv('./spam_clean.csv')
spam

X_train, X_test, y_train, y_test = train_test_split(spam.text, spam.label, stratify=spam.label, random_state=123)



In [96]:
from sklearn.linear_model import LogisticRegression

tfidf = TfidfVectorizer()
tfidf.fit(X_train)
train_tfidf_values = tfidf.transform(X_train)

model = LogisticRegression()
model.fit(train_tfidf_values, y_train)

predictions = model.predict(train_tfidf_values)



In [97]:
df = pd.DataFrame(dict(actual=y_train, predicted=predictions))

pd.crosstab(df.predicted, df.actual)

actual,ham,spam
predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,3618,110
spam,1,450


In [83]:
from sklearn.metrics import classification_report

print(classification_report(df.actual, df.predicted))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      3619
        spam       1.00      0.80      0.89       560

   micro avg       0.97      0.97      0.97      4179
   macro avg       0.98      0.90      0.94      4179
weighted avg       0.97      0.97      0.97      4179



In [101]:
test_tfidf_values = tfidf.transform(X_test)
test_predictions = model.predict(test_tfidf_values)

print(classification_report(y_test, test_predictions))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1206
        spam       0.98      0.85      0.91       187

   micro avg       0.98      0.98      0.98      1393
   macro avg       0.98      0.92      0.95      1393
weighted avg       0.98      0.98      0.98      1393



In [105]:
def predict(unknown_text):
    return model.predict(tfidf.transform([unknown_text]))[0]

In [109]:
predict('free cash prize')

'spam'