In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### read the raw data

In [5]:
data = pd.read_csv('Amazon.csv')

# create a boolean indicator =True if the definition is of "machine learning"
# data['ml'] = data['helpful'] == TRUE

print(data.dtypes)
print(data.head(10))

Unnamed: 0                  int64
Id                          int64
ProductId                  object
UserId                     object
ProfileName                object
HelpfulnessNumerator        int64
HelpfulnessDenominator      int64
Score                       int64
Time                        int64
Summary                    object
Text                       object
helpScore                 float64
helpful                      bool
dtype: object
   Unnamed: 0      Id   ProductId          UserId       ProfileName  \
0      138806  138807  B000E63LME  A1CQGW1AOD0LF2  Alena K. "Alena"   
1      469680  469681  B004ZIH4KM  A37S7U1OX2MCWI        Becky Cole   
2      238202  238203  B003ZXE9QA  A2OM6G73E64EQ9              jeff   
3      485307  485308  B001RVFERK  A25W349EE97NBK          Tangent4   
4      375283  375284  B000OQZNTS  A3CPPW0HUC07YS       Amy Nicolai   
5      530491  530492  B001E5DX90  A26XS571YR9XPF          Briana B   
6      355114  355115  B000IGAE66   A1WFG6OC3PP

### <span style="color:red">create feature set X (matrix) and vector of labels L</span>

Use [feature extraction methods in scikit-learn](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction) to *vectorize* the text of machine learning/AI definitions into an X matrix. 

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

L = data["helpful"] # labels
corpus = data['Text'] # corpus of reviews in words
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# print(X.shape)

(455000, 109302)


### let's look at the words in the corpus of "Text"

In [8]:
print(len(vectorizer.vocabulary_))
print("* * * * *")
print(vectorizer.vocabulary_)

109302
* * * * *


### <span style="color:red">fit X, L to SVM using gradient descent</span>

[gradient descent documentation](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html)

In [9]:
# fit SVM linear classifier
from sklearn import linear_model
sgd = linear_model.SGDClassifier()
sgd.fit(X, L)



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

### assess performance

In [11]:
# look at performance measures
import my_measures

performance_measures = my_measures.BinaryClassificationPerformance(sgd.predict(X), L, 'sgd')
performance_measures.compute_measures()
print(performance_measures.performance_measures)

{'Pos': 33235, 'Neg': 421765, 'TP': 5159, 'TN': 413998, 'FP': 7767, 'FN': 28076, 'Accuracy': 0.9212241758241758, 'Precision': 0.39911805663004796, 'Recall': 0.15522792237099442, 'desc': 'sgd'}


### test model on unseen definitions

In [17]:
# machine learning definitions
# Wikipedia, Expert System, Tech Emergence
ml_defs = ["I love this product!", "I hate this stuff!", "This is the best thing I've ever bought."]

for d in ml_defs:
    print(d)
    print("* * *")

I love this product!
* * *
I hate this stuff!
* * *
This is the best thing I've ever bought.
* * *


In [13]:
# AI definitions
# Wikipedia, Oxford dictionary
ai_defs = ["Artificial intelligence (AI, also machine intelligence, MI) is intelligence demonstrated by machines, in contrast to the natural intelligence (NI) displayed by humans and other animals.",
          "the theory and development of computer systems able to perform tasks that normally require human intelligence, such as visual perception, speech recognition, decision-making, and translation between languages"]

for d in ai_defs:
    print(d)
    print("* * *")

Artificial intelligence (AI, also machine intelligence, MI) is intelligence demonstrated by machines, in contrast to the natural intelligence (NI) displayed by humans and other animals.
* * *
the theory and development of computer systems able to perform tasks that normally require human intelligence, such as visual perception, speech recognition, decision-making, and translation between languages
* * *


In [14]:
# Definitions of unrelated things: kitten, piano, widget
other_defs = ["A kitten, also known as a kitty or kitty cat, is a juvenile cat.",
             "The piano is an acoustic, stringed musical instrument invented in Italy by Bartolomeo Cristofori around the year 1700 in which the strings are struck by hammers.",
             "a small gadget or mechanical device, especially one whose name is unknown or unspecified"]

for d in other_defs:
    print(d)
    print("* * *")

A kitten, also known as a kitty or kitty cat, is a juvenile cat.
* * *
The piano is an acoustic, stringed musical instrument invented in Italy by Bartolomeo Cristofori around the year 1700 in which the strings are struck by hammers.
* * *
a small gadget or mechanical device, especially one whose name is unknown or unspecified
* * *


### function to transform new definitions to a X vector

In [18]:
def get_prediction(definition):
    text_x = vectorizer.transform([definition]).toarray()
    return(sgd.predict(text_x))

### view predicted classifications of new definitions

In [20]:
print("Model predictions for 'machine learning' definitions:")
for mld in ml_defs:
    print(get_prediction(mld))
    
print("* * *")
print("Model predictions for 'AI' definitions:")
for aid in ai_defs:
    print(get_prediction(aid))

print("* * *")
print("Model predictions for other definitions (kitten, piano, widget):")
for otherd in other_defs:
    print(get_prediction(otherd))

Model predictions for 'machine learning' definitions:
[False]
[False]
[False]
* * *
Model predictions for 'AI' definitions:
[False]
[False]
* * *
Model predictions for other definitions (kitten, piano, widget):
[False]
[False]
[False]
