In [1]:
import pandas as pd 
import util
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

#### So first, let's get the data into an easy-to-work-with format

In [2]:
# Load the full review dataset through the local `util` helper. Later cells read the
# 'source', 'sentence' and 'label' columns from this frame.
df = util.get_full_data_frame()

In [4]:
# Report the shape of the slice of the frame that belongs to each review source.
for source in df['source'].unique():
    df_source = df.loc[df['source'] == source]
    print('shape', source, ' data: ', df_source.shape)

shape yelp  data:  (1000, 3)
shape amazon  data:  (1000, 3)
shape imdb  data:  (748, 3)


#### So we've got the dataset in a data frame. Great. Now to decide on the method. In this example we are going to use a bag-of-words (BOW) approach (not the best, but a good fundamental). To do this we can use scikit-learn's CountVectorizer, which creates a vocabulary that we will use to build feature vectors.

In [40]:
# Toy example: fit a vocabulary on two short sentences to see what CountVectorizer learns.
sentences = ['John likes ice cream','John hates ice cream']
# min_df=1 keeps every term that occurs in at least one sentence. An integer min_df of 0 is
# rejected by parameter validation in recent scikit-learn releases, and 1 yields the same vocabulary.
# lowercase=False preserves the original casing, so 'John' stays capitalised.
vectorizer = CountVectorizer(min_df=1,lowercase=False)
vectorizer.fit(sentences)
# vocabulary_ maps each token to its column index in the feature vectors.
print(vectorizer.vocabulary_)

{'John': 0, 'likes': 4, 'ice': 3, 'cream': 1, 'hates': 2}


#### Now that we have a vocabulary, we can use transform to get feature vectors to use in training. The resulting vector has one position per vocabulary word, with each index holding that word's frequency in the given sentence.

In [41]:
# Show the count vector for the first sentence only. Position i holds the count of the word whose
# vocabulary index is i (e.g. index 2, 'hates', is 0 for 'John likes ice cream').
vectorizer.transform(sentences).toarray()[0]

array([1, 1, 0, 1, 1])

#### Now it's time to split our set for training

In [42]:
# Work with the Yelp reviews only: sentences are the inputs, labels are the targets.
df_yelp = df.loc[df['source'] == 'yelp']
sentences = df_yelp['sentence'].values
y = df_yelp['label'].values

# Hold out a quarter of the rows for testing; the fixed random_state keeps the split reproducible.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y,
    test_size=0.25,
    random_state=1000,
)

# Report the size of each partition.
for report_template, subset in (
    ('Training set: {} ', sentences_train),
    ('Testing set: {} ', sentences_test),
):
    print(report_template.format(len(subset)))

Training set: 750 
Testing set: 250 


#### Time to vectorize the inputs

In [43]:
# Build the vocabulary from the training sentences only and, in the same pass, turn those
# sentences into count vectors (fit_transform is equivalent to fit followed by transform).
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(sentences_train)

# The test sentences are encoded with that same training vocabulary.
# NOTE: scikit-learn takes care of a lot of preprocessing for us by tokenizing the sentences
# (removing punctuation, special characters, ...). How we tokenize is one of our HYPERPARAMETERS.
X_test = vectorizer.transform(sentences_test)

# For these reviews the vocabulary is 1714 words, which means every feature vector is that
# long as well (not great).
X_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

In [44]:
# Logistic regression is our simple baseline classifier: fit it on the training vectors,
# then measure accuracy on the held-out test vectors.
classifier = LogisticRegression().fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print('Accuracy: ', score)

Accuracy:  0.796




In [47]:
# The held-out Yelp accuracy was about 79%: not terrible, but not amazing.
# Now let's see how the same model matches up against the other data sets.
# The Yelp figure printed here covers all Yelp rows, including the ones used for training.
for source in df['source'].unique():
    df_source = df.loc[df['source'] == source]
    sentences, y = df_source['sentence'].values, df_source['label'].values

    # Encode with the vectorizer fitted earlier rather than fitting a new vocabulary per source.
    X = vectorizer.transform(sentences)
    score = classifier.score(X, y)
    print('Accuracy for ',source,' data: ', score)

Accuracy for  yelp  data:  0.938
Accuracy for  amazon  data:  0.736
Accuracy for  imdb  data:  0.660427807486631


#### Okay, so it makes sense to get 94% on the set where 75% of the data was used for training, but as you can see the model doesn't do so hot on the other two sets. However, this sets the baseline to beat.