# Machine Learning Intro

In [1]:
import pandas as pd

In [2]:
filepath_dict = {'yelp': 'yelp_labelled.txt',
                'amazon': 'amazon_cells_labelled.txt',
                'imdb': 'imdb_labelled.txt'}

In [3]:
filepath_dict['yelp']

'yelp_labelled.txt'

In [4]:
df_list = []

for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source
    df_list.append(df)

In [5]:
df_list

[                                              sentence  label source
 0                             Wow... Loved this place.      1   yelp
 1                                   Crust is not good.      0   yelp
 2            Not tasty and the texture was just nasty.      0   yelp
 3    Stopped by during the late May bank holiday of...      1   yelp
 4    The selection on the menu was great and so wer...      1   yelp
 ..                                                 ...    ...    ...
 995  I think food should have flavor and texture an...      0   yelp
 996                           Appetite instantly gone.      0   yelp
 997  Overall I was not impressed and would not go b...      0   yelp
 998  The whole experience was underwhelming, and I ...      0   yelp
 999  Then, as if I hadn't wasted enough of my life ...      0   yelp
 
 [1000 rows x 3 columns],
                                               sentence  label  source
 0    So there is no way for me to plug it in here i...      

In [6]:
df = pd.concat(df_list)

In [7]:
df

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb


## yelp Data

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
df_yelp = df[ df['source']=='yelp']

In [10]:
df_yelp.head(5)

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


In [11]:
X = df_yelp['sentence'].values

In [12]:
Y = df_yelp['label'].values

In [13]:
X[:5]

array(['Wow... Loved this place.', 'Crust is not good.',
       'Not tasty and the texture was just nasty.',
       'Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.',
       'The selection on the menu was great and so were the prices.'],
      dtype=object)

In [14]:
Y[:5]

array([1, 0, 0, 1, 1])

In [15]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=100)

In [16]:
x_train[:10]

array(['We will not be coming back.',
       'We waited for forty five minutes in vain.',
       "I could barely stomach the meal, but didn't complain because it was a business lunch.",
       "I'd rather eat airline food, seriously.",
       "Needless to say, I won't be going back anytime soon.",
       'For that price I can think of a few place I would have much rather gone.',
       'To my disbelief, each dish qualified as the worst version of these foods I have ever tasted.',
       "I promise they won't disappoint.",
       'The decor is nice, and the piano music soundtrack is pleasant.',
       'Will go back next trip out.'], dtype=object)

### Shape the data in an acceptable shape by Regression Model

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
vectorizer = CountVectorizer()

In [19]:
vectorizer.fit(x_train)

CountVectorizer()

In [20]:
transformed_x_train = vectorizer.transform(x_train)

In [21]:
transformed_x_test = vectorizer.transform(x_test)

In [22]:
transformed_x_train

<750x1724 sparse matrix of type '<class 'numpy.int64'>'
	with 7422 stored elements in Compressed Sparse Row format>

### Train/classify the data using Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
# our model
classifier = LogisticRegression()

In [25]:
classifier.fit(transformed_x_train, y_train)

LogisticRegression()

In [26]:
score = classifier.score(transformed_x_test, y_test)

In [27]:
score

0.808

## Amazon data

In [28]:
df_amazon = df[ df['source']=='amazon']

In [29]:
df_amazon

Unnamed: 0,sentence,label,source
0,So there is no way for me to plug it in here i...,0,amazon
1,"Good case, Excellent value.",1,amazon
2,Great for the jawbone.,1,amazon
3,Tied to charger for conversations lasting more...,0,amazon
4,The mic is great.,1,amazon
...,...,...,...
995,The screen does get smudged easily because it ...,0,amazon
996,What a piece of junk.. I lose more calls on th...,0,amazon
997,Item Does Not Match Picture.,0,amazon
998,The only thing that disappoint me is the infra...,0,amazon


In [30]:
df_amazon.head(5)

Unnamed: 0,sentence,label,source
0,So there is no way for me to plug it in here i...,0,amazon
1,"Good case, Excellent value.",1,amazon
2,Great for the jawbone.,1,amazon
3,Tied to charger for conversations lasting more...,0,amazon
4,The mic is great.,1,amazon


In [31]:
X_amazon = df_amazon['sentence'].values

In [32]:
Y_amazon = df_amazon['label'].values

In [33]:
X_amazon[:5]

array(['So there is no way for me to plug it in here in the US unless I go by a converter.',
       'Good case, Excellent value.', 'Great for the jawbone.',
       'Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!',
       'The mic is great.'], dtype=object)

In [34]:
Y_amazon[:5]

array([0, 1, 1, 0, 1])

In [35]:
x_train_amazon, x_test_amazon, y_train_amazon, y_test_amazon = train_test_split(X_amazon, Y_amazon, test_size=0.25, random_state=100)

In [36]:
x_train_amazon[:10]

array(['They work about 2 weeks then break.',
       'I had absolutely no problem with this headset linking to my 8530 Blackberry Curve!',
       'Motorola finally got the voice quality of a bluetooth headset right.',
       'This battery is an excellent bargain!',
       'much better than the hard plastic cases.', 'Good case!.',
       'Not good enough for the price.',
       'A must study for anyone interested in the "worst sins" of industrial design.',
       'i would advise to not purchase this item it never worked very well.',
       'The camera, although rated at an impressive 1.3 megapixels, renders images that fall well below expectations of such a relatively high resolution.'],
      dtype=object)

### Shape the data in an acceptable shape by Regression Model

In [37]:
vectorizer.fit(x_train_amazon)

CountVectorizer()

In [38]:
transformed_x_train_amazon = vectorizer.transform(x_train_amazon)

In [39]:
transformed_x_test_amazon = vectorizer.transform(x_test_amazon)

In [40]:
transformed_x_train_amazon

<750x1551 sparse matrix of type '<class 'numpy.int64'>'
	with 6870 stored elements in Compressed Sparse Row format>

In [41]:
transformed_x_test_amazon

<250x1551 sparse matrix of type '<class 'numpy.int64'>'
	with 1945 stored elements in Compressed Sparse Row format>

### Train/classify the data using Logistic Regression

In [42]:
classifier.fit(transformed_x_train_amazon, y_train_amazon)

LogisticRegression()

In [43]:
score_amazon = classifier.score(transformed_x_test_amazon, y_test_amazon)

In [44]:
score_amazon

0.82

## imdb Data

In [45]:
df_imdb = df[ df['source']=='imdb']

In [46]:
df_imdb

Unnamed: 0,sentence,label,source
0,"A very, very, very slow-moving, aimless movie ...",0,imdb
1,Not sure who was more lost - the flat characte...,0,imdb
2,Attempting artiness with black & white and cle...,0,imdb
3,Very little music or anything to speak of.,0,imdb
4,The best scene in the movie was when Gerardo i...,1,imdb
...,...,...,...
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb


In [47]:
df_imdb.head(5)

Unnamed: 0,sentence,label,source
0,"A very, very, very slow-moving, aimless movie ...",0,imdb
1,Not sure who was more lost - the flat characte...,0,imdb
2,Attempting artiness with black & white and cle...,0,imdb
3,Very little music or anything to speak of.,0,imdb
4,The best scene in the movie was when Gerardo i...,1,imdb


In [48]:
X_imdb = df_imdb['sentence'].values

In [49]:
Y_imdb = df_imdb['label'].values

In [50]:
X_imdb[:5]

array(['A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  ',
       'Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  ',
       'Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  ',
       'Very little music or anything to speak of.  ',
       'The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.  '],
      dtype=object)

In [51]:
Y_imdb[:5]

array([0, 0, 0, 0, 1])

In [52]:
x_train_imdb, x_test_imdb, y_train_imdb, y_test_imdb = train_test_split(X_imdb, Y_imdb, test_size=0.25, random_state=100)

In [53]:
x_train_imdb[:10]

array(['The results, well, are a shame.  ',
       'Not much dialogue, not much music, the whole film was shot as elaborately and aesthetically like a sculpture.  ',
       'She carries the movie well.  ',
       'Now we were chosen to be tortured with this disgusting piece of blatant American propaganda.  ',
       'THERE IS NO PLOT OR STORYLINE!!  ',
       'There is, however, some pretty good acting (at least, for this type of film).  ',
       "The movie is not completely perfect but 'Titta Di Girolamo' will stay with you for a long time after the vision of the movie.  ",
       'It just blew.  ', 'The movie seemed a little slow at first.  ',
       "It's a feel-good film and that's how I felt when I came out of the cinema!  "],
      dtype=object)

### Shape the data in an acceptable shape by Regression Model

In [54]:
vectorizer.fit(x_train_imdb)

CountVectorizer()

In [55]:
transformed_x_train_imdb = vectorizer.transform(x_train_imdb)

In [56]:
transformed_x_test_imdb = vectorizer.transform(x_test_imdb)

In [57]:
transformed_x_train_imdb

<561x2517 sparse matrix of type '<class 'numpy.int64'>'
	with 8495 stored elements in Compressed Sparse Row format>

In [58]:
transformed_x_test_imdb

<187x2517 sparse matrix of type '<class 'numpy.int64'>'
	with 2310 stored elements in Compressed Sparse Row format>

### Train/classify the data using Logistic Regression

In [59]:
classifier.fit(transformed_x_train_imdb, y_train_imdb)

LogisticRegression()

In [60]:
score_imdb = classifier.score(transformed_x_test_imdb, y_test_imdb)

In [61]:
score_imdb

0.7379679144385026