# Movie Reviews

In [2]:
import numpy as np
import pandas as pd

In [168]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC

## Dataset

In [180]:
# Toy corpus: eight short movie reviews.
sentences = [
    "this was a good movie",
    "this was boring",
    "a good action movie",
    "I like this movie",
    "boring movie and bad acting",
    "bad boring movie",
    "good acting",
    "the worst movie I have ever seen",
]
# Keep the individual s1..s8 names bound as well.
s1, s2, s3, s4, s5, s6, s7, s8 = sentences

# Sentiment labels: 0 means bad and 1 means good.
# There are only five labels for eight sentences; the model cells further
# down train on the first len(labels) sentences and predict the others.
labels = [1, 0, 1, 1, 0]
l1, l2, l3, l4, l5 = labels

## Vectorize

In [178]:
def getCountVec(S):
    """Vectorize text documents into a dense bag-of-words count matrix.

    Fits a fresh ``CountVectorizer`` on ``S``, prints the first 25 rows of
    the result as a DataFrame (one column per vocabulary term) and returns
    the underlying array.

    Parameters
    ----------
    S : iterable of str
        Raw text documents to vectorize.

    Returns
    -------
    numpy.ndarray
        Dense array with one row per document and one column per term.
    """
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(S).toarray()

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    df = pd.DataFrame(X, columns=feature_names)
    print(df.head(25))
    return X

In [179]:
def getTfidfVec(S):
    """Vectorize text documents into a dense TF-IDF matrix.

    Fits a fresh ``TfidfVectorizer`` on ``S``, prints the first 25 rows of
    the result as a DataFrame (one column per vocabulary term) and returns
    the underlying array.

    Parameters
    ----------
    S : iterable of str
        Raw text documents to vectorize.

    Returns
    -------
    numpy.ndarray
        Dense array with one row per document and one column per term.
    """
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(S).toarray()

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    df = pd.DataFrame(X, columns=feature_names)
    print(df.head(25))
    return X

In [171]:
# Bag-of-words count matrix for every sentence (the helper also prints it).
Count = getCountVec(sentences)

   acting  action  and  bad  boring  ever  good  have  like  movie  seen  the  \
0       0       0    0    0       0     0     1     0     0      1     0    0   
1       0       0    0    0       1     0     0     0     0      0     0    0   
2       0       1    0    0       0     0     1     0     0      1     0    0   
3       0       0    0    0       0     0     0     0     1      1     0    0   
4       1       0    1    1       1     0     0     0     0      1     0    0   
5       0       0    0    1       1     0     0     0     0      1     0    0   
6       1       0    0    0       0     0     1     0     0      0     0    0   
7       0       0    0    0       0     1     0     1     0      1     1    1   

   this  was  worst  
0     1    1      0  
1     1    1      0  
2     0    0      0  
3     1    0      0  
4     0    0      0  
5     0    0      0  
6     0    0      0  
7     0    0      1  


In [172]:
# TF-IDF matrix for every sentence (the helper also prints it).
TFIDF = getTfidfVec(sentences)

     acting   action       and       bad    boring      ever      good  \
0  0.000000  0.00000  0.000000  0.000000  0.000000  0.000000  0.511617   
1  0.000000  0.00000  0.000000  0.000000  0.546934  0.000000  0.000000   
2  0.000000  0.75107  0.000000  0.000000  0.000000  0.000000  0.543168   
3  0.000000  0.00000  0.000000  0.000000  0.000000  0.000000  0.000000   
4  0.470158  0.00000  0.560996  0.470158  0.405708  0.000000  0.000000   
5  0.000000  0.00000  0.000000  0.690041  0.595449  0.000000  0.000000   
6  0.757092  0.00000  0.000000  0.000000  0.000000  0.000000  0.653308   
7  0.000000  0.00000  0.000000  0.000000  0.000000  0.436448  0.000000   

       have     like     movie      seen       the      this       was  \
0  0.000000  0.00000  0.353517  0.000000  0.000000  0.511617  0.592892   
1  0.000000  0.00000  0.000000  0.000000  0.000000  0.546934  0.633819   
2  0.000000  0.00000  0.375318  0.000000  0.000000  0.000000  0.000000   
3  0.000000  0.75107  0.375318  0.000

## Prepare data for train and test

In [173]:
# Train on as many leading sentences as there are labels; the model cell
# slices both the feature matrix and the sentences at this index.
train_size = len(labels)

## Model training & predictions

In [174]:
# Build the SVM classifier and fit it on the first `train_size` rows of the
# count matrix (fit() returns the estimator itself, so the call can be chained).
model = SVC(probability=True).fit(Count[:train_size], labels[:train_size])

# Classify every remaining row.
unlabelled = sentences[train_size:]
pred = model.predict(Count[train_size:])

# Show each remaining sentence next to its predicted label.
print([[sentence, label] for sentence, label in zip(unlabelled, pred)])

[['bad boring movie', 0], ['good acting', 1], ['the worst movie I have ever seen', 1]]


# Tips for A6
--------

### 1d) Convert Text data into vector 

We will now create a `CountVectorizer` object to transform the text data into vectors with numerical values. 

To do so, we will initialize a `CountVectorizer` object and name it `vectorizer`.

We need to pass 4 arguments to initialize a CountVectorizer:
  1. `analyzer`: `'word'` 
          Analyze the data at the word level.
  2. `max_features`: `2000`
          Set a max number of unique words.
  3. `tokenizer`: `word_tokenize`
          Tokenize the text data using the `word_tokenize` function from NLTK.
  4. `stop_words`: `stopwords.words('english')`
          Set to remove all stopwords in English. We do this since they generally don't provide useful discriminative information.

**Tip**: Make a single call to `CountVectorizer` (don't call `fit` or `fit_transform`).

### 1g) Defining the train & test sets

We first set 80% of the data as the training set to train an SVM classifier. We will then test the learnt classifier on the remaining 20% of data samples (test set). (Reminder: For this homework assignment, we've already shuffled the data)

- Calculate the number of training data samples (80% of total) and store it in `num_training`
- Calculate the number of test data samples (20% of total) and store it in `num_testing`
- Make sure both of these variables are of type `int`

**Tip**: Make sure `num_training` and `num_testing` are integers, and make sure they sum to the total data size.



In [177]:
# Sizing pitfall: int() truncates each product on its own (9.5 -> 9 and
# 0.5 -> 0), so the two sizes do not add up to len(a) and the check prints False.
# NOTE(review): this rebinds `train_size`, which the model cell above also
# uses -- presumably unintended; confirm before re-running cells out of order.
a = [1] * 10
train_size, test_size = (int(len(a) * share) for share in (0.95, 0.05))

print(train_size, test_size)
print(train_size + test_size == len(a))

9 0
False
