## Movie Comments Analysis.

In [37]:
# Import libraries.
from pyforest import *

In [38]:
# Make width of the column viewable
pd.set_option('display.max_colwidth', None)


<IPython.core.display.Javascript object>

In [39]:
# Import and get a snippet of the data.
df = pd.read_csv("imdb_labelled.csv")
df.head(10)

<IPython.core.display.Javascript object>

Unnamed: 0,text,label
0,"A very, very, very slow-moving, aimless movie about a distressed, drifting young man.",0
1,"Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.",0
2,"Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.",0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.,1
5,"The rest of the movie lacks art, charm, meaning... If it's about emptiness, it works I guess because it's empty.",0
6,Wasted two hours.,0
7,"Saw the movie today and thought it was a good effort, good messages for kids.",1
8,A bit predictable.,0
9,Loved the casting of Jimmy Buffet as the science teacher.,1


- There only contains two columns i.e text and label.

In [40]:
df['label'].unique()

array([0, 1])

- There only exists two values of labels:
- '0' to represent a negative sentiment.
- '1' to represent a positive sentiment.

In [41]:
df['label'].value_counts()

1    386
0    362
Name: label, dtype: int64

In [42]:
# Get first text in the dataset.
sample = df['text'][0]
print(type(sample))
sample

<class 'str'>


'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

In [43]:
# Breakdown large segments of data into substrings - TOKENIZATION.
# Create a tokenizer - function that generates tokens.
# Import package
from nltk import word_tokenize
nltk.download('punkt')

sample_tokens = word_tokenize(sample)
sample_tokens

<IPython.core.display.Javascript object>

[nltk_data] Downloading package punkt to /home/njogu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['A',
 'very',
 ',',
 'very',
 ',',
 'very',
 'slow-moving',
 ',',
 'aimless',
 'movie',
 'about',
 'a',
 'distressed',
 ',',
 'drifting',
 'young',
 'man',
 '.']

In [44]:
len(sample_tokens)

18

### Implement a bigrams (Grouping adjacent tokens)

In [45]:
# Import Bigrams Library.
from nltk import bigrams

# create the bigrams
bitokens = list(bigrams(sample_tokens))
bitokens

[('A', 'very'),
 ('very', ','),
 (',', 'very'),
 ('very', ','),
 (',', 'very'),
 ('very', 'slow-moving'),
 ('slow-moving', ','),
 (',', 'aimless'),
 ('aimless', 'movie'),
 ('movie', 'about'),
 ('about', 'a'),
 ('a', 'distressed'),
 ('distressed', ','),
 (',', 'drifting'),
 ('drifting', 'young'),
 ('young', 'man'),
 ('man', '.')]

In [46]:
# Check frequency of tokens.
# Import Frequency Distribution library.
from nltk import FreqDist

sample_freq = FreqDist(sample_tokens)
sample_freq

FreqDist({',': 4, 'very': 3, 'A': 1, 'slow-moving': 1, 'aimless': 1, 'movie': 1, 'about': 1, 'a': 1, 'distressed': 1, 'drifting': 1, ...})

In [47]:
# most frequent tokens.
sample_freq.most_common(10)

[(',', 4),
 ('very', 3),
 ('A', 1),
 ('slow-moving', 1),
 ('aimless', 1),
 ('movie', 1),
 ('about', 1),
 ('a', 1),
 ('distressed', 1),
 ('drifting', 1)]

In [48]:
# Create a function that does tokenization and checks their frequency distribution
# given a text and returns the top n common tokens.

def top_n(text, n):
    # Create tokens.
    tokens= word_tokenize(text)

    # Create frequency distribution.
    token_freq = FreqDist(tokens)

    # return top n tokens.
    return token_freq.most_common(n)

# test the function
top_n(df['text'][2],10)


[('and', 3),
 ('the', 3),
 ('-', 2),
 ('Attempting', 1),
 ('artiness', 1),
 ('with', 1),
 ('black', 1),
 ('&', 1),
 ('white', 1),
 ('clever', 1)]

### Implement Document-Term Matrix (DTM)
- DTM - Indicates how many times a token apears in a sentence or statement.

In [58]:
# import library.
from sklearn.feature_extraction.text import CountVectorizer

# Create a DTM function.
def create_dtm(series):
    # instantiate class
    count_vect = CountVectorizer()

    # create a DTM from the provided series.
    dtm = count_vect.fit_transform(series)
    
    # the dtm creates a sparse array. 
    # Therefore, we convert it to a dense array.
    dtm = dtm.todense()

    # Get column names.
    cols = count_vect.get_feature_names()

    # create a dataframe from the matrix.
    dtm_df = pd.DataFrame(dtm, columns=cols)

    # Return the dataframe.
    return dtm_df

# Try the dtm function.
create_dtm(df.text.head(10))

<IPython.core.display.Javascript object>

Unnamed: 0,about,acting,aimless,almost,and,angles,anything,art,artiness,as,...,walked,was,wasted,when,white,who,whom,with,works,young
0,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,1,1,0,0,0
2,0,1,0,1,3,1,0,0,1,1,...,0,1,0,0,1,0,0,1,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
5,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### Convert the functions into a Machine Learning model.

In [59]:
# Create a function that takes in 'text', 'sentiment/(label)' and 'n' - 
# to get top 'n' important tokens.
# There tokens/features shall be used to predict the 'sentiment' of the .tex'.
# We shall use Logistic Regression.
# Import library.

from sklearn.linear_model import LogisticRegression

def top_n_tokens(text, sentiment, n):
    # Instantiate class.
    logi_reg = LogisticRegression(max_iter=2600, random_state=254)
    cont_vect = CountVectorizer()

    # Create DTM
    dtm = cont_vect.fit_transform(text)

    # Fit the Model.
    logi_reg.fit(dtm, sentiment)

    # Obtain coefficients.
    coefs = logi_reg.coef_[0]

    # Create columns
    cols = cont_vect.get_feature_names()

    # Create a dataframe
    df = pd.DataFrame({'Tokens': cols, 'Coefficients': coefs})

    # Return largest n
    return df.nlargest(n, 'Coefficients')

# Test the function
top_n_tokens(df.text, df.label, 12)
    


<IPython.core.display.Javascript object>

Unnamed: 0,Tokens,Coefficients
1567,liked,1.286747
2997,wonderful,1.242158
1104,funny,1.112821
1182,great,1.068772
2949,well,1.043139
246,beautiful,1.042833
0,10,1.035405
344,brilliant,1.01408
908,excellent,1.009914
2203,right,0.985806


- Note that the Tokens with the highest coefficients indicate a positive sentiment.
- Therefore, they are the most important features for they indicate a strong positive sentiment.

In [60]:
# We look at the least important features (Those that convey negative sentiment)
from sklearn.linear_model import LogisticRegression

def bottom_n_tokens(text, sentiment, n):
    # Instantiate class.
    logi_reg = LogisticRegression(max_iter=2600, random_state=254)
    cont_vect = CountVectorizer()

    # Create DTM
    dtm = cont_vect.fit_transform(text)

    # Fit the Model.
    logi_reg.fit(dtm, sentiment)

    # Obtain coefficients.
    coefs = logi_reg.coef_[0]

    # Create columns
    cols = cont_vect.get_feature_names()

    # Create a dataframe
    df = pd.DataFrame({'Tokens': cols, 'Coefficients': coefs})

    # Return largest n
    return df.nsmallest(n, 'Coefficients')

# Test the function
top_n_tokens(df.text, df.label, 12)

<IPython.core.display.Javascript object>

Unnamed: 0,Tokens,Coefficients
1567,liked,1.286747
2997,wonderful,1.242158
1104,funny,1.112821
1182,great,1.068772
2949,well,1.043139
246,beautiful,1.042833
0,10,1.035405
344,brilliant,1.01408
908,excellent,1.009914
2203,right,0.985806
