In [13]:
import numpy as np
import pandas as pd

In [14]:
# Show the full review text instead of truncating long strings in rendered tables
pd.set_option('display.max_colwidth', None)

# Source CSV (the URL points at one specific revision of the gist)
IMDB_CSV_URL = 'https://gist.githubusercontent.com/fmnobar/88703ec6a1f37b3eabf126ad38c392b8/raw/76b84540ccd4b0b207a6978eb7e9d938275886ff/imdb_labelled.csv'

# Load the reviews into a dataframe and display it
df = pd.read_csv(IMDB_CSV_URL)
df

Unnamed: 0,text,label
0,"A very, very, very slow-moving, aimless movie about a distressed, drifting young man.",0
1,"Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.",0
2,"Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.",0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.,1
...,...,...
743,I just got bored watching Jessice Lange take her clothes off!,0
744,"Unfortunately, any virtue in this film's production work was lost on a regrettable script.",0
745,"In a word, it is embarrassing.",0
746,Exceptionally bad!,0


In [15]:
df['label'].value_counts()

1    386
0    362
Name: label, dtype: int64

In [16]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [17]:
sample = df.text[0]
sample

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

### Tokenize the text

In [18]:
import nltk
from nltk import word_tokenize
# word_tokenize needs the Punkt sentence tokenizer data. Recent NLTK releases
# load it from the 'punkt_tab' resource, older ones from 'punkt', so fetch both
# to keep this cell working on a fresh environment with either version.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

# Split the sample review into word and punctuation tokens and show the first 10
sample_tokens = word_tokenize(sample)
sample_tokens[:10]

[nltk_data] Downloading package punkt to /home/karen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['A', 'very', ',', 'very', ',', 'very', 'slow-moving', ',', 'aimless', 'movie']

In [19]:
from nltk import bigrams

sample_bitokens = list(bigrams(sample_tokens))
sample_bitokens

[('A', 'very'),
 ('very', ','),
 (',', 'very'),
 ('very', ','),
 (',', 'very'),
 ('very', 'slow-moving'),
 ('slow-moving', ','),
 (',', 'aimless'),
 ('aimless', 'movie'),
 ('movie', 'about'),
 ('about', 'a'),
 ('a', 'distressed'),
 ('distressed', ','),
 (',', 'drifting'),
 ('drifting', 'young'),
 ('young', 'man'),
 ('man', '.')]

#### Check Frequency of Words

In [20]:
from nltk import FreqDist
sample_freqdist = FreqDist(sample_tokens)
sample_freqdist.most_common(10)

[(',', 4),
 ('very', 3),
 ('A', 1),
 ('slow-moving', 1),
 ('aimless', 1),
 ('movie', 1),
 ('about', 1),
 ('a', 1),
 ('distressed', 1),
 ('drifting', 1)]

In [22]:
# Create a function to accept a text and n and returns top n most common tokens
def top_n(text, n):
    """Return the ``n`` most frequent tokens in ``text``.

    The text is split with ``word_tokenize`` and counted with ``FreqDist``.
    The result is a list of ``(token, count)`` pairs as produced by
    ``FreqDist.most_common``.
    """
    # Tokenize, count, and pick the n highest counts in a single pipeline
    return FreqDist(word_tokenize(text)).most_common(n)

# Try it on the sample
top_n(df.text[1], 10)

[('the', 2),
 ('Not', 1),
 ('sure', 1),
 ('who', 1),
 ('was', 1),
 ('more', 1),
 ('lost', 1),
 ('-', 1),
 ('flat', 1),
 ('characters', 1)]

#### Document-Term Matrix (DTM)

In [26]:
# Import the package
from sklearn.feature_extraction.text import CountVectorizer

def create_dtm(series):
    """Build a document-term matrix (DTM) for a collection of texts.

    Parameters
    ----------
    series : iterable of str
        The documents to vectorize, e.g. a pandas Series of strings.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per token found by
        ``CountVectorizer``; each cell is the count of that token in that
        document.
    """
    # Create an instance of the class
    cv = CountVectorizer()

    # Create a (sparse) document term matrix from the provided series
    dtm = cv.fit_transform(series)

    # get_feature_names() was removed in scikit-learn 1.2; use the replacement
    # when it exists and fall back so older installations keep working.
    if hasattr(cv, 'get_feature_names_out'):
        features = cv.get_feature_names_out()
    else:
        features = cv.get_feature_names()

    # toarray() yields a regular ndarray (todense() returns np.matrix instead)
    return pd.DataFrame(dtm.toarray(), columns = features)

# Try the function on the top 5 rows of the df['text']
create_dtm(df.text.head())

Unnamed: 0,about,acting,aimless,almost,and,angles,anything,artiness,as,attempting,...,trying,very,walked,was,when,white,who,whom,with,young
0,1,0,1,0,0,0,0,0,0,0,...,0,3,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,1,1,0,0
2,0,1,0,1,3,1,0,1,1,1,...,0,0,0,1,0,1,0,0,1,0
3,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,0,0,0


#### Feature Importance

In [28]:
# Import logistic regression
from sklearn.linear_model import LogisticRegression

def top_n_tokens(text, sentiment, n):
    """Return the ``n`` tokens with the largest logistic-regression coefficients.

    A ``CountVectorizer`` turns ``text`` into token counts, a
    ``LogisticRegression`` is fitted on those counts against ``sentiment``,
    and the fitted coefficients are ranked.

    Parameters
    ----------
    text : iterable of str
        The documents to vectorize.
    sentiment : array-like
        The target label for each document.
    n : int
        How many tokens to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Tokens'`` and ``'Coefficients'``, holding the ``n`` rows
        with the largest coefficients.
    """
    # Create an instance of each class
    lgr = LogisticRegression(solver = 'lbfgs', max_iter = 2500, random_state = 1234)
    cv = CountVectorizer()

    # Create the DTM
    dtm = cv.fit_transform(text)

    # Fit the logistic regression model
    lgr.fit(dtm, sentiment)

    # Take the first row of coefficients (the only row when the target is binary)
    coefs = lgr.coef_[0]

    # get_feature_names() was removed in scikit-learn 1.2; use the replacement
    # when it exists and fall back so older installations keep working.
    if hasattr(cv, 'get_feature_names_out'):
        features = cv.get_feature_names_out()
    else:
        features = cv.get_feature_names()

    # Use a distinct local name so the notebook-level `df` is not shadowed
    coef_df = pd.DataFrame({'Tokens' : features, 'Coefficients' : coefs})

    # Return the largest n
    return coef_df.nlargest(n, 'Coefficients')

# Test it on the df['text']
top_n_tokens(df.text, df.label, 10)

Unnamed: 0,Tokens,Coefficients
1567,liked,1.286747
2997,wonderful,1.242158
1104,funny,1.112821
1182,great,1.068772
2949,well,1.043139
246,beautiful,1.042833
0,10,1.035405
344,brilliant,1.01408
908,excellent,1.009914
2203,right,0.985806


In [29]:
def bottom_n_tokens(text, sentiment, n):
    """Return the ``n`` tokens with the smallest logistic-regression coefficients.

    A ``CountVectorizer`` turns ``text`` into token counts, a
    ``LogisticRegression`` is fitted on those counts against ``sentiment``,
    and the fitted coefficients are ranked.

    Parameters
    ----------
    text : iterable of str
        The documents to vectorize.
    sentiment : array-like
        The target label for each document.
    n : int
        How many tokens to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Tokens'`` and ``'Coefficients'``, holding the ``n`` rows
        with the smallest (most negative) coefficients.
    """
    # Create an instance of each class
    lgr = LogisticRegression(solver = 'lbfgs', max_iter = 2500, random_state = 1234)
    cv = CountVectorizer()

    # Create the DTM
    dtm = cv.fit_transform(text)

    # Fit the logistic regression model
    lgr.fit(dtm, sentiment)

    # Take the first row of coefficients (the only row when the target is binary)
    coefs = lgr.coef_[0]

    # get_feature_names() was removed in scikit-learn 1.2; use the replacement
    # when it exists and fall back so older installations keep working.
    if hasattr(cv, 'get_feature_names_out'):
        features = cv.get_feature_names_out()
    else:
        features = cv.get_feature_names()

    # Use a distinct local name so the notebook-level `df` is not shadowed
    coef_df = pd.DataFrame({'Tokens' : features, 'Coefficients' : coefs})

    # Return the smallest n
    return coef_df.nsmallest(n, 'Coefficients')

# Test it on the df['text']
bottom_n_tokens(df.text, df.label, 10)

Unnamed: 0,Tokens,Coefficients
222,bad,-1.872751
211,awful,-1.334554
2530,stupid,-1.175416
441,cheap,-1.139512
1802,no,-1.137234
893,even,-1.091436
3017,would,-1.047931
3012,worst,-1.039231
2923,waste,-1.038206
1819,nothing,-0.973472


In [36]:
!pip install textblob 

import textblob
from textblob import TextBlob

def polarity_subjectivity(text = sample, print_results = False):
    """Compute TextBlob polarity and subjectivity for ``text``.

    Parameters
    ----------
    text : str
        The text to score. The default is whatever the global ``sample``
        held when this function was defined.
    print_results : bool
        When True, print both scores rounded to 2 decimals and return None.
        When False, return the unrounded ``(polarity, subjectivity)`` tuple.
    """
    # Score the text once and pull out the two components by position
    scores = TextBlob(text).sentiment
    polarity, subjectivity = scores[0], scores[1]

    # Default path: hand the raw scores back to the caller
    if not print_results:
        return (polarity, subjectivity)

    # Otherwise report the rounded scores (the function then returns None)
    print(f"Polarity is {round(polarity, 2)} and subjectivity is {round(subjectivity, 2)}.")
    
# Test the function on our sample
polarity_subjectivity(sample, print_results = True)

Collecting textblob
  Using cached textblob-0.17.1-py2.py3-none-any.whl (636 kB)
Installing collected packages: textblob
Successfully installed textblob-0.17.1
Polarity is 0.18 and subjectivity is 0.4.


In [37]:
# Define the first function that counts the number of tokens in a given string
def token_count(string):
    """Return how many tokens ``word_tokenize`` produces for ``string``."""
    tokens = word_tokenize(string)
    return len(tokens)

# Define the second function that applies the token_count function to a given Pandas Series
def series_tokens(series):
    """Return a Series with the ``token_count`` of each element of ``series``."""
    counts = series.apply(token_count)
    return counts

# Apply the function to the top 10 rows of the dataframe
series_tokens(df.text.head(10))

0    18
1    21
2    33
3     9
4    22
5    27
6     4
7    17
8     4
9    11
Name: text, dtype: int64

In [38]:
# Define the function
def series_polarity_subjectivity(series):
    """Score every element of ``series`` with ``polarity_subjectivity``.

    The helper is called with its default ``print_results=False``, so each
    element of the result is a ``(polarity, subjectivity)`` tuple.
    """
    scored = series.apply(polarity_subjectivity)
    return scored

# Apply to the top 10 rows of the df['text']
series_polarity_subjectivity(df['text'].head(10))

0                                 (0.18, 0.395)
1    (0.014583333333333337, 0.4201388888888889)
2    (-0.12291666666666666, 0.5145833333333333)
3                  (-0.24375000000000002, 0.65)
4                                    (1.0, 0.3)
5                                   (-0.1, 0.5)
6                                   (-0.2, 0.0)
7                     (0.7, 0.6000000000000001)
8                                   (-0.2, 0.5)
9                                    (0.7, 0.8)
Name: text, dtype: object

#### Measure of Complexity — Lexical Diversity

In [39]:
def complexity(string):
    """Return the lexical diversity of ``string``.

    Lexical diversity is the number of unique tokens divided by the total
    number of tokens, both taken from ``word_tokenize``.

    Returns
    -------
    float
        The unique/total ratio, or ``0.0`` when the text yields no tokens
        (e.g. an empty string), instead of raising ``ZeroDivisionError``.
    """
    # Tokenize once and reuse the result for both counts
    total_tokens = word_tokenize(string)

    # No tokens means there is nothing to measure; avoid dividing by zero
    if not total_tokens:
        return 0.0

    # A set keeps only the unique tokens
    unique_tokens = set(total_tokens)

    # Return the complexity measure
    return len(unique_tokens) / len(total_tokens)

# Apply to the top 10 rows of the dataframe
df.text.head(10).apply(complexity)

0    0.722222
1    0.952381
2    0.848485
3    1.000000
4    1.000000
5    0.814815
6    1.000000
7    0.941176
8    1.000000
9    0.909091
Name: text, dtype: float64

#### Text Cleanup — Stopwords and Non-Alphabeticals

In [41]:
# Download the NLTK stopwords corpus so stopwords.words() below can read it,
# then import the corpus reader
nltk.download('stopwords')
from nltk.corpus import stopwords

# Select only English stopwords
english_stop_words = stopwords.words('english')

# Print the first 20
print(english_stop_words[:20])

[nltk_data] Downloading package stopwords to /home/karen/nltk_data...


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


[nltk_data]   Unzipping corpora/stopwords.zip.


In [42]:
def stopword_remover(string):
    """Tokenize ``string`` and drop English stopwords.

    The comparison is case-insensitive (each token is lowercased before the
    lookup), but surviving tokens keep their original casing. Only stopwords
    are removed; punctuation tokens are kept.

    Returns
    -------
    list of str
        The tokens of ``string`` that are not English stopwords, in order.
    """
    # Tokenize the string
    tokens = word_tokenize(string)

    # Use a set so each membership test is O(1) instead of scanning a list
    english_stopwords = set(stopwords.words('english'))

    # Return non-stopwords
    return [w for w in tokens if w.lower() not in english_stopwords]

# Apply to the top 5 rows of our df['text']
df.text.head(5).apply(stopword_remover)

0                                                                                                 [,, ,, slow-moving, ,, aimless, movie, distressed, ,, drifting, young, man, .]
1                                                                                                        [sure, lost, -, flat, characters, audience, ,, nearly, half, walked, .]
2    [Attempting, artiness, black, &, white, clever, camera, angles, ,, movie, disappointed, -, became, even, ridiculous, -, acting, poor, plot, lines, almost, non-existent, .]
3                                                                                                                                            [little, music, anything, speak, .]
4                                                                                                     [best, scene, movie, Gerardo, trying, find, song, keeps, running, head, .]
Name: text, dtype: object