In [60]:
import numpy as np
import pandas as pd
import textstat
import spacy

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

# Basic features and readability of scores

## Introduction to NLP feature engineering

One method of feature engineering that I'm familiar with is One-hot encoding. This is really only feasible for small datasets.

In [2]:
df = pd.DataFrame(data = ['Male', 'Male', 'Female'], columns= ['Sex'])
df

Unnamed: 0,Sex
0,Male
1,Male
2,Female


In [3]:
pd.get_dummies(df, columns= ['Sex'])

Unnamed: 0,Sex_Female,Sex_Male
0,0,1
1,0,1
2,1,0


## Basic feature extraction

- Character count
- Word count
- Hashtag count

In [4]:
# Character Count
sentence = 'I don\'t know'
len(sentence)

12

In [5]:
df['num_chars'] = df['Sex'].apply(len)

In [6]:
df

Unnamed: 0,Sex,num_chars
0,Male,4
1,Male,4
2,Female,6


In [7]:
# Number of words
words = sentence.split()
len(words)

3

In [8]:
def word_counts(string):
    word = string.split()
    return len(word)

df['word_count'] = df['Sex'].apply(word_counts)

In [9]:
df

Unnamed: 0,Sex,num_chars,word_count
0,Male,4,1
1,Male,4,1
2,Female,6,1


In [10]:
# Finding hastags
def hashtag_count(string):
    words = string.split()

    hashtag = [word for word in words if word.startswith('#')]
    return len(hashtag)

In [11]:
russia = pd.read_csv('/Users/Dillon/OneDrive/Documents/DataCampML/russian_tweets.csv')

In [12]:
russia.head()

Unnamed: 0.1,Unnamed: 0,content
0,127447,LIVE STREAM VIDEO=> Donald Trump Rallies in Co...
1,123642,Muslim Attacks NYPD Cops with Meat Cleaver. Me...
2,226970,.@vfpatlas well that's a swella word there (di...
3,138339,RT wehking_pamela: Bobby_Axelrod2k MMFlint don...
4,161610,Жители обстреливаемых районов Донецка проводят...


In [13]:
russia['word_count'] = russia['content'].apply(word_counts)
russia.head()

Unnamed: 0.1,Unnamed: 0,content,word_count
0,127447,LIVE STREAM VIDEO=> Donald Trump Rallies in Co...,15
1,123642,Muslim Attacks NYPD Cops with Meat Cleaver. Me...,15
2,226970,.@vfpatlas well that's a swella word there (di...,12
3,138339,RT wehking_pamela: Bobby_Axelrod2k MMFlint don...,14
4,161610,Жители обстреливаемых районов Донецка проводят...,13


In [14]:
russia['hashtags'] = russia['content'].apply(hashtag_count)
russia.head()

Unnamed: 0.1,Unnamed: 0,content,word_count,hashtags
0,127447,LIVE STREAM VIDEO=> Donald Trump Rallies in Co...,15,0
1,123642,Muslim Attacks NYPD Cops with Meat Cleaver. Me...,15,1
2,226970,.@vfpatlas well that's a swella word there (di...,12,0
3,138339,RT wehking_pamela: Bobby_Axelrod2k MMFlint don...,14,0
4,161610,Жители обстреливаемых районов Донецка проводят...,13,1


In [15]:
russia['word_count'].mean()

13.085

## Readability tests

Readability tests are used to identifiy the readability of each piece of text. 
Tests include: 
- Flesch reading ease
- Gunning fog index
- Simple Measure fo Gobbledygook (SMOG)
- Dale-Chall score

In [16]:
text1 = 'Test this sentence.'
text2 = 'I hope this is a more complicated string of characters.'

In [17]:
textstat.flesch_reading_ease(text1)

93.81

In [18]:
textstat.flesch_reading_ease(text2)

69.79

In [19]:
ted = pd.read_csv('/Users/Dillon/OneDrive/Documents/DataCampML/ted.csv')

In [20]:
ted.head()

Unnamed: 0,transcript,url
0,"We're going to talk — my — a new lecture, just...",https://www.ted.com/talks/al_seckel_says_our_b...
1,"This is a representation of your brain, and yo...",https://www.ted.com/talks/aaron_o_connell_maki...
2,It's a great honor today to share with you The...,https://www.ted.com/talks/carter_emmart_demos_...
3,"My passions are music, technology and making t...",https://www.ted.com/talks/jared_ficklin_new_wa...
4,It used to be that if you wanted to get a comp...,https://www.ted.com/talks/jeremy_howard_the_wo...


In [21]:
ted['score'] = pd.Series()

  ted['score'] = pd.Series()


In [22]:
for i in range(ted.shape[0]):
    ted.iloc[i, -1] = textstat.flesch_reading_ease(ted.iloc[i,0])

In [23]:
ted.head()

Unnamed: 0,transcript,url,score
0,"We're going to talk — my — a new lecture, just...",https://www.ted.com/talks/al_seckel_says_our_b...,82.65
1,"This is a representation of your brain, and yo...",https://www.ted.com/talks/aaron_o_connell_maki...,77.06
2,It's a great honor today to share with you The...,https://www.ted.com/talks/carter_emmart_demos_...,63.32
3,"My passions are music, technology and making t...",https://www.ted.com/talks/jared_ficklin_new_wa...,64.07
4,It used to be that if you wanted to get a comp...,https://www.ted.com/talks/jeremy_howard_the_wo...,65.46


In [24]:
ted['score'].mean()

64.94851999999999

# Text Preprocessing, POS tagging, and NER

## Tokenization and Lemmatization

Text needs to be machine fiendly. reduction, REDUCING, Reduce all invoke the same meaning but machines won't see this distinction.

Tokenization splits sentences into it's base parts. 

Lemmatization reduces words to their base words.

In [25]:
text = ted.iloc[0,0]

In [26]:
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)

In [27]:
tokens = [token.text for token in doc]
tokens

['We',
 "'re",
 'going',
 'to',
 'talk',
 '—',
 'my',
 '—',
 'a',
 'new',
 'lecture',
 ',',
 'just',
 'for',
 'TED',
 '—',
 'and',
 'I',
 "'m",
 'going',
 'show',
 'you',
 'some',
 'illusions',
 'that',
 'we',
 "'ve",
 'created',
 'for',
 'TED',
 ',',
 'and',
 'I',
 "'m",
 'going',
 'to',
 'try',
 'to',
 'relate',
 'this',
 'to',
 'happiness',
 '.',
 'What',
 'I',
 'was',
 'thinking',
 'about',
 'with',
 'happiness',
 'is',
 ',',
 'what',
 'gives',
 'happiness',
 '—',
 'or',
 'happiness',
 ',',
 'which',
 'I',
 'equate',
 'with',
 'joy',
 'in',
 'my',
 'particular',
 'area',
 ',',
 'and',
 'I',
 'think',
 'there',
 "'s",
 'something',
 'very',
 'fundamental',
 '.',
 'And',
 'I',
 'was',
 'thinking',
 'about',
 'this',
 '.',
 'And',
 'it',
 "'s",
 'in',
 'terms',
 'of',
 'both',
 'illusions',
 'and',
 'movies',
 'that',
 'we',
 'go',
 'see',
 'and',
 'jokes',
 'and',
 'magic',
 'shows',
 'is',
 'that',
 'there',
 "'s",
 'something',
 'about',
 'these',
 'things',
 'where',
 'our',
 'exp

In [28]:
lemmas = [token.lemma_ for token in doc]
lemmas

['we',
 'be',
 'go',
 'to',
 'talk',
 '—',
 'my',
 '—',
 'a',
 'new',
 'lecture',
 ',',
 'just',
 'for',
 'TED',
 '—',
 'and',
 'I',
 'be',
 'go',
 'show',
 'you',
 'some',
 'illusion',
 'that',
 'we',
 "'ve",
 'create',
 'for',
 'TED',
 ',',
 'and',
 'I',
 'be',
 'go',
 'to',
 'try',
 'to',
 'relate',
 'this',
 'to',
 'happiness',
 '.',
 'what',
 'I',
 'be',
 'think',
 'about',
 'with',
 'happiness',
 'be',
 ',',
 'what',
 'give',
 'happiness',
 '—',
 'or',
 'happiness',
 ',',
 'which',
 'I',
 'equate',
 'with',
 'joy',
 'in',
 'my',
 'particular',
 'area',
 ',',
 'and',
 'I',
 'think',
 'there',
 'be',
 'something',
 'very',
 'fundamental',
 '.',
 'and',
 'I',
 'be',
 'think',
 'about',
 'this',
 '.',
 'and',
 'it',
 'be',
 'in',
 'term',
 'of',
 'both',
 'illusion',
 'and',
 'movie',
 'that',
 'we',
 'go',
 'see',
 'and',
 'joke',
 'and',
 'magic',
 'show',
 'be',
 'that',
 'there',
 'be',
 'something',
 'about',
 'these',
 'thing',
 'where',
 'our',
 'expectation',
 'be',
 'violate

In [29]:
lemma_text = ' '.join(lemmas)

In [30]:
lemma_text

'we be go to talk — my — a new lecture , just for TED — and I be go show you some illusion that we \'ve create for TED , and I be go to try to relate this to happiness . what I be think about with happiness be , what give happiness — or happiness , which I equate with joy in my particular area , and I think there be something very fundamental . and I be think about this . and it be in term of both illusion and movie that we go see and joke and magic show be that there be something about these thing where our expectation be violate in some sort of pleasing way . you go see a movie . and it have an unexpected twist — something that you do n\'t expect — and you find a joyful experience . you look at those sort of illusion in my book and it be not as what you \'d expect . and there be something joyful about it . and it be the same thing with joke and all these sort of thing . so , what I be go to try and do in my lecture be go a little bit far and see if I can violate your expectation in a

## Text cleaning

We might want to remove punctuations, stop words, whitespaces, emojis etc. 

In [31]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS
stopwords

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [32]:
stop_lemmas = [lemma for lemma in lemmas if lemma not in stopwords]
print(len(lemmas))
print(len(stop_lemmas))

2032
918


In [33]:
# Clean the full Ted dataset:
def clean_full(text):
    nlp = spacy.load('en_core_web_sm')
    stopwords = spacy.lang.en.stop_words.STOP_WORDS

    doc = nlp(text)

    lemmas = [token.lemma_ for token in doc]
    clean_lemmas = [lemma for lemma in lemmas if lemma.isalpha() and lemma not in stopwords]

    return ' '.join(clean_lemmas)

ted['transcript'] = ted['transcript'].apply(clean_full)

In [34]:
ted.head()

Unnamed: 0,transcript,url,score
0,talk new lecture TED I illusion create TED I t...,https://www.ted.com/talks/al_seckel_says_our_b...,82.65
1,representation brain brain break left half log...,https://www.ted.com/talks/aaron_o_connell_maki...,77.06
2,great honor today share Digital Universe creat...,https://www.ted.com/talks/carter_emmart_demos_...,63.32
3,passion music technology thing combination thi...,https://www.ted.com/talks/jared_ficklin_new_wa...,64.07
4,use want computer new program programming requ...,https://www.ted.com/talks/jeremy_howard_the_wo...,65.46


# Part-of-speech tagging

This allows us to identify nouns, pronouns, adverbs, verbs, etc.

In [35]:
nlp

string = 'Dillon is an okay guitarist'
doc = nlp(string)

In [36]:
pos = [(token.text, token.pos_) for token in doc]
print(pos)

[('Dillon', 'PROPN'), ('is', 'AUX'), ('an', 'DET'), ('okay', 'ADJ'), ('guitarist', 'NOUN')]


In [37]:
pos[1].count('AUX')

1

## Named entity recognition

Good for search algorithms. Who, what, where, when, why.

In [38]:
text = 'Dillon Dilly is not a software engineer and does not work at Google. He might live in Ireland.'

In [39]:
nlp

doc = nlp(text)

names = [(ent.text, ent.label_) for ent in doc.ents]
names

[('Dillon Dilly', 'PERSON'), ('Ireland', 'GPE')]

# N-Gram Models

## Bag of words model

In [40]:
corpus = pd.Series([
    'The lion is the king of the jungle',
    'Lions have lifespans of a decade',
    'The lion is an endangered species'
])

In [41]:
vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(corpus)
matrix.toarray()

array([[0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 3],
       [0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0],
       [1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1]])

The vector above takes each word in corpus, alphabetizes it, then assigns a count for each instance. [0,0,0,1,1,1,0,1,0,1,0,3] corresponds to three 'the', for example. 

In [42]:
pd.DataFrame(data = matrix.toarray(), columns= vectorizer.get_feature_names())

Unnamed: 0,an,decade,endangered,have,is,jungle,king,lifespans,lion,lions,of,species,the
0,0,0,0,0,1,1,1,0,1,0,1,0,3
1,0,1,0,1,0,0,0,1,0,1,1,0,0
2,1,0,1,0,1,0,0,0,1,0,0,1,1


## Building a BoW Naive Bayes Classifier

In [43]:
reviews = pd.read_csv('/Users/Dillon/OneDrive/Documents/DataCampML/movie_reviews_clean.csv')

In [44]:
X_train, X_test, y_train, y_test = train_test_split(reviews['review'], reviews['sentiment'], test_size = 0.3)

In [45]:
vectorizer = CountVectorizer(lowercase = True, stop_words = 'english')
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [46]:
clf = MultinomialNB()
clf.fit(X_train_bow, y_train)
y_preds = clf.predict(X_test_bow)

clf.score(X_test_bow, y_test)

0.7933333333333333

## Building n-gram models

Bag of words would not distinguish between: 

'The movie was good and not boring'

and 

'The movie was not good and boring'

Context is lost. 

In [47]:
n_grams = CountVectorizer(ngram_range=(1,3))
ng = n_grams.fit_transform(corpus)

In [48]:
ng.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
        1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 3, 1, 1, 1, 1, 1],
       [0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
        0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1]])

# TF-IDF and Similarity Scores

In [55]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)
pd.DataFrame(data = tfidf_matrix.toarray(), columns = vectorizer.get_feature_names())

Unnamed: 0,an,decade,endangered,have,is,jungle,king,lifespans,lion,lions,of,species,the
0,0.0,0.0,0.0,0.0,0.254347,0.334435,0.334435,0.0,0.254347,0.0,0.254347,0.0,0.76304
1,0.0,0.467351,0.0,0.467351,0.0,0.0,0.0,0.467351,0.0,0.467351,0.355432,0.0,0.0
2,0.459548,0.0,0.459548,0.0,0.349498,0.0,0.0,0.0,0.349498,0.0,0.0,0.459548,0.349498


## Cosine Similarity

Cosine Similarity use the cosine rule between two vectors to determine how similar they are. Below, sentence one and two are similar to each other. Sentence two and three are not related at all.

In [57]:
cosine_similarity(tfidf_matrix, tfidf_matrix)

array([[1.        , 0.09040303, 0.44446827],
       [0.09040303, 1.        , 0.        ],
       [0.44446827, 0.        , 1.        ]])

## Building a plot line based recommender

In [64]:
movies = pd.read_csv('/Users/Dillon/OneDrive/Documents/DataCampML/movie_overviews.csv')

In [65]:
movies.dropna(axis = 0, inplace= True)

In [69]:
movie_plots = movies['tagline']

In [70]:
tfidf = TfidfVectorizer(stop_words = 'english')
tfidf_matrix = tfidf.fit_transform(movie_plots)
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [75]:
movies.drop(labels=['id', 'overview'], axis = 1, inplace = True)

In [77]:
indices = pd.Series(movies.index, index = movies['title']).drop_duplicates()

In [80]:
def get_recommendations(title, cosine_sim, indices):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return movies['title'].iloc[movie_indices]

In [83]:
get_recommendations('Jumanji', cosine_sim, indices)

164                            Nine Months
1352                          Home Alone 3
6330                      Tristan & Isolde
3012       Braddock: Missing in Action III
1912                           First Blood
1918                              Rocky II
2697                            Volunteers
6679                  In the Land of Women
2845    The Flintstones in Viva Rock Vegas
8519                         Cheap Thrills
Name: title, dtype: object

In [85]:
idx = indices['Jumanji']
idx

1

In [89]:
sim_scores = list(enumerate(cosine_sim[idx]))
sim_scores

[(0, 0.0),
 (1, 0.9999999999999999),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.0),
 (18, 0.0),
 (19, 0.0),
 (20, 0.0),
 (21, 0.0),
 (22, 0.16415893412316887),
 (23, 0.06956552685539989),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30, 0.09916697059712323),
 (31, 0.0),
 (32, 0.0),
 (33, 0.0),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.0),
 (42, 0.0),
 (43, 0.0),
 (44, 0.0),
 (45, 0.0),
 (46, 0.0),
 (47, 0.07679167467555116),
 (48, 0.0),
 (49, 0.0),
 (50, 0.0),
 (51, 0.0),
 (52, 0.0),
 (53, 0.0),
 (54, 0.0),
 (55, 0.0),
 (56, 0.0),
 (57, 0.0),
 (58, 0.0),
 (59, 0.07660304789367058),
 (60, 0.07852118719001788),
 (61, 0.0),
 (62, 0.0),
 (63, 0.0),
 (64, 0.0),
 (65, 0.0),
 (66, 0.0),
 (67, 0.0),
 (68, 0.0),
 (69, 0.0),
 (70, 0.0),
 (71, 0.0),
 (72, 0.0),
 (73, 0.0),
 (74, 0.0),

In [94]:
sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
sim_scores

[(1, 0.9999999999999999),
 (137, 0.4518977486629421),
 (1110, 0.4518977486629421),
 (4961, 0.3894291795954592),
 (2475, 0.37673255252472193),
 (1576, 0.3672227997244025),
 (1582, 0.36631610051048347),
 (2220, 0.3383271362447783),
 (5230, 0.31145919332912214),
 (2338, 0.30902027874318744),
 (6603, 0.2938363497209301),
 (2464, 0.28849902585847864),
 (2893, 0.2846827282463925),
 (1069, 0.2802939586700409),
 (1475, 0.2802939586700409),
 (1920, 0.2802939586700409),
 (1978, 0.2802939586700409),
 (2834, 0.2802939586700409),
 (3608, 0.2802939586700409),
 (3651, 0.2802939586700409),
 (5307, 0.2802939586700409),
 (6223, 0.2802939586700409),
 (4821, 0.2745349918177304),
 (6491, 0.2711898333589473),
 (2775, 0.2578730120293544),
 (470, 0.256860291820153),
 (6614, 0.256860291820153),
 (1514, 0.2558428023148334),
 (5976, 0.2502832738415275),
 (1023, 0.24900197007352326),
 (2531, 0.24900197007352326),
 (5243, 0.24900197007352326),
 (6868, 0.24900197007352326),
 (3133, 0.24874172454358212),
 (2936, 0.2

In [96]:
sim_scores = sim_scores[1:11]
sim_scores

[(1110, 0.4518977486629421),
 (4961, 0.3894291795954592),
 (2475, 0.37673255252472193),
 (1576, 0.3672227997244025),
 (1582, 0.36631610051048347),
 (2220, 0.3383271362447783),
 (5230, 0.31145919332912214),
 (2338, 0.30902027874318744),
 (6603, 0.2938363497209301)]

In [99]:
movie_recom = [i[0] for i in sim_scores]
movie_recom

[1110, 4961, 2475, 1576, 1582, 2220, 5230, 2338, 6603]

In [100]:
movies['title'].iloc[movie_recom]

1352                          Home Alone 3
6330                      Tristan & Isolde
3012       Braddock: Missing in Action III
1912                           First Blood
1918                              Rocky II
2697                            Volunteers
6679                  In the Land of Women
2845    The Flintstones in Viva Rock Vegas
8519                         Cheap Thrills
Name: title, dtype: object