In [7]:
import pandas as pd
import numpy as np

# Import pandas for data handling
import pandas as pd

# NLTK is our Natural-Language-Took-Kit
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Libraries for helping us with strings
import string
# Regular Expression Library
import re

# Import our text vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# Import our classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier


# Import some ML helper function
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report



# Import our metrics to evaluate our model
from sklearn import metrics


# Library for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# You may need to download these from nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stopwords = stopwords.words('english')

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/angellicacsavage/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/angellicacsavage/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/angellicacsavage/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
df = pd.read_csv('PowerRangers.csv')
df.head(65)

Unnamed: 0,season_title,episode_num,episode_title,air_date,IMDB_rating,total_votes,desc,season_num,producer
0,Mighty Morphin (Season 1),1,Day of the Dumpster,8/28/1993,7.4,687,Following the accidental release of long-impri...,1,Saban
1,Mighty Morphin (Season 1),2,High Five,9/4/1993,6.9,564,Rita plans to trap the Rangers in a time trap ...,1,Saban
2,Mighty Morphin (Season 1),3,Teamwork,9/8/1993,7.3,546,Trini and Kimberly set up a petition to clean ...,1,Saban
3,Mighty Morphin (Season 1),4,A Pressing Engagement,9/9/1993,6.9,535,Jason is trying to break the bench press recor...,1,Saban
4,Mighty Morphin (Season 1),5,Different Drum,9/10/1993,6.6,516,Kimberly's deaf friend feels out of place in d...,1,Saban
...,...,...,...,...,...,...,...,...,...
60,Mighty Morphin (Season 2),1,The Mutiny: Part I,7/21/1994,8.1,450,Lord Zedd is introduced as the new arch-rival ...,2,Saban
61,Mighty Morphin (Season 2),2,The Mutiny: Part II,7/29/1994,8.0,449,The Rangers' problems are further jeopardized ...,2,Saban
62,Mighty Morphin (Season 2),3,The Mutiny: Part III,8/5/1994,8.1,444,"The Rangers are empowered with new zords, whil...",2,Saban
63,Mighty Morphin (Season 2),4,The Wanna-Be Ranger,9/13/1994,7.4,430,Zordon informs the rangers that he's going to ...,2,Saban


In [9]:
df['desc'][60]

'Lord Zedd is introduced as the new arch-rival of the Power Rangers. Angered by the failure of Rita, he imprisons her in the Dumpster again and sets about defeating the Power Rangers by immobilizing the zords.'

In [12]:
def make_lower(a_string):
    return a_string.lower()

In [13]:
def remove_stopwords(a_string):
    # Break the sentence down into a list of words
    words = word_tokenize(a_string)
    
    # Make a list to append valid words into
    valid_words = []
    
    # Loop through all the words
    for word in words:
        
        # Check if word is not in stopwords
        if word not in stopwords:
            
            # If word not in stopwords, append to our valid_words
            valid_words.append(word)

    # Join the list of words together into a string
    a_string = ' '.join(valid_words)

    return a_string

In [14]:
def stem_words(a_string):
    # Initalize our Stemmer
    porter = PorterStemmer()
    
    # Break the sentence down into a list of words
    words = word_tokenize(a_string)
    
    # Make a list to append valid words into
    valid_words = []

    # Loop through all the words
    for word in words:
        # Stem the word
        stemmed_word = porter.stem(word)
        
        # Append stemmed word to our valid_words
        valid_words.append(stemmed_word)
        
    # Join the list of words together into a string
    a_string = ' '.join(valid_words)

    return a_string 

In [15]:
def convert_pos(pos):
    if pos.startswith('V'):
        return wordnet.VERB
    elif pos.startswith('J'):
        return wordnet.ADJ
    elif pos.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


def lem_with_pos_tag(a_string):
    # Initalize our Lemmer
    lemmatizer = WordNetLemmatizer()
    
    # Break the sentence down into a list of words
    words = word_tokenize(a_string)
    
    # Get the word and pos_tag for each of the words. 
    tagged_words = nltk.pos_tag(words)
    
    # Make a list to append valid words into
    valid_words = []

    # Loop through all the words
    for word in tagged_words:
        
        # The word is the first element in the tuple
        the_word = word[0]
        
        # The pos_tag is the second element in the tuple
        the_pos_tag = word[1]
        
        # Convert the pos_tag into the format the lemmatizer accepts
        the_pos_tag = convert_pos(the_pos_tag)
        
        # Lemmatize the word with the pos_tag
        lemmed_word = lemmatizer.lemmatize(the_word, the_pos_tag)
        
        # Append stemmed word to our valid_words
        valid_words.append(lemmed_word)
        
    # Join the list of words together into a string
    a_string = ' '.join(valid_words)

    return a_string 

In [16]:
def text_pipeline(input_string):
    #input_string = make_lower(input_string)
    #input_string = remove_punctuation(input_string)
    #input_string = lem_with_pos_tag(input_string)
    input_string = remove_stopwords(input_string)    
    return input_string

In [17]:
df['desc_clean'] = df['desc']
df['desc_clean'] = df['desc_clean'].apply(text_pipeline)

In [18]:
doc = nlp(df['desc_clean'][3])
print([(X.text, X.label_) for X in doc.ents])

[('Bulk', 'PERSON'), ('First', 'ORDINAL'), ("Juice Bar 's", 'PERSON'), ('Ernie', 'PERSON'), ('Jason', 'PERSON')]


In [26]:
df['desc_clean'][3]

"Jason trying break bench press record set Bulk , keeps getting distracted unable beat record . First , coach , Juice Bar 's owner Ernie , loses count . Then Kimberly blows enormous bubble bubble gum draws crowd 's attention away Jason , Zack skateboards bumps accident , results gum splattering faces Jason losing concentration . During break attempt break bench press record , Jason admits unable break record shaken ..."

In [30]:
bad_list = {"Juice Bar 's" : "", "Rita traps Power Rangers" : "Rita"}

In [29]:
def sparce(input_string):
    doc = nlp(input_string)
    keep_list = ['PERSON', 'ORG', 'GROUP', 'GPE']
    return_string = ','.join([X.text for X in doc.ents])
    return [(X, X.label_) for X in doc.ents]
sparce(df['desc_clean'][3])

[(Bulk, 'PERSON'),
 (First, 'ORDINAL'),
 (Juice Bar 's, 'PERSON'),
 (Ernie, 'PERSON'),
 (Jason, 'PERSON')]

In [32]:
def sparce(input_string):
    doc = nlp(input_string)
    keep_list = ['PERSON', 'ORG', 'GROUP', 'GPE']
    return_string = ','.join([X.text.strip(" 's") for X in doc.ents if 
                              X.label_ in keep_list and X.text not in bad_list])
#     for x in bad_list:
#         return_string = return_string.replace(x,'')
    return return_string

df['list_of_people'] = df['desc_clean'].apply(lambda X: sparce(X))
df.head()

Unnamed: 0,season_title,episode_num,episode_title,air_date,IMDB_rating,total_votes,desc,season_num,producer,desc_clean,list_of_people
0,Mighty Morphin (Season 1),1,Day of the Dumpster,8/28/1993,7.4,687,Following the accidental release of long-impri...,1,Saban,Following accidental release long-imprisoned e...,"Rita Repulsa,Zordon"
1,Mighty Morphin (Season 1),2,High Five,9/4/1993,6.9,564,Rita plans to trap the Rangers in a time trap ...,1,Saban,Rita plans trap Rangers time trap like one tra...,Zordon
2,Mighty Morphin (Season 1),3,Teamwork,9/8/1993,7.3,546,Trini and Kimberly set up a petition to clean ...,1,Saban,Trini Kimberly set petition clean pollution pl...,"Trini Kimberly,Goldar"
3,Mighty Morphin (Season 1),4,A Pressing Engagement,9/9/1993,6.9,535,Jason is trying to break the bench press recor...,1,Saban,Jason trying break bench press record set Bulk...,"Bulk,Ernie,Jason"
4,Mighty Morphin (Season 1),5,Different Drum,9/10/1993,6.6,516,Kimberly's deaf friend feels out of place in d...,1,Saban,Kimberly 's deaf friend feels place dance clas...,Rita


In [35]:
df['list_of_people'] = df['list_of_people'].apply(lambda x: x.replace(' &', ','))

In [36]:
df.to_csv('PowerR.csv')

In [37]:
dfR = pd.read_csv('PowerR.csv')
dfR

Unnamed: 0.1,Unnamed: 0,season_title,episode_num,episode_title,air_date,IMDB_rating,total_votes,desc,season_num,producer,desc_clean,list_of_people
0,0,Mighty Morphin (Season 1),1,Day of the Dumpster,8/28/1993,7.4,687,Following the accidental release of long-impri...,1,Saban,Following accidental release long-imprisoned e...,"Rita Repulsa,Zordon"
1,1,Mighty Morphin (Season 1),2,High Five,9/4/1993,6.9,564,Rita plans to trap the Rangers in a time trap ...,1,Saban,Rita plans trap Rangers time trap like one tra...,Zordon
2,2,Mighty Morphin (Season 1),3,Teamwork,9/8/1993,7.3,546,Trini and Kimberly set up a petition to clean ...,1,Saban,Trini Kimberly set petition clean pollution pl...,"Trini Kimberly,Goldar"
3,3,Mighty Morphin (Season 1),4,A Pressing Engagement,9/9/1993,6.9,535,Jason is trying to break the bench press recor...,1,Saban,Jason trying break bench press record set Bulk...,"Bulk,Ernie,Jason"
4,4,Mighty Morphin (Season 1),5,Different Drum,9/10/1993,6.6,516,Kimberly's deaf friend feels out of place in d...,1,Saban,Kimberly 's deaf friend feels place dance clas...,Rita
...,...,...,...,...,...,...,...,...,...,...,...,...
915,915,Beast Morphers (Season 2),18,Crunch Time,11/28/2020,9.1,21,"Devon falls victim to a bad influence, which a...",27,Saban Brands,"Devon falls victim bad influence , affects rel...","Devon,Cruise"
916,916,Beast Morphers (Season 2),19,Source Code,12/5/2020,9.7,30,Nate makes a horrifying discovery and must rel...,27,Saban Brands,Nate makes horrifying discovery must rely wisd...,
917,917,Beast Morphers (Season 2),20,Evox Unleashed,12/12/2020,9.4,30,"Evox finally executes his master plan, and the...",27,Saban Brands,"Evox finally executes master plan , Beast Morp...",Beast Morphers Rangers
918,918,Beast Morphers (Season 2),21,Boxed In,4/18/2020,8.1,27,The Pan Global Games have arrived in Coral Har...,27,Saban Brands,"The Pan Global Games arrived Coral Harbor , Ra...","The Pan Global Games,Ranger"


In [23]:
def sparce(input_string):
    doc = nlp(input_string)
    return[(X.text, X.label_) for X in doc.ents]

df['list_of_people'] = df['desc_clean'].apply(lambda X: sparce(X))
#df.head()
df['list_of_people'][3]

[('Bulk', 'PERSON'),
 ('First', 'ORDINAL'),
 ("Juice Bar 's", 'PERSON'),
 ('Ernie', 'PERSON'),
 ('Jason', 'PERSON')]