In [147]:
import pandas as pd
import numpy as np
import os
import re
import tensorflow as tf
import sklearn
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.layers import Activation, Dense, Dropout, Embedding, LSTM

In [2]:
#Function for pre-processing text to take out common words
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Preprocessing steps
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    # Tokenization
    words = word_tokenize(text)
    # Removing punctuation and converting to lowercase
    words = [word.lower() for word in words if word.isalnum()]
    # Removing stopwords
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Skynet\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Skynet\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Skynet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Read the CSV file into a DataFrame
df = pd.read_csv('data_cleaned.csv')
df.dropna(subset=['Quote'], inplace=True)
# Display the first few rows of the DataFrame
df

Unnamed: 0,Movie Name,Actor Name,Character,Quote
0,Murderland,Ken Kushner,Teddy,Choose the doorway that starts you on your mag...
1,Murderland,Ken Kushner,Teddy,I didn't think he'd make it past Scraps.
2,Murderland,Ken Kushner,Teddy,Let's just see if he can make it into round tw...
3,Murderland,Ken Kushner,Teddy,Don't.
4,Murderland,Ken Kushner,Teddy,What ?
...,...,...,...,...
676435,Zulu Dawn,Burt Lancaster,Col. Durnford,MELVILL ride over this obstacle the Colours a...
676436,Zulu Dawn,Burt Lancaster,Col. Durnford,"For God's sake, hold them back! I'll get the h..."
676437,Zulu Dawn,Burt Lancaster,Col. Durnford,It's alright It's alright.
676438,Zulu Dawn,Burt Lancaster,Col. Durnford,hold the Colours aloft mockingly. Some ZULUS a...


In [144]:
#Define the top actors
actors_vc=df['Actor Name'].value_counts()
actors=actors_vc.index.to_list()
top_actors=actors[0:50]
print(actors_vc)
print(top_actors)


Robert De Niro       7448
Gene Hackman         6640
Nicolas Cage         5240
Jack Nicholson       4897
Denzel Washington    4661
                     ... 
Maurice Schutz        155
Corey Feldman         138
Victor Jory           105
Julie Delpy            98
Chad Everett           98
Name: Actor Name, Length: 668, dtype: int64
['Robert De Niro', 'Gene Hackman', 'Nicolas Cage', 'Jack Nicholson', 'Denzel Washington', 'Meg Ryan', 'Anthony Hopkins', 'Robert Duvall', 'Morgan Freeman', 'Brad Pitt', 'Michael Caine', 'George Clooney', 'Tom Wilkinson', 'Harvey Keitel', 'Arnold Schwarzenegger', 'Sigourney Weaver', 'Kurt Russell', 'Jeff Bridges', 'Tommy Lee Jones', 'Jason Bateman', 'Albert Brooks', 'Dan Aykroyd', 'DeForest Kelley', 'Susan Sarandon', 'Clint Eastwood', 'Michael Douglas', 'Christopher Plummer', 'Kevin Bacon', 'Robin Williams', 'William H. Macy', 'Stellan Skarsgård', 'Joe Don Baker', 'Cloris Leachman', 'Christopher Walken', 'Joaquin Phoenix', 'John Travolta', 'Jim Carrey', 'Catherin

robert de niro       7302
gene hackman         6508
nicolas cage         5131
jack nicholson       4817
denzel washington    4590
                     ... 
will smith            152
corey feldman         133
victor jory           105
julie delpy            97
chad everett           93
Name: Actor Name, Length: 668, dtype: int64
['robert de niro', 'gene hackman', 'nicolas cage', 'jack nicholson', 'denzel washington', 'meg ryan', 'anthony hopkins', 'robert duvall', 'morgan freeman', 'brad pitt', 'michael caine', 'george clooney', 'tom wilkinson', 'arnold schwarzenegger', 'harvey keitel', 'sigourney weaver', 'kurt russell', 'jeff bridges', 'tommy lee jones', 'jason bateman', 'albert brooks', 'deforest kelley', 'dan aykroyd', 'susan sarandon', 'clint eastwood', 'michael douglas', 'christopher plummer', 'kevin bacon', 'robin williams', 'william h. macy']


In [12]:
#Create separate dataframe with only top actors
df_top_actors=df.loc[df["Actor Name"].isin(top_actors)]
df_top_actors['Actor Name'].value_counts()

Robert De Niro           7448
Gene Hackman             6640
Nicolas Cage             5240
Jack Nicholson           4897
Denzel Washington        4661
Meg Ryan                 4609
Anthony Hopkins          4166
Robert Duvall            4107
Morgan Freeman           4106
Brad Pitt                3931
Michael Caine            3810
George Clooney           3703
Tom Wilkinson            3551
Arnold Schwarzenegger    3445
Harvey Keitel            3445
Sigourney Weaver         3326
Kurt Russell             3289
Jeff Bridges             3228
Tommy Lee Jones          3101
Jason Bateman            3019
Albert Brooks            2986
Dan Aykroyd              2968
DeForest Kelley          2966
Susan Sarandon           2965
Clint Eastwood           2904
Michael Douglas          2859
Christopher Plummer      2846
Kevin Bacon              2809
Robin Williams           2758
William H. Macy          2741
Name: Actor Name, dtype: int64

In [148]:
# Apply preprocessing to the 'Quote' column of top actors dataframe
df_top_actors['Quote'] = df_top_actors['Quote'].apply(preprocess_text)
df_top_actors.dropna(subset=['Quote'], inplace=True)
df_top_actors=df_top_actors.reset_index(drop='True')
# preprocessed DataFrame value counts
df_top_actors.head()

Unnamed: 0,Movie Name,Actor Name,Character,Quote,Quote Length
0,15 Minutes,Robert De Niro,Detective Eddie Flemming,say thing say open mouth,5
1,15 Minutes,Robert De Niro,Detective Eddie Flemming,fool around,2
2,15 Minutes,Robert De Niro,Detective Eddie Flemming,hear said,2
3,15 Minutes,Robert De Niro,Detective Eddie Flemming,want document trip america,4
4,15 Minutes,Robert De Niro,Detective Eddie Flemming,next could see document please,5


In [150]:
#Tokenize and pad the quote word sequences
tokenizer = Tokenizer(num_words=10000)  # Adjust based on your vocabulary size
tokenizer.fit_on_texts(df_top_actors['Quote'])  # texts should be your input data
sequences = tokenizer.texts_to_sequences(df_top_actors['Quote'])  # Convert text to integer sequences
X_count = pad_sequences(sequences, maxlen=max_quote_length)
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

[[20, 25, 20, 139, 392],
 [782, 59],
 [107, 44],
 [6, 2113, 719, 687],
 [147, 24, 12, 2113, 72],
 [4897, 1473, 1093, 1043, 327],
 [41, 191, 2957],
 [63, 76, 1017],
 [309],
 [150, 95, 609, 821],
 [5132, 213],
 [72, 693, 22, 13, 827],
 [176],
 [5132, 213, 6, 66, 213, 269, 17, 11, 2113, 3042],
 [46, 98],
 [36, 4312, 1261, 1230, 1261, 1, 5, 111],
 [],
 [91, 1446, 152],
 [41, 191],
 [3, 436, 36],
 [436, 939],
 [436, 939],
 [315, 216, 1446, 152, 41, 191],
 [87, 550],
 [46, 87, 550, 1446, 152, 87, 550],
 [1708, 903],
 [170],
 [407],
 [407, 407, 12, 407],
 [46,
  52,
  12,
  407,
  284,
  207,
  567,
  47,
  5133,
  226,
  64,
  268,
  6,
  13,
  687,
  772,
  433,
  83,
  2114,
  772,
  273,
  62,
  91,
  329],
 [120, 750, 123, 750, 68, 16, 4153],
 [10, 4, 7527, 4481, 63, 76, 125, 125, 9, 2385, 183, 170, 192],
 [11, 91, 295, 4, 106, 162, 17, 5134],
 [3220,
  190,
  2159,
  4,
  1210,
  626,
  1000,
  642,
  4898,
  294,
  7,
  20,
  1447,
  2,
  2806,
  496,
  96,
  7,
  2,
  2806,
  2879,
  

In [71]:
# Encode the target labels and save classes as separate file
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df_top_actors['Actor Name'])
np.save('classes.npy', label_encoder.classes_)

In [47]:
#Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_count, y_encoded, test_size=0.2, random_state=42)


In [72]:
# Initialize the model
e = Embedding(10000, 64, input_length=max_quote_length)
batch_size=64
model = Sequential()
model.add(e)
model.add(LSTM( 128 , dropout = 0.25, recurrent_dropout = 0.25))
model.add(Dense(30, activation = 'softmax' ))
model.summary()
model.compile( optimizer = "adam" , loss = 'sparse_categorical_crossentropy' , metrics = ['accuracy'] )
#define callbacks
early_stopper = EarlyStopping( monitor = 'accuracy' , min_delta = 0.0005, patience = 3 )
reduce_lr = ReduceLROnPlateau( monitor = 'loss' , patience = 2 , cooldown = 0)
callbacks = [ reduce_lr , early_stopper]
#fit and evaluate model
model.fit( X_train , y_train , batch_size = batch_size, epochs = 10, verbose = 1 , callbacks = callbacks)
score = model.evaluate( X_test , y_test , batch_size = batch_size)

print( "Accuracy: {:0.4}".format( score[1] ))

print( "Loss:", score[0] )



Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 43, 64)            640000    
                                                                 
 lstm_5 (LSTM)               (None, 128)               98816     
                                                                 
 dense_5 (Dense)             (None, 30)                3870      
                                                                 
Total params: 742,686
Trainable params: 742,686
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.3116
Loss: 2.9269442558288574


In [73]:
# Make predictions
y_pred_all = model.predict(X_test)
y_pred=[]
for i in y_pred_all:
    i=list(i)
    y_pred.append(i.index(max(i)))
print(y_test)
print(y_pred)
# Calculate accuracy
accuracy = accuracy_score(list(y_test), y_pred)
print(f"Accuracy: {accuracy:.2f}")

[15 23 11 ...  8  2 14]
[21, 9, 27, 9, 22, 18, 6, 21, 8, 9, 7, 9, 22, 21, 3, 14, 10, 22, 26, 22, 8, 21, 22, 21, 22, 20, 7, 7, 1, 15, 7, 29, 22, 9, 4, 1, 22, 13, 3, 5, 0, 2, 21, 24, 8, 14, 17, 9, 20, 12, 29, 17, 12, 9, 22, 1, 21, 22, 23, 11, 9, 21, 1, 22, 10, 5, 24, 1, 0, 2, 22, 11, 9, 23, 2, 0, 9, 9, 23, 3, 16, 25, 1, 19, 26, 14, 3, 11, 17, 9, 17, 10, 20, 15, 11, 22, 8, 14, 0, 21, 26, 8, 2, 19, 11, 24, 25, 20, 25, 4, 6, 8, 3, 23, 22, 22, 13, 12, 9, 12, 13, 17, 21, 0, 16, 11, 18, 17, 22, 12, 9, 6, 12, 22, 24, 9, 19, 29, 22, 21, 9, 26, 13, 25, 24, 12, 1, 15, 9, 23, 9, 0, 15, 8, 3, 21, 1, 21, 9, 22, 22, 29, 18, 13, 21, 12, 14, 8, 25, 1, 17, 6, 22, 22, 20, 9, 22, 25, 2, 22, 5, 22, 8, 12, 23, 8, 17, 22, 16, 18, 22, 25, 26, 15, 14, 21, 2, 28, 22, 23, 22, 5, 12, 17, 22, 19, 27, 24, 9, 19, 21, 27, 17, 21, 14, 16, 1, 24, 16, 16, 0, 21, 9, 0, 24, 5, 6, 16, 19, 2, 17, 17, 8, 20, 8, 25, 22, 9, 21, 17, 4, 14, 24, 21, 8, 12, 23, 17, 8, 21, 20, 22, 27, 28, 21, 12, 29, 28, 22, 25, 5, 22, 15, 16, 10, 2

In [151]:
#Program for predicting actor from quote using the model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

def ActorPredictor(phrase):
    phrase_ls=[]
    for i in phrase:
        phrase_ls.append(preprocess_text(i))
    tokenizer = Tokenizer(num_words=10000)  # Adjust based on your vocabulary size
    #tokenizer.fit_on_texts(df_top_actors['Quote'])  # texts should be your input data
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    sequences = tokenizer.texts_to_sequences(phrase_ls)  # Convert text to integer sequences
    phrase_count = pad_sequences(sequences, maxlen=max_quote_length)
    actors_pred_all = model.predict(phrase_count)
    actors_pred=[]
    for i in actors_pred_all:
        i=list(i)
        actors_pred.append(i.index(max(i)))
    return label_encoder.inverse_transform(actors_pred)
    #return sequences

In [156]:
#Sample implementation of the program
ActorPredictor(["Andy is the best","I'll be back","hold colour aloft mockingly zulu wearing","human life murder film statute limitation","Mada is sleepy"])




array(['Morgan Freeman', 'Meg Ryan', 'DeForest Kelley', 'Robert De Niro',
       'DeForest Kelley'], dtype=object)