Note: Before closing, go to Cell > All Output > Clear to keep file size small.

Also make sure this jupyter notebook file is opened using the following command:

jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000

<h1>Import all libraries</h1>

In [None]:
import csv, json, gensim, datetime, time, random
import nltk
import re
import gensim
import numpy as np
import pandas as pd
from gensim import corpora
from sklearn.feature_extraction.text import TfidfVectorizer

# Raise pandas' display limits (maximum columns / rows shown when a DataFrame is printed).
pd.set_option('display.max_columns', 100000)
pd.set_option('display.max_rows', 100000)

# Shared text-cleaning resources used by the text-processing functions in the cells below.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand -- confirm.
stop_list = nltk.corpus.stopwords.words('english') #creating list of stopwords
stemmer = nltk.stem.porter.PorterStemmer() #stemmer

<h1>Feature Selection (Overview)</h1>

In [None]:
# Accumulator for every cleaned token found in the overview and tagline text.
overviews = []

# Load the movie table and take its overview column, with inf/NaN cells
# replaced by a single space so that every entry can be tokenised as text.
all_df = pd.read_csv("tmdb_All_movies.csv", header=0)
corpuslist = all_df["overview"].replace([np.inf, -np.inf, np.nan], " ")

def processOverview(overview):
    """Tokenise a piece of text and return its cleaned, stemmed words as a list.

    Every token is lower-cased. Tokens that are not made up solely of the
    letters a-z, or that appear in ``stop_list``, are dropped. Each surviving
    token is reduced to its root form with ``stemmer``.
    """
    stems = []
    for token in nltk.word_tokenize(overview):
        lowered = token.lower()
        if not re.search('^[a-z]+$', lowered): # special characters / numbers
            continue
        if lowered in stop_list: # stop word
            continue
        stems.append(stemmer.stem(lowered))

    return stems

# Gather the cleaned tokens of every overview into the shared accumulator.
for overview in corpuslist:
    overviews.extend(processOverview(overview))

# Taglines contribute to the same vocabulary as the overviews.
corpuslist = all_df["tagline"]
corpuslist = corpuslist.replace([np.inf, -np.inf, np.nan], " ") #removing infinite/nan values

for tagline in corpuslist:
    overviews.extend(processOverview(tagline))

corpuslist = overviews

#The below code is for the feature set definition. We are using only top 5000 words as our features
fdist = nltk.FreqDist(w.lower() for w in corpuslist)

# The frequency table holds one entry per distinct word, so its length is the
# vocabulary size; no manual counting loop is needed.
totaluniquewords = len(fdist)
print("Total Unique Words:", totaluniquewords)

datasize = 5000

# Ask for the top `datasize` entries directly instead of sorting the whole
# vocabulary and slicing it afterwards; only the words (not the counts) are kept.
mostcommonwords = [word for word, _count in fdist.most_common(datasize)] #top 5k

print("Total Most Common Words:", len(mostcommonwords))
print(mostcommonwords[:10])

<h1>Feature Selection (Production Companies)</h1>

In [None]:
# Reload the movie table and take the raw production_companies column; each cell is
# later parsed as a JSON string by readJSON.
# NOTE(review): unlike the overview/tagline columns, missing values are not replaced
# here -- confirm the column never contains empty cells.
all_df = pd.read_csv("tmdb_All_movies.csv", header = 0)
corpuslist = all_df["production_companies"]

# Accumulates one normalised name per company occurrence across all movies.
productioncompanies = []

def readJSON(company, keyword):
    """Extract normalised values of *keyword* from a JSON-encoded list of objects.

    Args:
        company: JSON text holding a list of objects (one cell of the
            ``production_companies`` column).
        keyword: Key to read from every object, e.g. ``'name'``.

    Returns:
        A list with one entry per object whose value, once lower-cased and
        stripped of spaces, consists solely of the letters a-z. Missing input
        (a non-text value such as NaN, or a blank string) yields an empty list.

    Raises:
        json.JSONDecodeError: If *company* is non-blank text that is not valid JSON.
        KeyError: If an object has no *keyword* entry.
    """
    # pandas hands over empty CSV cells as float NaN, which json.loads rejects
    # with a TypeError; a blank string is not valid JSON either. Both simply
    # mean "no companies listed".
    if not isinstance(company, (str, bytes, bytearray)) or not company.strip():
        return []

    names = []
    for jsonelement in json.loads(company): #loading the json string into a json object
        keywordelement = jsonelement[keyword] #getting each word out
        keywordelement = keywordelement.lower().replace(" ", "") #lower case the words, removing all whitespaces
        if re.search('^[a-z]+$', keywordelement): # keep purely alphabetic names only
            names.append(keywordelement)

    return names

# Gather the normalised company names of every movie into the shared accumulator.
for company in corpuslist:
    productioncompanies.extend(readJSON(company, 'name'))

corpuslist = productioncompanies

#The below code is for the feature set definition. We are using only top 1000 companies as our features
fdist2 = nltk.FreqDist(w.lower() for w in corpuslist)

# The frequency table holds one entry per distinct company, so its length is
# the number of unique companies; no manual counting loop is needed.
totaluniquecompanies = len(fdist2)
print("Total Unique Companies:", totaluniquecompanies)

datasize = 1000

# Ask for the top `datasize` entries directly instead of sorting every company
# and slicing afterwards; only the names (not the counts) are kept.
mostcommoncompanies = [name for name, _count in fdist2.most_common(datasize)] #top 1k

print("Total Most Common Companies:", len(mostcommoncompanies))
print(mostcommoncompanies[:10])

<h1>Processing tmdb_All_movies.csv (Movies)</h1>

In [None]:
# Parallel per-movie accumulators: the CSV loop below appends one entry to each
# of them for every successfully processed row.
idlist, titlelist, genreslist = [], [], []
overviewlist, productioncompanieslist = [], []

def readJSON(row, rownumber, list, keyword):
    """Collect normalised *keyword* values from the JSON cell ``row[rownumber]``.

    The cell is parsed as a JSON list of objects. Each object's *keyword*
    value is lower-cased and stripped of spaces, and kept only when it then
    consists solely of the letters a-z. For column 8 (production companies)
    the value must additionally appear in ``mostcommoncompanies``.

    Kept values are appended to *list* in place; the return value is the
    content of *list* joined with single spaces.
    """
    company_column = rownumber == 8 # column 8 holds production companies, otherwise genres

    for entry in json.loads(row[rownumber]): #loading the json string into a json object
        token = entry[keyword].lower().replace(" ", "")
        if not re.search('^[a-z]+$', token):
            continue
        if company_column and token not in mostcommoncompanies: #selecting top 1k companies as our features
            continue
        list.append(token)

    return ' '.join(list) #changing list into a string
    
def processText(row, rownumber1, rownumber2, list):
    """Return the cleaned feature words of two text cells as one string.

    ``row[rownumber1]`` and ``row[rownumber2]`` (overview and tagline) are
    joined with a space and tokenised. Tokens are lower-cased; those that are
    not purely a-z or that appear in ``stop_list`` are dropped; the rest are
    stemmed, and only stems present in ``mostcommonwords`` are kept. The kept
    stems are returned joined with single spaces.

    The *list* argument is accepted for call compatibility but not read.
    """
    combined = row[rownumber1] + " " + row[rownumber2] #contains overview and tagline

    kept = []
    for token in nltk.word_tokenize(combined):
        lowered = token.lower()
        if not re.search('^[a-z]+$', lowered) or lowered in stop_list:
            continue # special characters, numbers and stop words are skipped
        stem = stemmer.stem(lowered)
        if stem in mostcommonwords: #selecting top 5k words as our features
            kept.append(stem)

    return ' '.join(kept) #changing list into a string
    
#read file
# newline='' is what the csv module asks for when it is handed a file object:
# it lets the reader interpret line endings itself, so quoted fields that
# contain line breaks are parsed correctly.
with open('tmdb_All_movies.csv', encoding='utf-8', newline='') as csv_file: #change accordingly
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        try:
            if line_count == 0:
                print(f'Column names are {", ".join(row)}')
            else:
                # An empty cell becomes an empty string (not an empty list): these
                # values are later passed to TfidfVectorizer, which works on text.

                #handle json genres
                genres = readJSON(row, 1, [], 'name') if row[1] != "" else ""

                #handle overview
                overview = processText(row, 6, 15, [])

                #handle json productioncompanies
                productioncompanies = readJSON(row, 8, [], 'name') if row[8] != "" else ""

                # Read every remaining cell before touching the accumulators, so a
                # short or malformed row is skipped as a whole and the five
                # parallel lists always stay the same length.
                movieid = row[3] #id
                title = row[16] #title

                idlist.append(movieid)
                titlelist.append(title)
                genreslist.append(genres)
                overviewlist.append(overview)
                productioncompanieslist.append(productioncompanies)
        except Exception as e:
            # Best effort: report the offending row and carry on with the next one.
            print(f'Row: {line_count} has Exception' + str(e))
        line_count += 1 # counts the header and every data row, including failed ones
    print(f'Processed {line_count} lines.')

def convertToDataframe(listofwords, idlist, titlelist):
    """Build a TF-IDF table for *listofwords* and attach the movie ids and titles.

    Args:
        listofwords: One space-separated token string per movie.
        idlist: Movie ids, in the same order as *listofwords*.
        titlelist: Movie titles, in the same order as *listofwords*.

    Returns:
        A DataFrame with one TF-IDF column per vocabulary term, plus an
        ``'id'`` and a ``'title'`` column.
    """
    vectorizer = TfidfVectorizer(analyzer='word') #tfidf
    words_tfidf = vectorizer.fit_transform(listofwords) #tfidf

    # get_feature_names() no longer exists in current scikit-learn releases;
    # use its replacement when available and fall back for older installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        featurenames = vectorizer.get_feature_names_out().tolist()
    else:
        featurenames = vectorizer.get_feature_names()

    # The names stay wrapped in an outer list, as before, so the column index of
    # the returned frame keeps the structure existing consumers already see.
    tablecolumns = [featurenames] #adding column headers

    df = pd.DataFrame(words_tfidf.toarray(), columns=tablecolumns) #creating dataframe

    df['id'] = idlist
    df['title'] = titlelist

    return df

# Build one TF-IDF table per feature group (genres, overview text, production
# companies) from the lists filled by the CSV loop, previewing the first 10 rows of each.
dfgenres = convertToDataframe(genreslist, idlist, titlelist)
print(dfgenres.head(10))
                      
dfoverview = convertToDataframe(overviewlist, idlist, titlelist)
print(dfoverview.head(10))
                      
dfproductioncompanies = convertToDataframe(productioncompanieslist, idlist, titlelist)
print(dfproductioncompanies.head(10)) 

print('\nOutput Success!')                 

<h1>Save DataFrames to Pickle Files</h1>

In [None]:
# Persist each movie feature table as a pickle file under its fixed name.
for frame, picklename in (
    (dfgenres, "dfgenres"),
    (dfoverview, "dfoverview"),
    (dfproductioncompanies, "dfproductioncompaniesmostcommon"),
):
    frame.to_pickle(picklename)

<h1>Processing tmdb_All_credits.csv (Credits)</h1>

In [None]:
# Parallel per-movie accumulators: the credits loop below appends one entry to
# each of them for every successfully processed row.
idlist, castslist, directorslist = [], [], []

def readJSON(row, rownumber, list, keyword):
    """Collect normalised *keyword* values from the JSON cell ``row[rownumber]``.

    The cell is parsed as a JSON list of objects. Each object's *keyword*
    value is lower-cased, stripped of spaces and appended to *list* in place.
    The return value is the content of *list* joined with single spaces.
    """
    parsed = json.loads(row[rownumber]) #loading the json string into a json object

    for entry in parsed:
        raw = entry[keyword] #getting each word out
        list.append(raw.lower().replace(" ", "")) #lower case the words, removing all whitespaces

    return ' '.join(list) #changing list into a string

#read top 1k actors and actresses file
topcasts = pd.read_csv("top_actors_actresses.csv", encoding="ISO-8859-1")
namesoftopcasts = topcasts['Name'].values.tolist()
# Normalise the names the same way the credits are normalised: lower case, no spaces.
processedcastnames = [castname.lower().replace(" ", "") for castname in namesoftopcasts]

#read top directors file
topdirectors = pd.read_csv("top_directors.csv", encoding="ISO-8859-1")
namesoftopdirectors = topdirectors['Name'].values.tolist()
# Same normalisation for the director names.
processeddirectorsnames = [directorname.lower().replace(" ", "") for directorname in namesoftopdirectors]

#read file
# newline='' is what the csv module asks for when it is handed a file object:
# it lets the reader interpret line endings itself, so quoted fields that
# contain line breaks are parsed correctly.
with open('tmdb_All_credits.csv', encoding='utf-8', newline='') as csv_file: #change accordingly
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        try:
            if line_count == 0:
                print(f'Column names are {", ".join(row)}')
            else:
                # An empty cell becomes an empty string (not an empty list): these
                # values are later passed to TfidfVectorizer, which works on text.

                #handle json casts
                casts = readJSON(row, 1, [], 'name') if row[1] != "" else ""

                #handle json directors
                directors = ""
                if row[2] != "":
                    # Of all crew members only those whose job is "Director" are
                    # kept; names are lower-cased and stripped of spaces.
                    directors = ' '.join(
                        member['name'].lower().replace(" ", "")
                        for member in json.loads(row[2])
                        if member['job'] == "Director"
                    )

                # Append only after everything above succeeded, so the three
                # parallel lists always stay the same length.
                idlist.append(row[0]) #id
                castslist.append(casts)
                directorslist.append(directors)
        except Exception as e:
            # Best effort: report the offending row and carry on with the next one.
            print(f'Row: {line_count} has Exception' + str(e))
        line_count += 1 # counts the header and every data row, including failed ones

    print(f'Processed {line_count} lines.')
                  
def convertToDataframe(listofwords, idlist):
    """Build a TF-IDF table for *listofwords* and attach the movie ids.

    Args:
        listofwords: One space-separated token string per movie.
        idlist: Movie ids, in the same order as *listofwords*.

    Returns:
        A DataFrame with one TF-IDF column per vocabulary term, plus an
        ``'id'`` column.
    """
    vectorizer = TfidfVectorizer(analyzer='word') #tfidf
    words_tfidf = vectorizer.fit_transform(listofwords) #tfidf

    # get_feature_names() no longer exists in current scikit-learn releases;
    # use its replacement when available and fall back for older installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        featurenames = vectorizer.get_feature_names_out().tolist()
    else:
        featurenames = vectorizer.get_feature_names()

    # The names stay wrapped in an outer list, as before, so the column index of
    # the returned frame keeps the structure existing consumers already see.
    tablecolumns = [featurenames] #adding column headers

    df = pd.DataFrame(words_tfidf.toarray(), columns=tablecolumns) #creating dataframe

    df['id'] = idlist

    return df

# 'id' is added to both name lists so the id column is part of the column selections below.
processedcastnames.append('id')
processeddirectorsnames.append('id')
                      
# TF-IDF over every movie's cast string, then narrowed to the top-cast columns (plus 'id').
# NOTE(review): selecting labels that are absent from the columns may raise KeyError on
# recent pandas versions -- confirm every listed name actually occurs in the credits data.
dfcasts = convertToDataframe(castslist, idlist)
dfcasts2 = dfcasts.loc[:, processedcastnames]

print(dfcasts2.head(10))
                      
# Same procedure for the directors.
dfdirectors = convertToDataframe(directorslist, idlist)
dfdirectors2 = dfdirectors.loc[:, processeddirectorsnames]

print(dfdirectors2.head(10))                        

print('\nOutput Success!')                  

<h1>Save DataFrames to Pickle Files</h1>

In [None]:
# Persist the filtered cast and director tables as pickle files under their fixed names.
for frame, picklename in ((dfcasts2, "dfcasts"), (dfdirectors2, "dfdirectors")):
    frame.to_pickle(picklename)