<h1>Processing tmdb_All_movies.csv (Movies)</h1>

In [2]:
import csv, json, gensim, datetime, time
import nltk
import re
import gensim
import numpy as np
import pandas as pd
from gensim import corpora
from sklearn.feature_extraction.text import TfidfVectorizer

# Parallel accumulators filled while the CSV is read: entry i of every list
# comes from the same input row.
idlist, titlelist = [], []
genreslist, overviewlist, productioncompanieslist = [], [], []

# NLTK resources used by processText: a Porter stemmer and the English stop-word list.
stemmer = nltk.stem.porter.PorterStemmer()
stop_list = nltk.corpus.stopwords.words('english')

def readJSON(row, rownumber, list, keyword):
    """Pull one field out of a JSON-array cell and return it as a space-separated string.

    row[rownumber] is parsed with json.loads and iterated. For every element
    the value stored under `keyword` is lower-cased and has its spaces
    removed; it is kept only when it matches the pattern '^[a-z]+$'.
    Kept values are appended to `list` (the caller's list is modified in
    place) and the full contents of `list` are returned joined by single
    spaces.
    """
    letters_only = re.compile('^[a-z]+$')

    for entry in json.loads(row[rownumber]):
        token = entry[keyword].lower().replace(" ", "")
        if letters_only.search(token) is None:
            continue #value does not match the letters-only pattern
        list.append(token)

    return ' '.join(list)
    
def processText(row, rownumber1, rownumber2, list):
    """Turn two text cells of a row into one string of cleaned, stemmed words.

    row[rownumber1] and row[rownumber2] are joined with a space and tokenized
    with nltk.word_tokenize. Each token is lower-cased; tokens that do not
    match '^[a-z]+$' or that appear in stop_list are dropped, and the
    remaining ones are stemmed with the module-level stemmer. The incoming
    value of `list` is never read.

    Returns the stemmed words joined by single spaces.
    """
    combined = row[rownumber1] + " " + row[rownumber2] #contains overview and tagline
    stems = []
    for token in nltk.word_tokenize(combined):
        word = token.lower()
        if not re.search('^[a-z]+$', word):
            continue #special characters or numbers
        if word in stop_list:
            continue #stop word
        stems.append(stemmer.stem(word)) #root form

    return ' '.join(stems)
    
#read file
# newline='' lets the csv module handle line endings itself, so quoted fields
# containing embedded newlines are read as a single field.
with open('tmdb_All_movies.csv', encoding='utf-8', newline='') as csv_file: #change accordingly
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        try:
            if line_count == 0:
                # first row holds the column headers
                print(f'Column names are {", ".join(row)}')
            else:
                # Defaults are empty STRINGS: every entry of the *list
                # accumulators is later handed to TfidfVectorizer as a
                # document, and readJSON/processText return strings too.
                genres = ''
                productioncompanies = ''

                if row[1] != "":
                    #handle json genres
                    genres = readJSON(row, 1, [], 'name')

                #handle overview (column 6) together with tagline (column 15)
                overview = processText(row, 6, 15, [])

                if row[8] != "":
                    #handle json productioncompanies
                    productioncompanies = readJSON(row, 8, [], 'name')

                # Read the remaining cells before appending anything, so a
                # short row cannot leave the parallel lists with different lengths.
                movie_id = row[3] #id
                title = row[16] #title

                idlist.append(movie_id)
                titlelist.append(title)
                genreslist.append(genres)
                overviewlist.append(overview)
                productioncompanieslist.append(productioncompanies)
        except Exception as e:
            # A row that fails is reported and skipped; none of its values are stored.
            print(f'Row: {line_count} has Exception' + str(e))
        line_count += 1 #counts every row read: header, stored rows and failed rows
    print(f'Processed {line_count} lines.')

def convertToDataframe(listofwords, idlist, titlelist):
    """Build a dense TF-IDF table with one row per document and one column per term.

    Parameters
    ----------
    listofwords : iterable of str
        The documents to vectorize, one string of tokens per row.
    idlist, titlelist : list
        Values stored in the extra 'id' and 'title' columns; each needs one
        entry per document.

    Returns
    -------
    pandas.DataFrame
        TF-IDF weights with the vocabulary terms as flat column labels,
        plus the 'id' and 'title' columns.
    """
    vectorizer = TfidfVectorizer(analyzer='word') #tfidf
    words_tfidf = vectorizer.fit_transform(listofwords) #tfidf

    # get_feature_names() is gone from current scikit-learn releases in
    # favour of get_feature_names_out(); use whichever this install provides.
    feature_names = getattr(vectorizer, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = vectorizer.get_feature_names
    tablecolumns = list(feature_names()) #column headers

    # The names are passed as a flat list: wrapping them in an outer list
    # makes pandas build a one-level MultiIndex instead of plain labels.
    df = pd.DataFrame(words_tfidf.toarray(), columns=tablecolumns) #creating dataframe

    # NOTE: a vocabulary term spelled exactly 'id' or 'title' is overwritten here.
    df['id'] = idlist
    df['title'] = titlelist

    return df

# Vectorize each text feature in turn and preview the first ten rows of each result.
dfgenres = convertToDataframe(listofwords=genreslist, idlist=idlist, titlelist=titlelist)
print(dfgenres.head(n=10))

dfoverview = convertToDataframe(listofwords=overviewlist, idlist=idlist, titlelist=titlelist)
print(dfoverview.head(n=10))

dfproductioncompanies = convertToDataframe(
    listofwords=productioncompanieslist,
    idlist=idlist,
    titlelist=titlelist,
)
print(dfproductioncompanies.head(n=10))

print('\nOutput Success!')

Column names are budget, genres, homepage, id, original_language, original_title, overview, popularity, production_companies, production_countries, release_date, revenue, runtime, spoken_languages, status, tagline, title, vote_average, vote_count
Processed 23580 lines.
  action adventure animation comedy crime documentary drama family fantasy  \
0    0.0       0.0       0.0    0.0   0.0         0.0   0.0    0.0     0.0   
1    0.0       0.0       0.0    0.0   0.0         0.0   0.0    0.0     0.0   
2    0.0       0.0       0.0    0.0   0.0         0.0   0.0    0.0     0.0   
3    0.0       0.0       0.0    0.0   0.0         0.0   0.0    0.0     0.0   
4    0.0       0.0       0.0    0.0   0.0         0.0   1.0    0.0     0.0   
5    0.0       0.0       0.0    0.0   0.0         0.0   0.0    0.0     0.0   
6    0.0       0.0       0.0    0.0   0.0         1.0   0.0    0.0     0.0   
7    0.0       0.0       0.0    0.0   0.0         0.0   0.0    0.0     0.0   
8    0.0       0.0       0.0

<h1>Save DataFrames to Pickle Files</h1>

In [3]:
# Write each movie TF-IDF frame to its own pickle file (relative paths).
dfgenres.to_pickle(path="dfgenres")
dfoverview.to_pickle(path="dfoverview")
dfproductioncompanies.to_pickle(path="dfproductioncompanies")

<h1>Processing tmdb_All_credits.csv (Credits)</h1>

In [1]:
import csv, json, gensim, datetime, time
import nltk
import re
import gensim
import numpy as np
import pandas as pd
from gensim import corpora
from sklearn.feature_extraction.text import TfidfVectorizer

# Parallel accumulators filled while the credits CSV is read: entry i of
# every list comes from the same input row.
idlist, castslist, directorslist = [], [], []

# NLTK resources: a Porter stemmer and the English stop-word list.
stemmer = nltk.stem.porter.PorterStemmer()
stop_list = nltk.corpus.stopwords.words('english')

def readJSON(row, rownumber, list, keyword):
    """Pull one field out of a JSON-array cell and return it as a space-separated string.

    row[rownumber] is parsed with json.loads and iterated. For every element
    the value stored under `keyword` is lower-cased and has its spaces
    removed; no further filtering is applied. The values are appended to
    `list` (the caller's list is modified in place) and the full contents of
    `list` are returned joined by single spaces.
    """
    parsed = json.loads(row[rownumber])
    # Iterating a parsed value that is not a sequence (e.g. a bare number) raises TypeError.
    list.extend(item[keyword].lower().replace(" ", "") for item in parsed)
    return ' '.join(list)

#read top 1k actors and actresses file
topcasts = pd.read_csv("top_actors_actresses.csv", encoding="ISO-8859-1")
namesoftopcasts = topcasts['Name'].values.tolist()
#lower case the names and remove all whitespaces, matching what readJSON does to credit names
processedcastnames = [fullname.lower().replace(" ", "") for fullname in namesoftopcasts]

#read top directors file
topdirectors = pd.read_csv("top_directors.csv", encoding="ISO-8859-1")
namesoftopdirectors = topdirectors['Name'].values.tolist()
#same normalisation for the director names
processeddirectorsnames = [fullname.lower().replace(" ", "") for fullname in namesoftopdirectors]

#read file
# newline='' lets the csv module handle line endings itself, so quoted fields
# containing embedded newlines are read as a single field.
with open('tmdb_All_credits.csv', encoding='utf-8', newline='') as csv_file: #change accordingly
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        try:
            if line_count == 0:
                # first row holds the column headers
                print(f'Column names are {", ".join(row)}')
            else:
                # Defaults are empty STRINGS: every entry of castslist and
                # directorslist is later handed to TfidfVectorizer as a document.
                casts = ''
                directors = ''

                if row[1] != "":
                    #handle json casts
                    casts = readJSON(row, 1, [], 'name')

                #handle json directors
                if row[2] != "":
                    crew = json.loads(row[2]) #loading the json string into a json object
                    # keep crew members whose job is exactly "Director";
                    # names are lower-cased with all whitespaces removed
                    directors = ' '.join(
                        member['name'].lower().replace(" ", "")
                        for member in crew
                        if member['job'] == "Director"
                    )

                idlist.append(row[0]) #id
                castslist.append(casts)
                directorslist.append(directors)
        except Exception as e:
            # A row that fails is reported and skipped; none of its values are stored.
            # NOTE(review): the recorded JSON errors cluster just below character
            # 32767, which suggests cells truncated in the input file - confirm.
            print(f'Row: {line_count} has Exception' + str(e))
        line_count += 1 #counts every row read: header, stored rows and failed rows

    print(f'Processed {line_count} lines.')
                  
def convertToDataframe(listofwords, idlist):
    """Build a dense TF-IDF table with one row per document and one column per term.

    Parameters
    ----------
    listofwords : iterable of str
        The documents to vectorize, one string of tokens per row.
    idlist : list
        Values stored in the extra 'id' column; needs one entry per document.

    Returns
    -------
    pandas.DataFrame
        TF-IDF weights with the vocabulary terms as flat column labels,
        plus the 'id' column.
    """
    vectorizer = TfidfVectorizer(analyzer='word') #tfidf
    words_tfidf = vectorizer.fit_transform(listofwords) #tfidf

    # get_feature_names() is gone from current scikit-learn releases in
    # favour of get_feature_names_out(); use whichever this install provides.
    feature_names = getattr(vectorizer, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = vectorizer.get_feature_names
    tablecolumns = list(feature_names()) #column headers

    # The names are passed as a flat list: wrapping them in an outer list
    # makes pandas build a one-level MultiIndex instead of plain labels.
    df = pd.DataFrame(words_tfidf.toarray(), columns=tablecolumns) #creating dataframe

    # NOTE: a vocabulary term spelled exactly 'id' is overwritten here.
    df['id'] = idlist

    return df

# 'id' is appended so the identifier column survives the column selection below.
processedcastnames.append('id')
processeddirectorsnames.append('id')

dfcasts = convertToDataframe(castslist, idlist)
# Select only the wanted names that actually became columns. Asking .loc for
# labels that do not exist raises KeyError on current pandas (older releases
# returned all-NaN columns with a warning), and a top-list name is absent
# whenever it never occurs in the credits or the vectorizer tokenized it differently.
# get_level_values(0) gives the labels for both flat and one-level MultiIndex columns.
castcolumns = set(dfcasts.columns.get_level_values(0))
dfcasts2 = dfcasts.loc[:, [name for name in processedcastnames if name in castcolumns]]

print(dfcasts2.head(10))

dfdirectors = convertToDataframe(directorslist, idlist)
# Same guarded selection for the director names.
directorcolumns = set(dfdirectors.columns.get_level_values(0))
dfdirectors2 = dfdirectors.loc[:, [name for name in processeddirectorsnames if name in directorcolumns]]

print(dfdirectors2.head(10))

print('\nOutput Success!')



Column names are id, cast, crew
Row: 3482 has ExceptionUnterminated string starting at: line 1 column 32735 (char 32734)
Row: 7899 has Exception'int' object is not iterable
Row: 7900 has Exception'int' object is not iterable
Row: 7901 has Exception'int' object is not iterable
Row: 7902 has Exception'int' object is not iterable
Row: 7903 has Exception'int' object is not iterable
Row: 7904 has Exception'int' object is not iterable
Row: 7905 has Exception'int' object is not iterable
Row: 7906 has Exception'int' object is not iterable
Row: 7907 has Exception'int' object is not iterable
Row: 7908 has Exception'int' object is not iterable
Row: 7909 has Exception'int' object is not iterable
Row: 7910 has Exception'int' object is not iterable
Row: 7911 has Exception'int' object is not iterable
Row: 7912 has Exception'int' object is not iterable
Row: 7913 has Exception'int' object is not iterable
Row: 7914 has Exception'int' object is not iterable
Row: 7915 has Exception'int' object is not iter

Row: 8468 has Exception'int' object is not iterable
Row: 8469 has Exception'int' object is not iterable
Row: 8470 has Exception'int' object is not iterable
Row: 8471 has Exception'int' object is not iterable
Row: 8472 has Exception'int' object is not iterable
Row: 8473 has Exception'int' object is not iterable
Row: 8474 has Exception'int' object is not iterable
Row: 8475 has Exception'int' object is not iterable
Row: 8476 has Exception'int' object is not iterable
Row: 8477 has Exception'int' object is not iterable
Row: 8478 has Exception'int' object is not iterable
Row: 8479 has Exception'int' object is not iterable
Row: 8480 has Exception'int' object is not iterable
Row: 8481 has Exception'int' object is not iterable
Row: 8482 has Exception'int' object is not iterable
Row: 8483 has Exception'int' object is not iterable
Row: 8484 has Exception'int' object is not iterable
Row: 8485 has Exception'int' object is not iterable
Row: 8486 has Exception'int' object is not iterable
Row: 8487 ha

Row: 10210 has ExceptionExpecting ':' delimiter: line 1 column 32760 (char 32759)
Row: 10451 has ExceptionExpecting ',' delimiter: line 1 column 32760 (char 32759)
Row: 10452 has ExceptionUnterminated string starting at: line 1 column 32758 (char 32757)
Row: 10601 has ExceptionExpecting value: line 1 column 32760 (char 32759)
Row: 10604 has ExceptionUnterminated string starting at: line 1 column 32748 (char 32747)
Row: 10808 has ExceptionUnterminated string starting at: line 1 column 32757 (char 32756)
Row: 10867 has ExceptionUnterminated string starting at: line 1 column 32740 (char 32739)
Row: 11001 has ExceptionUnterminated string starting at: line 1 column 32758 (char 32757)
Row: 11311 has ExceptionUnterminated string starting at: line 1 column 32748 (char 32747)
Row: 11316 has ExceptionUnterminated string starting at: line 1 column 32755 (char 32754)
Row: 11319 has ExceptionExpecting value: line 1 column 32758 (char 32757)
Row: 11421 has ExceptionUnterminated string starting at: l

  aamirkhan abdellatifkechiche adambrooks adamelliot adammckay adrianlyne  \
0       0.0                0.0        0.0        0.0       0.0        0.0   
1       0.0                0.0        0.0        0.0       0.0        0.0   
2       0.0                0.0        0.0        0.0       0.0        0.0   
3       0.0                0.0        0.0        0.0       0.0        0.0   
4       0.0                0.0        0.0        0.0       0.0        0.0   
5       0.0                0.0        0.0        0.0       0.0        0.0   
6       0.0                0.0        0.0        0.0       0.0        0.0   
7       0.0                0.0        0.0        0.0       0.0        0.0   
8       0.0                0.0        0.0        0.0       0.0        0.0   
9       0.0                0.0        0.0        0.0       0.0        0.0   

  akirakurosawa alanparker alberthughes alejandroamenábar   ...   wesanderson  \
0           0.0        0.0          0.0               0.0   ...        

<h1>Save DataFrames to Pickle Files</h1>

In [2]:
# Write the filtered cast and director frames to their own pickle files (relative paths).
dfcasts2.to_pickle(path="dfcasts")
dfdirectors2.to_pickle(path="dfdirectors")

<h1>Merging Two Data Frames</h1>

In [4]:
import pandas as pd
import pickle

# Join two frames on their index, convert columns to numeric where possible,
# and pickle the result as "processed_data".
# NOTE(review): `df` and `df2` are not assigned by any cell visible in this
# notebook; unless they are defined elsewhere this cell raises NameError on a
# fresh kernel - confirm which frames are meant to be joined.
# NOTE(review): both suffixes are 'id', so a column name present in both
# frames would get the same suffixed label on each side - verify this is intended.
df3 = df.join(df2, lsuffix='id', rsuffix='id')
df3.head(10) # value is discarded: only the last expression of a cell is displayed

# errors='ignore' asks pd.to_numeric to hand back the input unchanged when it cannot be parsed
df3 = df3.apply(pd.to_numeric, errors='ignore')
df3.dtypes # value is discarded as well

df3.to_pickle("processed_data")

print('\nOutput Success!')  

     budget music drama horror documentary thriller comedy crime western  \
0         0     0     0      0           0        0      0     0       0   
1  17000000     0     0      0           0        0      0     0       0   
2         0     0     0      0           0        0      0     0       0   
3         0     1     0      0           0        0      0     0       0   
4         0     0     1      0           0        0      0     0       0   
5   3500000     0     0      1           0        0      0     0       0   
6         0     0     0      0           1        0      0     0       0   
7         0     0     0      0           0        0      0     0       0   
8         0     0     0      0           0        0      0     0       0   
9         0     0     0      0           0        0      0     0       0   

  romance     ...      esther parobek frans zwartjes naoto takenaka  \
0       0     ...                   0              0              0   
1       0     ...    