In [None]:
# Bring in my dependencies

import pandas as pd
import os, csv
import numpy as np
from sqlalchemy import create_engine
from itertools import chain

In [None]:
# Helper function (adapted from an example I found online) that splits every value
# of a series on ', ' and chains all of the resulting pieces into one flat list.
# I will be using this function to break the cast series out into one actor per row

def chainer(s):
    """Split every string in the series `s` on ', ' and flatten the pieces.

    Parameters
    ----------
    s : pandas.Series
        Series of strings; every value must be a string (no missing values),
        otherwise iterating over the split result fails.

    Returns
    -------
    list
        All pieces, in the original row order, as one flat list.
    """
    return [piece for pieces in s.str.split(', ') for piece in pieces]

In [None]:
# Load the Netflix titles csv into a pandas dataframe, then discard every row
# that has a NULL value in any of its columns

netflix_data = os.path.join('Resources', 'netflix', 'netflix_titles.csv')
netflix = pd.read_csv(netflix_data)
netflix_df = pd.DataFrame(netflix).dropna()

In [None]:
# Stage 1: copy only the series that I need out of the source dataframe

netflix_columns = ['show_id', 'type', 'title', 'director', 'cast', 'release_year', 'description']
cleaned_stage1_netflix = netflix_df[netflix_columns].copy()

# Number of cast members per row (the cast string split on ', ').
# This tells np.repeat how many times each original row has to be repeated.

lens = cleaned_stage1_netflix['cast'].str.split(', ').map(len)

# Stage 2: one row per (title, cast member). Every series except cast is repeated
# once per cast member, and cast itself is replaced by the flattened list of names.
# i.e. The Departed - leo dicaprio, matt damon, jack nicholson - becomes three rows
# with the departed as the title - one for Leo, one for Matt, and one for Jack

stage2_columns = {}
for column_name in netflix_columns:
    if column_name == 'cast':
        # the exploded cast takes the place of the original cast series
        stage2_columns['full_name'] = chainer(cleaned_stage1_netflix['cast'])
    else:
        stage2_columns[column_name] = np.repeat(cleaned_stage1_netflix[column_name], lens)

cleaned_stage2_netflix = pd.DataFrame(stage2_columns)

In [None]:
# Further cleaning of my dataframe, stripping the value of blanks in title and full_name
# and putting them both into title case.
#
# I work on a copy of stage 2 here: with a plain assignment both names would point at
# the same dataframe, stage 2 would be modified in place, and re-running this cell
# would fail because the 'id' series inserted below would already exist.

cleaned_stage3_netflix = cleaned_stage2_netflix.copy()
cleaned_stage3_netflix['title'] = cleaned_stage3_netflix['title'].str.strip().str.title()
cleaned_stage3_netflix['full_name'] = cleaned_stage3_netflix['full_name'].str.strip().str.title()

# Here, I am creating a unique id for the pairing of the movie and actor/actress by
# generating a numbered range (0, 1, 2, ...) with one value per row of the dataframe

ids = list(range(len(cleaned_stage3_netflix)))

# Next, I insert the ids into the 0 position so that they will be placed in front
# of all other values
cleaned_stage3_netflix.insert(0, 'id', ids)

# renaming dataframe for my personal use (makes my life a bit easier)

final_netflix_df = cleaned_stage3_netflix

In [None]:
# Import in my SAG awards data file, and turn it into a dataframe

sag_data = os.path.join('Resources','SAG','screen_actor_guild_awards.csv')
sag = pd.read_csv(sag_data)
sag_df = pd.DataFrame(sag)

# Need to slice the string values for a given award ceremony year by splitting the strings
# by a ' ' and then selecting the first value of each string
#
# had values like '2020 SAG Awards in Long Beach' and returned the value '2020'
#
# The split is done once for the whole series and covers EVERY row, so the list has
# exactly one entry per row of sag_df (a row-by-row loop over len - 1 rows would leave
# the last award without a year). The result is saved to a list so that I can insert
# it back into the dataframe.

years = sag_df['year'].str.split(' ').str[0].tolist()

In [None]:
# More transformations on the sag df
#
# First, attach the cleaned years list to the dataframe as a new series

sag_df['years_cleaned'] = pd.Series(years)

# Copy only the series that I need for my data merge into a new dataframe

sag_columns = ['years_cleaned', 'category', 'full_name', 'show', 'won']
cleaned_stage1_sag = sag_df.loc[:, sag_columns].copy()

# Drop rows with NULL values, then reset the index so that the old index becomes an
# 'index' series that gives me a unique id value for each of the awards.
# The actor/actress names and the show names are title cased and stripped so that
# they match the netflix dataframe.

cleaned_stage2_sag = (
    cleaned_stage1_sag
    .dropna()
    .reset_index()
    .assign(
        full_name=lambda frame: frame['full_name'].str.title().str.strip(),
        show=lambda frame: frame['show'].str.title().str.strip(),
    )
)

# Rename the series and set the index so that both match the values in my SQL tables

sag_renames = {'years_cleaned': 'year', 'index': 'id', 'show': 'title'}
cleaned_stage3_sag = cleaned_stage2_sag.rename(columns=sag_renames)
final_sag_df = cleaned_stage3_sag.set_index('id')

In [None]:
# Both dataframes are ready, so I merge them on title to find the shows that
# appear in both the Netflix data and the SAG data

merge_test = final_netflix_df.merge(final_sag_df, on='title')

# Of the matched titles, keep only the rows where the actor/actress name from the
# Netflix data (full_name_x) is exactly the same as the name from the SAG data (full_name_y)

same_person = merge_test['full_name_x'] == merge_test['full_name_y']
merge_test = merge_test.loc[same_person]

# Take only the series that I need for my SQL tables, then rename full_name_x
# back to full_name

sql_columns = [
    'id', 'show_id', 'type', 'director', 'full_name_x',
    'title', 'release_year', 'description', 'category', 'won',
]
merge_test_stage2 = merge_test[sql_columns]
merge_test_stage3 = merge_test_stage2.rename(columns={'full_name_x': 'full_name'})

In [None]:
# Now that I have completed my source dataframe creation by merging and cleaning my Netflix
# and SAG data, I am ready to export it to a csv and move on to creating the dataframes
# that will be used to populate my SQL tables.
#
# The path is built with os.path.join like the input paths, and the output folder is
# created first so the export also works when 'cleaned_merged' does not exist yet.

output_path = os.path.join('Resources', 'cleaned_merged', 'cleaned_netflix_sag_data.csv')
os.makedirs(os.path.dirname(output_path), exist_ok=True)
merge_test_stage3.to_csv(output_path)