# Source and Pandas

In [9]:
import pandas as pd

In [10]:
# location of the filtered dataset csv (a relative path, so it is resolved
# against the directory the notebook is run from)
dataset_path = "data/filteredData/dataset.csv"

In [11]:
# load the full dataset from the csv at `dataset_path` into one dataframe
dataset = pd.read_csv(
    dataset_path,
    sep=",",
    low_memory=False,  # let pandas parse the file in one pass, not in chunks
)

# Categorical Link Data

In [12]:
# general decade graphs where tconst ---> nconst
# rows are assigned to a decade by startYear: 1970 <= year < 1980 and
# 1980 <= year < 1990
start_year = dataset['startYear']
in_70s = start_year.ge(1970) & start_year.lt(1980)
in_80s = start_year.ge(1980) & start_year.lt(1990)

# only the title id and the person id are kept for the link tables
link_columns = ["tconst", "nconst"]
links_70s_tconst_nconst = dataset.loc[in_70s].filter(items=link_columns)
links_80s_tconst_nconst = dataset.loc[in_80s].filter(items=link_columns)

In [13]:
# the credit categories that each get their own graph
categories = {'cinematographer', 'archive_footage', 'self', 'composer', 'writer', 'actress', 'director', 'editor',
              'producer', 'archive_sound', 'actor', 'production_designer'}

# decade masks on startYear, computed once and reused for every category
start_year = dataset['startYear']
in_70s = (start_year >= 1970) & (start_year < 1980)
in_80s = (start_year >= 1980) & (start_year < 1990)

# one tconst/nconst dataframe per category and decade; both lists are filled
# in the iteration order of `categories`
category_links_70s = []
category_links_80s = []
for category in categories:
    in_category = dataset['category'] == category
    category_links_70s.append(dataset[in_70s & in_category].filter(items=["tconst", "nconst"]))
    category_links_80s.append(dataset[in_80s & in_category].filter(items=["tconst", "nconst"]))

In [14]:
# decade graphs where nconst ---> nconst, grouped by category
#
# Each per-category tconst/nconst table is joined with itself on tconst, so
# every pair of rows that share a title becomes one nconst_source/nconst_target
# row. The resulting lists keep the same order as category_links_70s /
# category_links_80s.
#
# NOTE(review): unlike the general graphs, rows where nconst_source equals
# nconst_target are not removed here -- confirm whether that is intended.

# 70s
categories_links_70s_nconst = []
for links_tconst_nconst in category_links_70s:
    links_nconst_nconst = links_tconst_nconst.merge(links_tconst_nconst, on='tconst', suffixes=('_source', '_target')).filter(items=['nconst_source', 'nconst_target'])
    # list.append returns None, so its result is deliberately not assigned
    categories_links_70s_nconst.append(links_nconst_nconst)

# 80s
categories_links_80s_nconst = []
for links_tconst_nconst in category_links_80s:
    links_nconst_nconst = links_tconst_nconst.merge(links_tconst_nconst, on='tconst', suffixes=('_source', '_target')).filter(items=['nconst_source', 'nconst_target'])
    categories_links_80s_nconst.append(links_nconst_nconst)

In [15]:
# create csv files for decade graphs where nconst ---> nconst, grouped by category
#
# `categories` is a set and cannot be indexed, so it is walked in lockstep
# with the per-category dataframes instead. Those lists were filled by
# iterating the same set, so the positions line up as long as `categories`
# has not been modified in between.
edge_columns = {"nconst_source": "Source", "nconst_target": "Target"}
for category, edges_70s, edges_80s in zip(categories, categories_links_70s_nconst, categories_links_80s_nconst):
    # 70s
    path = "data/EdgeData/1970s/Categorical/{}70s.csv".format(category)
    edges_70s.rename(columns=edge_columns, inplace=True)
    edges_70s.to_csv(path, index=False)
    # 80s
    path = "data/EdgeData/1980s/Categorical/{}80s.csv".format(category)
    edges_80s.rename(columns=edge_columns, inplace=True)
    edges_80s.to_csv(path, index=False)

# General Link Data

In [16]:
# general decade graphs where tconst ---> nconst
# a row belongs to a decade when its startYear falls in [1970, 1980) or
# [1980, 1990) respectively
year = dataset['startYear']
rows_70s = dataset.loc[year.ge(1970) & year.lt(1980)]
rows_80s = dataset.loc[year.ge(1980) & year.lt(1990)]

# keep only the title id and person id columns
links_70s_tconst_nconst = rows_70s.filter(items=["tconst", "nconst"])
links_80s_tconst_nconst = rows_80s.filter(items=["tconst", "nconst"])

In [17]:
# general decade graphs where nconst ---> nconst
# joining a decade's tconst/nconst table with itself on tconst yields one row
# for every pair of rows sharing a title (a row paired with itself included)
pair_columns = ['nconst_source', 'nconst_target']
merge_options = {'on': 'tconst', 'suffixes': ('_source', '_target')}

links_70s_nconst_nconst = pd.merge(
    links_70s_tconst_nconst, links_70s_tconst_nconst, **merge_options
).filter(items=pair_columns)
links_80s_nconst_nconst = pd.merge(
    links_80s_tconst_nconst, links_80s_tconst_nconst, **merge_options
).filter(items=pair_columns)

In [18]:
# drop self collaborations: keep only rows whose source and target differ
differs_70s = links_70s_nconst_nconst['nconst_source'].ne(links_70s_nconst_nconst['nconst_target'])
links_70s_nconst_nconst = links_70s_nconst_nconst.loc[differs_70s]

differs_80s = links_80s_nconst_nconst['nconst_source'].ne(links_80s_nconst_nconst['nconst_target'])
links_80s_nconst_nconst = links_80s_nconst_nconst.loc[differs_80s]

In [19]:
# write the general nconst ---> nconst decade graphs to csv
path70s = "data/EdgeData/1970s/General/70s.csv"
path80s = "data/EdgeData/1980s/General/80s.csv"

# the edge columns are renamed to Source/Target in place before writing
edge_columns = {"nconst_source": "Source", "nconst_target": "Target"}
for edges, csv_path in ((links_70s_nconst_nconst, path70s), (links_80s_nconst_nconst, path80s)):
    edges.rename(columns=edge_columns, inplace=True)
    edges.to_csv(csv_path, index=False)