# Set Up

## Global Variables

In [161]:
# Folder that is scanned for the dataset files analysed in this notebook.
# Written as a raw string so the Windows backslashes need no doubling.
base_file_location = r"C:\Users\Boas\Downloads\movies_datasets\un_zip data"
# Web page that is scraped further down for the per-file column descriptions.
data_set_url_info = "https://developer.imdb.com/non-commercial-datasets/"

## Imports

In [211]:
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup

## Functions

In [212]:
def get_list_of_file(input_file_location):
    """Return the names of the regular files directly inside a directory.

    Sub-directories are skipped and the scan is not recursive.

    Args:
        input_file_location: Path of the directory to scan.

    Returns:
        A list of file names (not full paths), sorted alphabetically so the
        result does not depend on the platform's directory-listing order.

    Raises:
        OSError: If the directory cannot be listed (propagated from
            os.listdir).
    """
    # os.listdir makes no ordering guarantee; sort for reproducible output.
    return sorted(
        name
        for name in os.listdir(input_file_location)
        if os.path.isfile(os.path.join(input_file_location, name))
    )


In [213]:
def read_file(file):
    """Load a tab-separated file into a DataFrame.

    Quote characters are treated as ordinary data. With pandas' default
    quoting, a field that merely starts with a double quote swallows the
    following tabs and newlines until the next quote, silently merging rows.

    Args:
        file: Path or file-like object accepted by pandas.read_csv.

    Returns:
        pandas.DataFrame with one column per tab-separated field.
    """
    import csv  # local import: only the quoting constant is needed

    # NOTE(review): assumes the input never relies on quoting to embed tabs
    # inside a field -- confirm if non-IMDb files are ever read with this.
    return pd.read_csv(file, sep='\t', quoting=csv.QUOTE_NONE)

In [214]:
def start_file(file):
    """Print the opening banner of a section followed by its label.

    Args:
        file: Value shown after "File Name: "; rendered with str().
    """
    banner = "*-*-*-*-*-*-*-*-*-**-*-*-*-*-*-*-*"
    # Two spaces after the colon reproduce print()'s default separator.
    label_line = "File Name:  " + str(file)
    print(banner, label_line, sep="\n")
    

In [224]:
def end_file():
    """Print the closing banner of a section followed by one blank line."""
    # end="\n\n" emits the banner's newline plus the empty separator line.
    print("*-*-*-*-*-*-*-*-*-**-*-*-*-*-*-*-*", end="\n\n")

In [225]:
def print_column_info(files):
    """Print the distinct-value count of every column in each file.

    Each file name is joined onto the module-level ``base_file_location``,
    loaded with ``read_file`` and reported between the ``start_file`` /
    ``end_file`` banners. Counts come from ``Series.nunique()``.

    Args:
        files: Iterable of file names relative to ``base_file_location``.

    Returns:
        A ``(homogenous_columns, binary_columns)`` tuple: the names of the
        columns with exactly one distinct value and with exactly two,
        accumulated across all files in the order encountered.
    """
    single_valued = []
    two_valued = []
    # Route a column to the matching result list by its distinct count;
    # any other count is printed but not collected.
    bucket_for_count = {1: single_valued, 2: two_valued}

    for file in files:
        start_file(file)
        frame = read_file(os.path.join(base_file_location, file))
        for column in frame.columns.tolist():
            distinct = frame[column].nunique()
            print(column, ":", distinct)
            bucket = bucket_for_count.get(distinct)
            if bucket is not None:
                bucket.append(column)
        end_file()

    return single_valued, two_valued

In [226]:
def print_list(data):
    """Print each item of ``data`` on its own line.

    Args:
        data: Collection to print. A falsy value (empty list, ``None``, ...)
            prints the placeholder message instead.
    """
    # Guard clause: nothing to iterate, so report that and stop.
    if not data:
        print ("No data in list")
        return

    for entry in data:
        print(entry)

# Questions

## 1. Is the data homogenous in each column?

In [227]:
# Report the distinct-value count of every column in every dataset file,
# then list the columns that hold exactly one or exactly two values.
files = get_list_of_file(base_file_location)

# print_column_info returns both lists, so they are bound directly here.
homogenous_columns, binary_columns = print_column_info(files)

print("Columns with only a single value")
print_list(homogenous_columns)

print()
print("Columns with 2 unique values")
print_list(binary_columns)


*-*-*-*-*-*-*-*-*-**-*-*-*-*-*-*-*
File Name:  name.basics.tsv
nconst : 1413
primaryName : 1411
birthYear : 119
deathYear : 97
primaryProfession : 353
knownForTitles : 50
*-*-*-*-*-*-*-*-*-**-*-*-*-*-*-*-*

*-*-*-*-*-*-*-*-*-**-*-*-*-*-*-*-*
File Name:  title.akas.tsv
titleId : 218
ordering : 34
title : 546
region : 57
language : 19
types : 8
attributes : 14
isOriginalTitle : 2
*-*-*-*-*-*-*-*-*-**-*-*-*-*-*-*-*

*-*-*-*-*-*-*-*-*-**-*-*-*-*-*-*-*
File Name:  title.basics.tsv
tconst : 218
titleType : 8
primaryTitle : 217
originalTitle : 217
isAdult : 2
startYear : 59
endYear : 4
runtimeMinutes : 40
genres : 44
*-*-*-*-*-*-*-*-*-**-*-*-*-*-*-*-*

*-*-*-*-*-*-*-*-*-**-*-*-*-*-*-*-*
File Name:  title.crew.tsv
tconst : 218
directors : 97
writers : 95
*-*-*-*-*-*-*-*-*-**-*-*-*-*-*-*-*

*-*-*-*-*-*-*-*-*-**-*-*-*-*-*-*-*
File Name:  title.episode.tsv
tconst : 118
parentTconst : 5
seasonNumber : 5
episodeNumber : 26
*-*-*-*-*-*-*-*-*-**-*-*-*-*-*-*-*

*-*-*-*-*-*-*-*-*-**-*-*-*-*-*-*-*
File 

## 2. How do you anticipate this data will be used by data analysts and scientists downstream?

In [249]:
# Fetch the dataset documentation page and print, for every <h3> heading,
# the items of the bullet list that follows it.

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(data_set_url_info, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page.
response.raise_for_status()
response.encoding = "utf-8"
soup = BeautifulSoup(response.text, 'html.parser')

for h3 in soup.find_all("h3"):
    descriptions = h3.find_next('ul')
    if descriptions is None:
        # find_next returns None when no <ul> follows the heading; skip it
        # rather than crash with AttributeError on .find_all below.
        continue
    start_file(h3.text)
    for li in descriptions.find_all("li"):
        print(li.text.strip())
    end_file()
        

*-*-*-*-*-*-*-*-*-**-*-*-*-*-*-*-*
File Name:  title.akas.tsv.gz
titleId (string) - a tconst, an alphanumeric unique identifier of the title
ordering (integer) – a number to uniquely identify rows for a given titleId
title (string) – the localized title
region (string) - the region for this version of the title
language (string) - the language of the title
attributes (array) - Additional terms to describe this alternative title, not enumerated
isOriginalTitle (boolean) – 0: not original title; 1: original title
*-*-*-*-*-*-*-*-*-**-*-*-*-*-*-*-*

*-*-*-*-*-*-*-*-*-**-*-*-*-*-*-*-*
File Name:  title.basics.tsv.gz
tconst (string) - alphanumeric unique identifier of the title
titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)
primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release
originalTitle (string) - original title, in the original language
isAdult (boolean) 

## 3. Does your answer to the last question give you an indication of how you can store the data for optimal querying speed and storage file compression?


It looks like everything might be inside the title.basics table. It would be nice if title.basics didn't contain any repeats/episodes for TV shows, miniseries, etc., since we already have another table (title.episode) that holds information about TV shows. 

## 4. What cleaning steps do you need to perform to make your dataset ready for consumption?


+ Get rid of all "adult films"
+ Get rid of "dupes" (episodes): only values found in the parentTconst column of title.episode.tsv should exist in title.basics. If an episode does exist in title.basics, all of its data in title.basics should be transferred to title.episode. 
+ Pivot cells that hold multiple values (arrays), e.g. `attributes` in title.akas.

## 5. What wrangling steps do you need to perform to enrich your dataset with additional information?

In [None]:
I will have to join, pivot, and add/delete specific rows of data to make sure they are in the correct table(s).