# Project 3 Part 1


## Import

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Location of the IMDb title.akas dataset (tab-separated, .gz suffix).
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
# Counter used to number the chunk files when they are written to disk.
chunk_num = 1
# With chunksize set, read_csv returns a reader that yields DataFrames of up
# to 100,000 rows each instead of loading the whole file in one go.
akas = pd.read_csv(
    akas_url,
    sep="\t",
    chunksize=100_000,
    low_memory=False,
)

In [None]:
# Location of the IMDb title.basics dataset (tab-separated, .gz suffix).
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
# Counter used to number the chunk files when they are written to disk.
chunk_num = 1
# With chunksize set, read_csv returns a reader that yields DataFrames of up
# to 100,000 rows each instead of loading the whole file in one go.
basics = pd.read_csv(
    basics_url,
    sep="\t",
    chunksize=100_000,
    low_memory=False,
)

In [None]:
# Location of the IMDb title.ratings dataset (tab-separated, .gz suffix).
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
# Counter used to number the chunk files when they are written to disk.
chunk_num = 1
# With chunksize set, read_csv returns a reader that yields DataFrames of up
# to 100,000 rows each instead of loading the whole file in one go.
ratings = pd.read_csv(
    ratings_url,
    sep="\t",
    chunksize=100_000,
    low_memory=False,
)

## Cleaning & Chunking

In [None]:
## Stream the akas reader chunk by chunk, clean each chunk, and save it to disk.
## The chunk files are written into Data/, so make sure the folder exists first.
os.makedirs('Data/', exist_ok=True)
try:
    ## enumerate numbers this dataset's chunks from 001, independent of
    ## whatever value chunk_num was left at by any other loop.
    for chunk_num, temp_df1 in enumerate(akas, start=1):
        ## Replace "\N" with np.nan
        temp_df1.replace({'\\N': np.nan}, inplace=True)

        ### Saving chunk to disk
        fname1 = f'Data/title_akas_chunk_{chunk_num:03d}.csv.gz'
        temp_df1.to_csv(fname1, compression='gzip')
        print(f"- Saved {fname1}")
finally:
    ## Close the reader even if a chunk fails part-way through the file
    akas.close()



In [None]:
## Stream the basics reader chunk by chunk, clean and filter each chunk,
## and save it to disk.
## The chunk files are written into Data/, so make sure the folder exists first.
os.makedirs('Data/', exist_ok=True)
try:
    ## enumerate numbers this dataset's chunks from 001, independent of
    ## whatever value chunk_num was left at by any other loop.
    for chunk_num, temp_df2 in enumerate(basics, start=1):
        ## Replace "\N" with np.nan
        temp_df2.replace({'\\N': np.nan}, inplace=True)
        ## Eliminate movies that are null for runtimeMinutes, genres, or startYear.
        ## .copy() makes the result an independent frame so the column
        ## assignment below does not raise SettingWithCopyWarning.
        temp_df2 = temp_df2.dropna(
            subset=['runtimeMinutes', 'genres', 'startYear']
        ).copy()

        ## NOTE: THERE ARE ADDITIONAL REQUIRED FILTERING STEPS FOR THE PROJECT NOT SHOWN HERE
        ## Convert startYear to numeric so it can be compared against year bounds
        temp_df2['startYear'] = temp_df2['startYear'].astype(float)
        ## Keep startYear 2000-2021: 2000 is included, 2022 is excluded by the strict "<"
        in_range = (temp_df2['startYear'] >= 2000) & (temp_df2['startYear'] < 2022)
        temp_df2 = temp_df2[in_range]

        ### Saving chunk to disk
        fname2 = f'Data/title_basics_chunk_{chunk_num:03d}.csv.gz'
        temp_df2.to_csv(fname2, compression='gzip')
        print(f"- Saved {fname2}")
finally:
    ## Close the reader even if a chunk fails part-way through the file
    basics.close()

In [None]:
## Stream the ratings reader chunk by chunk, clean each chunk, and save it to disk.
## The chunk files are written into Data/, so make sure the folder exists first.
os.makedirs('Data/', exist_ok=True)
try:
    ## enumerate numbers this dataset's chunks from 001, independent of
    ## whatever value chunk_num was left at by any other loop.
    for chunk_num, temp_df3 in enumerate(ratings, start=1):
        ## Replace "\N" with np.nan
        temp_df3.replace({'\\N': np.nan}, inplace=True)

        ### Saving chunk to disk
        fname3 = f'Data/title_ratings_chunk_{chunk_num:03d}.csv.gz'
        temp_df3.to_csv(fname3, compression='gzip')
        print(f"- Saved {fname3}")
finally:
    ## Close the reader even if a chunk fails part-way through the file
    ratings.close()



In [None]:
import glob
# Collect the paths of the saved title_basics chunk files, ordered by name.
q = "Data/title_basics_chunk*.csv.gz"
chunked_files1 = glob.glob(q)
chunked_files1.sort()
# Preview the first five paths
chunked_files1[:5]

In [None]:
# Collect the paths of the saved title_akas chunk files, ordered by name.
q = "Data/title_akas_chunk*.csv.gz"
chunked_files2 = glob.glob(q)
chunked_files2.sort()
# Preview the first five paths
chunked_files2[:5]

In [None]:
# Collect the paths of the saved title_ratings chunk files, ordered by name.
q = "Data/title_ratings_chunk*.csv.gz"
chunked_files3 = glob.glob(q)
chunked_files3.sort()
# Preview the first five paths
chunked_files3[:5]

In [None]:
## Loading all files as df and appending to a list
## Load every saved akas chunk and combine them into one DataFrame.
## NOTE: chunked_files2 is the list globbed from "Data/title_akas_chunk*.csv.gz";
## chunked_files1 holds the title_basics chunks, so reading it here would store
## basics rows under the akas name.
## index_col=0 restores the index column that was written with each chunk.
## low_memory=False: presumably needed to avoid mixed-dtype warnings on the
## akas columns (the cell that previously read these files set it) - confirm.
df_list1 = [
    pd.read_csv(file1, index_col=0, low_memory=False)
    for file1 in chunked_files2
]
## Concatenating the list of dfs into 1 combined
akas_combined = pd.concat(df_list1)
akas_combined

In [None]:

## Load every saved basics chunk and combine them into one DataFrame.
## NOTE: chunked_files1 is the list globbed from "Data/title_basics_chunk*.csv.gz";
## chunked_files2 holds the title_akas chunks, so reading it here would store
## akas rows under the basics name.
## index_col=0 restores the index column that was written with each chunk.
df_list2 = [
    pd.read_csv(file2, index_col=0, low_memory=False)
    for file2 in chunked_files1
]
## Concatenating the list of dfs into 1 combined
basics_combined = pd.concat(df_list2)
basics_combined

In [None]:

## Read each saved ratings chunk (first column restored as the index)
## and gather the resulting DataFrames in a list.
df_list3 = [pd.read_csv(chunk_path, index_col=0) for chunk_path in chunked_files3]
## Stack the per-chunk frames into a single DataFrame and display it
ratings_combined = pd.concat(df_list3)
ratings_combined

In [None]:
## Saving the final combined dataframe
## Write the combined akas DataFrame to one gzip-compressed CSV, without the index
final_fname1 = "Data/title_akas_combined.csv.gz"
akas_combined.to_csv(final_fname1, index=False, compression="gzip")

In [None]:
## Saving the final combined dataframe
## Write the combined basics DataFrame to one gzip-compressed CSV, without the index
final_fname2 = "Data/title_basics_combined.csv.gz"
basics_combined.to_csv(final_fname2, index=False, compression="gzip")

In [None]:
## Saving the final combined dataframe
## Write the combined ratings DataFrame to one gzip-compressed CSV, without the index
final_fname3 = "Data/title_ratings_combined.csv.gz"
ratings_combined.to_csv(final_fname3, index=False, compression="gzip")

## Creating a Folder

In [None]:
import os
# Make sure the output folder exists; exist_ok=True means an existing
# folder is left alone instead of raising an error.
os.makedirs("Data/", exist_ok=True)
# List the folder's contents to confirm it is there
os.listdir("Data/")


# Cite

 - This data is from https://www.imdb.com/interfaces/ (IMDb)