In [None]:
# notebook dependencies 
%matplotlib inline
import matplotlib as mlp
mlp.rcParams['figure.dpi'] = 300

import pandas as pd
import numpy as np
import os

# visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# time/sleep modules
from time import sleep

# random module for sleep combination
import random

import vaex as vx

from time import time

In [None]:
# importing parent table
# low_memory=False makes pandas parse the file in one pass, so each column gets
# a single inferred dtype instead of possibly mixed types from chunked parsing
# (same setting the child-table load and get_mass_majors_df use).
# NOTE(review): absolute, user-specific path -- only resolves on the author's machine.

df_parent = pd.read_csv("/Users/mijailmariano/codeup-data-science/capstone_working_repo/mijail_folder/FieldOfStudyData1718_1819_PP.csv", low_memory=False)
print(f'df shape: {df_parent.shape}')
df_parent.head()

In [None]:
# Cast the parent table's UNITID column to pandas' nullable "Int64" dtype
# (capital "I"), which can hold missing values alongside integers.
# The column is replaced in place on df_parent.

df_parent["UNITID"] = df_parent["UNITID"].astype("Int64")

# display the resulting dtype to confirm the cast
df_parent["UNITID"].dtype

In [None]:
# check conversion worked: preview the first rows of the parent table
# after the UNITID cast

df_parent.head() # checks out!

In [None]:
# importing the child table
# low_memory=False: parse the file in one pass so each column gets a single
# inferred dtype (avoids mixed-type columns from chunked parsing).
# NOTE(review): absolute, user-specific path -- only resolves on the author's machine.

df_child = pd.read_csv("/Users/mijailmariano/codeup-data-science/capstone_working_repo/mijail_folder/MERGED2018_19_PP.csv", low_memory=False)
print(f'df shape: {df_child.shape}')
df_child.head()

In [None]:
# info: dtype of the child table's UNITID column before any conversion
df_child["UNITID"].dtype

In [None]:
# Cast the child table's UNITID to the nullable "Int64" dtype so it matches the
# parent table's key. errors="ignore" suppresses a failed cast and hands back
# the original column, so the dtype displayed below is the only signal of
# whether the conversion actually happened.
df_child["UNITID"] = df_child["UNITID"].astype("Int64", errors="ignore")
df_child["UNITID"].dtype

In [None]:
# check conversion works: preview the first rows of the child table
# after the UNITID cast

df_child.head() # looks good

In [None]:
# merging the two tables together

'''DataFrame.merge(
    right, 
    how='inner', 
    on=None, 
    left_on=None, 
    right_on=None, 
    left_index=False, 
    right_index=False, 
    sort=False, 
    suffixes=('_x', '_y'), 
    copy=True, 
    indicator=False, 
    validate=None)'''

# Left-join the child table onto the parent table by UNITID: every parent row
# is kept, and child columns are attached wherever the UNITID matches.
df = df_parent.merge( 
    df_child,
    how = "left",
    on = "UNITID",
    copy = False
)

# parent table contains Null Values in "UNITID"
# there should be ~225K records returned and ~3Kish features returned

# print the merged shape, then preview the first rows
print(f'dataframe shape: {df.shape}')
df.head()

In [None]:
# shape observed from the merge above: (224849, 3109)
# count how many merged rows have a missing UNITID

df["UNITID"].isnull().sum()

In [None]:
# function to merge initial majors tables

def get_mass_majors_df():

    '''Load the merged College Scorecard "majors" table for period 2018-2019.

    If the cache file "majors_table.csv" exists in the current working
    directory it is read and returned. Otherwise the two raw tables
    ("FieldOfStudyData1718_1819_PP.csv" and "MERGED2018_19_PP.csv", also
    looked up in the working directory) are read, left-joined on "UNITID"
    (every field-of-study row is kept), cached to "majors_table.csv" and
    returned.

    Both paths return the same columns, with "UNITID" cast (best effort) to
    the nullable "Int32" dtype.

    Returns
    -------
    pandas.DataFrame
        The merged table. Its shape is also printed.
    '''

    filename = "majors_table.csv"
    filename_01 = "FieldOfStudyData1718_1819_PP.csv"
    filename_02 = "MERGED2018_19_PP.csv"

    # checking if dataset exists
    if os.path.isfile(filename):

        # low_memory=False: one parsing pass, one inferred dtype per column
        df = pd.read_csv(filename, low_memory=False)

        # caches written by the earlier version of this function included the
        # row index, which read_csv brings back as an "Unnamed: 0" column
        df = df.drop(columns="Unnamed: 0", errors="ignore")

        # CSV does not store dtypes, so re-apply the key cast used on the fresh path
        if "UNITID" in df.columns:
            df["UNITID"] = df["UNITID"].astype("Int32", errors='ignore')

    else:
        df_parent = pd.read_csv(filename_01, low_memory=False)
        df_child = pd.read_csv(filename_02, low_memory=False)

        # errors='ignore' keeps the original column when the cast is not possible
        df_parent["UNITID"] = df_parent["UNITID"].astype("Int32", errors='ignore')
        df_child["UNITID"] = df_child["UNITID"].astype("Int32", errors='ignore')

        df = df_parent.merge(
            df_child,
            how = "left",
            on = "UNITID",
            copy = False
        )

        # cache the newly created dataframe as a .csv file; index=False so a
        # later cached read returns exactly the same columns as this fresh merge
        df.to_csv(filename, index=False)

    # print the df shape
    print(f'dataframe shape: {df.shape}')

    # return the dataframe
    return df

In [None]:
# build (or load from the "majors_table.csv" cache) the merged majors table
df = get_mass_majors_df()

In [None]:
# Load the cached merged table with vaex instead of pandas.
# NOTE(review): convert=True presumably makes vaex write a converted on-disk
# copy next to the CSV and open that instead -- confirm against the installed
# vaex version.
df_vx = vx.from_csv(
    "majors_table.csv", 
    convert = True)

# display the vaex dataframe
(df_vx)

In [None]:
# Baseline memory footprint of the pandas dataframe, in MB (bytes / 1024**2).
# memory_usage() is called with its default (shallow) setting, so for object
# columns only the references are counted, not the strings they point to.
start_mem = df.memory_usage().sum() / 1024**2

print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

----
### ``compression functions and potential approaches``

In [None]:
from pathlib import Path
import zipfile

def save_compressed_df(df, dirPath, fileName):

    """Save a Pandas dataframe as a zipped .csv file.

    The archive is written to ``<dirPath>/<fileName>.csv.zip`` and contains a
    single member named ``<fileName>.csv`` (UTF-8, no index column). The CSV
    is streamed straight into the archive, so the full CSV text is never held
    in memory at once.

    Parameters
    ----------
    df: pandas.core.frame.DataFrame
    Input dataframe.

    dirPath: str or pathlib.PosixPath
    Parent directory of the zipped file. It must already exist.

    fileName: str
    File name without extension. A trailing ".csv" is tolerated and
    stripped, so "table" and "table.csv" both produce "table.csv.zip".

    Returns
    -------
    None
    """

    # local import: the notebook's import cells do not bring in io
    import io

    dirPath = Path(dirPath)

    # honour the "without extension" contract even when a caller passes
    # "name.csv" (otherwise the output would be "name.csv.csv.zip")
    stem = str(fileName)
    if len(stem) > len('.csv') and stem.lower().endswith('.csv'):
        stem = stem[:-len('.csv')]

    path_zip = dirPath / f'{stem}.csv.zip'

    with zipfile.ZipFile(path_zip, 'w', zipfile.ZIP_DEFLATED) as zf:

        # force_zip64: the member size is unknown up front and may exceed 2 GiB
        with zf.open(f'{stem}.csv', 'w', force_zip64=True) as raw:

            # newline='' leaves pandas' own line terminators untouched
            with io.TextIOWrapper(raw, encoding='utf-8', newline='') as txt:

                df.to_csv(txt, index=False)

In [None]:
# iterate and write to file method?
# testing out the function
# NOTE(review): absolute, user-specific path -- only resolves on the author's machine.

path = "/Users/mijailmariano/codeup-data-science/capstone_working_repo/"

# fileName is documented as "without extension": save_compressed_df appends
# ".csv" / ".csv.zip" itself, so pass the bare stem to get majors_table.csv.zip
save_compressed_df(df, path, "majors_table")

In [None]:
def _downcast_int_column(series):
    """Return `series` cast to the narrowest signed integer dtype that holds all its values.

    Handles numpy integer dtypes and pandas nullable integers ("Int64", "UInt8", ...);
    nullable columns stay nullable. The column is never widened, and any other
    integer-like dtype is returned unchanged.
    """
    dtype_name = str(series.dtype)

    if isinstance(series.dtype, np.dtype):
        nullable = False
    elif dtype_name in {"Int8", "Int16", "Int32", "Int64",
                        "UInt8", "UInt16", "UInt32", "UInt64"}:
        nullable = True
    else:
        return series

    # empty or all-missing column: there is no value range to measure
    if not series.notna().any():
        return series

    # plain Python ints make the bound checks exact for every source dtype
    lowest, highest = int(series.min()), int(series.max())
    current_size = np.dtype(dtype_name.lower()).itemsize

    for candidate in (np.int8, np.int16, np.int32, np.int64):

        # only ever shrink; stop once candidates are as wide as the column
        if np.dtype(candidate).itemsize >= current_size:
            break

        bounds = np.iinfo(candidate)

        if bounds.min <= lowest and highest <= bounds.max:

            target = np.dtype(candidate).name
            return series.astype(target.capitalize() if nullable else target)

    return series


def _downcast_float_column(series):
    """Return `series` as float32 when pandas judges the values survive the cast, else unchanged."""
    # non-numpy float dtypes and columns already at 4 bytes or less are left alone
    if not isinstance(series.dtype, np.dtype) or series.dtype.itemsize <= 4:
        return series

    # float32 is the smallest target pd.to_numeric will pick; float16 is
    # deliberately avoided because it keeps only about three significant digits
    return pd.to_numeric(series, downcast="float")


def reduce_mem_usage(df):

    """ Iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.

        Per column:
        - object / string columns become 'category'
        - integer columns (numpy or pandas nullable) are narrowed to the
          smallest signed integer dtype that holds their min and max
        - float columns are narrowed to float32 when the values survive it
        - everything else (bool, datetime, category, ...) is left untouched,
          so calling the function again on its own output is safe

        The dataframe is modified IN PLACE and also returned.

        Memory figures are printed in MB and come from `df.memory_usage()`
        with its default shallow setting, so the contents of object columns
        are not counted.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:

        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):

            df[col] = df[col].astype('category')

        elif pd.api.types.is_bool_dtype(col_type):

            # already one byte per value; casting to a numeric dtype would only grow it
            continue

        elif pd.api.types.is_integer_dtype(col_type):

            df[col] = _downcast_int_column(df[col])

        elif pd.api.types.is_float_dtype(col_type):

            df[col] = _downcast_float_column(df[col])

    end_mem = df.memory_usage().sum() / 1024**2

    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))

    # guard the percentage against a zero baseline
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Shrink column dtypes with reduce_mem_usage; the function prints the memory
# usage before and after. `df` is rebound to the returned dataframe.
df = reduce_mem_usage(df)
print(f'df shape {df.shape}')
df.head()

In [None]:
# VAEX library
import vaex

In [None]:
# gzip method?
# Write the dataframe as a gzip-compressed CSV. index=False keeps the row index
# out of the file (as save_compressed_df does), so reading it back does not
# gain an extra "Unnamed: 0" column.
# NOTE(review): "/tmp" is a hard-coded, POSIX-only location.

df.to_csv("/tmp/df.csv.gz", compression="gzip", index=False)