In [20]:
import pandas as pd
import chardet
from collections import defaultdict
import os
import time
import json

# Input file and parsing configuration used by getUniversityMajor.
# DATA_PATH: CSV file handed to getUniversityMajor.
# NOTE(review): "Perfromance" looks like a typo, but the string is a file path —
# presumably it matches the name of the file on disk; confirm before renaming.
DATA_PATH="Prospect_Perfromance_by_Source_(SMD)_data.csv"
# COL_NAMES: column names passed to pd.read_csv via names= (the first line of the
# file is skipped with skiprows=1, so these replace whatever that line contains).
COL_NAMES=("University","Major","Medium","Source","KPI","Value")
# SEP: field separator passed to pd.read_csv; the file is tab-separated despite the .csv suffix.
SEP="\t"

def get_encoding(file_path:str)->str:
    '''
    Detect the text encoding of a file with chardet.

    Args:
    file_path:path of the file to inspect; it is read completely in binary mode

    Returns:
    the encoding name reported by chardet, or "utf-8" when chardet has no
    guess (it then reports None, e.g. for an empty file)
    '''
    with open(file_path, 'rb') as f:
        raw=f.read()
    detected=chardet.detect(raw)['encoding']
    # chardet yields None when detection fails; fall back so the declared
    # str return type always holds ("utf-8" is also what pandas assumes for None).
    return detected or "utf-8"


def getUniversityMajor(file_path,save_dir=".")->defaultdict:
    '''
    Collect the distinct (university, major) pairs of a CSV file and save them as JSON.

    The file is read with its detected encoding, SEP as separator and COL_NAMES
    as column names; the first line of the file is skipped. Two files are
    written into save_dir:

    universityMajor_<unix timestamp>.json : the university -> majors mapping
    checkpoint.json : {"latest_file": <path of the mapping file just written>}

    Args:
    file_path:csv file path

    save_dir:the directory to save checkpoint and record json file
             (created when it does not exist yet)

    Returns:
    a map university to major  (a university has different major,so it is a list)
    '''
    file_encoding=get_encoding(file_path)
    data=pd.read_csv(file_path,sep=SEP,encoding=file_encoding,skiprows=1,names=COL_NAMES)
    unique_pairs=data.drop_duplicates(["University","Major"])

    # Group majors per university, keeping first-seen order of the rows.
    universityMajor=defaultdict(list)
    for university,major in zip(unique_pairs["University"],unique_pairs["Major"]):
        universityMajor[university].append(major)

    # An empty save_dir means "current directory"; os.makedirs("") would raise.
    if save_dir:
        os.makedirs(save_dir,exist_ok=True)

    current=int(time.time())
    save_path=os.path.join(save_dir,"universityMajor_%s.json"%current)
    checkpoint_path=os.path.join(save_dir,"checkpoint.json")
    print("save university majson record in %s"%save_path)

    # ensure_ascii=False emits raw non-ASCII characters, so the files are written
    # as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(save_path,"w",encoding="utf-8") as f:
        json.dump(universityMajor,f,ensure_ascii=False)

    # Write the checkpoint last so it never points at a record file that
    # failed to be written.
    with open(checkpoint_path,"w",encoding="utf-8") as f:
        json.dump({"latest_file":save_path},f,ensure_ascii=False)

    return universityMajor
        


In [21]:
# Build the university -> majors mapping for DATA_PATH; save_dir is left at its
# default ".", and the returned mapping is shown as the cell's output.
getUniversityMajor(DATA_PATH)

save university majson record in ./universityMajor_1613377070.json


defaultdict(list,
            {'American University': ['EDD in Education Policy and Leadership',
              'M.Ed Education Policy and Leadership (Online)',
              'MAT: Elementary (Online)'],
             'Howard University': ['Online Executive MBA',
              'Online Part-time MBA'],
             'Mills College': ['Education - Educational Leadership (Online)'],
             'NYU': ['Online MHA'],
             'Southern Methodist University': ['MBA Direct',
              'MS in Computer Science with AI Specialization',
              'MS in Network Engineering',
              'Online MBA'],
             'Stevens Institute of Technology': ['MS in Computer Science',
              'MS in Data Science'],
             'Tufts University': ['Master of Science in Computer Science',
              'Master of Science in Data Science',
              'Post-Baccalaureate in Computer Science',
              'Post-Baccalaureate in Computer Science / MS'],
             'Tulane': ['On-camp