In [1]:
from datetime import datetime
def type_cast(data_type:'str', value):
    """
    Convert one raw field to the Python type named by ``data_type``.

    'INT'  -> every '-' is removed from ``value`` and the rest is parsed
              with int().
    'DATE' -> 'T' is replaced by a space and 'Z' is dropped, then the text is
              parsed with the format '%Y-%m-%d %H:%M:%S' into a datetime.
    Any other ``data_type`` -> str(value).
    """
    if data_type == 'INT':
        return int(value.replace('-', ''))

    if data_type == 'DATE':
        timestamp = value.replace('T', ' ').replace('Z', '')
        return datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')

    return str(value)

In [2]:
# Column types of each CSV, in column order; read_file hands them to
# type_cast_row, which pairs them positionally with the values of a row.
data_types = ['INT'] + ['STRING'] * 5                 # personal_info.csv
data_types_1 = ['INT'] + ['STRING'] * 2 + ['INT']      # vehicles.csv
data_types_2 = ['STRING'] * 2 + ['INT'] * 2            # employment.csv
data_types_3 = ['INT'] + ['DATE'] * 2                  # update_status.csv



def type_cast_row(data_types: 'list', data_row:'iterable'):
    """
    Cast every value of one row to its declared type.

    INPUT: data_types : 'list': type names, one per column, in column order.
           data_row : the raw values of a single row.
    RETURN: a list holding type_cast(type, value) for each (type, value) pair.
            Pairing is positional and stops at the shorter of the two inputs.
    """
    converted = []
    for kind, raw in zip(data_types, data_row):
        converted.append(type_cast(kind, raw))
    return converted

In [3]:
import csv
import operator
from collections import namedtuple


def read_file(file_name:'FILE',data_types:'list', sort_key:'str'='ssn'):
    """
    Read a CSV file and lazily yield its rows, sorted and type cast.

    The first line of the file is the header; its column names become the
    fields of the ``file_data`` namedtuple every row is returned as.
    Rows are ordered by the raw (string) value of the ``sort_key`` column,
    i.e. the ordering is decided before any type conversion is applied.

    INPUT: file_name : path of the CSV file to read.
           data_types: list : type names for the columns, in column order
                       (passed to type_cast_row for every row).
           sort_key : name of the header column to sort by (default 'ssn').
    RETURN: an iterator (generator) of ``file_data`` namedtuples. An empty
            file yields nothing.
    RAISES: ValueError (on first iteration) if ``sort_key`` is not a column
            of the header.
    """
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields reach the reader untouched.
    with open(file_name, newline='') as f:
        rows = csv.reader(f, delimiter=',')
        header = next(rows, None)
        if header is None:
            # Completely empty file: end the generator instead of letting
            # StopIteration escape (which would surface as RuntimeError).
            return
        file_data = namedtuple('file_data', header)
        index = header.index(sort_key)
        # sorted() has to read every row anyway, so all file access is
        # finished by the time this statement completes.
        ordered = sorted(rows, key=operator.itemgetter(index))

    # The file is already closed here, so a generator that is abandoned
    # half-way no longer keeps a file handle open.
    for data in ordered:
        yield file_data(*type_cast_row(data_types, data))
    

### GOAL 1:
***Create an iterator for each of the four files that yields cleaned-up data of the correct type (e.g. string, int, date), with each row represented as a named tuple.***

In [4]:
# read_file is a generator function, so these calls only create the four
# iterators; no file is opened until the first next() on each of them.
personal_info = read_file('personal_info.csv',data_types)
vehicles = read_file('vehicles.csv',data_types_1)
empolyment = read_file('employment.csv',data_types_2)
update_status = read_file('update_status.csv',data_types_3)

In [5]:
# Pull the first row from each iterator to check the parsed field types.
next(personal_info),next(vehicles),next(empolyment),next(update_status)

(file_data(ssn=100539824, first_name='Sebastiano', last_name='Tester', gender='Male', language='Icelandic'),
 file_data(ssn=100539824, vehicle_make='Oldsmobile', vehicle_model='Bravada', model_year=1993),
 file_data(employer='Stiedemann-Bailey', department='Research and Development', employee_id=290890771, ssn=100539824),
 file_data(ssn=100539824, last_updated=datetime.datetime(2017, 10, 7, 0, 14, 42), created=datetime.datetime(2016, 1, 24, 21, 19, 30)))

### GOAL 2:
***Create a single iterable that combines all the columns from all the iterators.***

In [6]:

# Field names of one combined record, grouped by the file they come from.
# 'ssn' is listed once; it is taken from the personal_info row.
my_header = [
    # personal_info.csv
    'ssn', 'first_name', 'last_name', 'gender', 'language',
    # vehicles.csv
    'vehicle_make', 'vehicle_model', 'model_year',
    # employment.csv
    'employer', 'department', 'employee_id',
    # update_status.csv
    'last_updated', 'created',
]
        

class Combines:
    """
    Re-iterable view over the four CSV files merged into one record stream.

    Only the file names are stored. Every call to iter() goes through
    combine_data, which reads the files again, so an instance can be looped
    over more than once.
    """
    def __init__(self, personal_info,vehicles,empolyment,update_status):
        # File names of the four sources (the spelling 'empolyment' is kept
        # because it is part of the existing keyword interface).
        self.personal_info = personal_info
        self.vehicles = vehicles
        self.empolyment = empolyment
        self.update_status = update_status

    def __iter__(self):
        return Combines.combine_data(self.personal_info,self.vehicles,self.empolyment,self.update_status)

    @staticmethod
    def combine_data(personal_info,vehicles,empolyment,update_status):
        """
        Merge the four files row by row into ``information`` namedtuples.

        Rows are matched purely by position in the iterators returned by
        read_file; no check is made that the ssn values of the matched rows
        agree, so this assumes every file holds the same set of ssn values.
        Iteration ends as soon as the shortest file is exhausted.

        INPUT: 'personal_info,vehicles,empolyment,update_status': names of
               the files that should be combined.
        RETURNS: a lazy iterator (generator) of ``information`` namedtuples
                 whose fields are given by ``my_header``.
        """
        # zip replaces the former fixed count of 1000 rows: shorter files no
        # longer blow up with RuntimeError and longer ones are not cut off.
        matched_rows = zip(
            read_file(personal_info, data_types),
            read_file(vehicles, data_types_1),
            read_file(empolyment, data_types_2),
            read_file(update_status, data_types_3),
        )

        # creating a namedtuple
        information = namedtuple('information', my_header)

        for person, vehicle, job, status in matched_rows:
            # Drop the repeated ssn column: it is the first field of the
            # vehicle and status rows and the last field of the employment row.
            _, *vehicle_cols = vehicle
            *job_cols, _ = job
            _, *status_cols = status
            yield information(*person, *vehicle_cols, *job_cols, *status_cols)
        
        
# The four source CSVs, in the positional order Combines expects them.
files = 'personal_info.csv','vehicles.csv','employment.csv','update_status.csv'
# Nothing is read yet: Combines only stores the names until it is iterated.
combine = Combines(*files)

In [7]:
# Materialise one full pass over the combined records for display.
list(combine)

[information(ssn=100539824, first_name='Sebastiano', last_name='Tester', gender='Male', language='Icelandic', vehicle_make='Oldsmobile', vehicle_model='Bravada', model_year=1993, employer='Stiedemann-Bailey', department='Research and Development', employee_id=290890771, last_updated=datetime.datetime(2017, 10, 7, 0, 14, 42), created=datetime.datetime(2016, 1, 24, 21, 19, 30)),
 information(ssn=101714702, first_name='Cayla', last_name='MacDonagh', gender='Female', language='Lao', vehicle_make='Ford', vehicle_model='Mustang', model_year=1997, employer='Nicolas and Sons', department='Sales', employee_id=416841359, last_updated=datetime.datetime(2017, 1, 23, 11, 23, 17), created=datetime.datetime(2016, 1, 27, 4, 32, 57)),
 information(ssn=101840356, first_name='Nomi', last_name='Lipprose', gender='Female', language='Yiddish', vehicle_make='GMC', vehicle_model='Yukon', model_year=2005, employer='Connelly Group', department='Research and Development', employee_id=987952860, last_updated=date

In [8]:
# iter() goes through Combines.__iter__, so this is a fresh pass over the
# files, independent of any earlier iteration of `combine`.
combine_iter = iter(combine)
next(combine_iter)

information(ssn=100539824, first_name='Sebastiano', last_name='Tester', gender='Male', language='Icelandic', vehicle_make='Oldsmobile', vehicle_model='Bravada', model_year=1993, employer='Stiedemann-Bailey', department='Research and Development', employee_id=290890771, last_updated=datetime.datetime(2017, 10, 7, 0, 14, 42), created=datetime.datetime(2016, 1, 24, 21, 19, 30))

### GOAL 3
***Create an iterator that only contains current records (i.e. not stale)***

In [11]:
def current_records(date:'date_time_object'):
    """
    Lazily yield the combined records that are not stale.

    A record is kept only when its ``last_updated`` value is strictly later
    than ``date``. Each kept record is re-wrapped as a ``current_info``
    namedtuple with the fields of ``my_header``.
    """
    all_records = Combines.combine_data(*files)
    record_type = namedtuple('current_info', my_header)
    fresh = (entry for entry in all_records if entry.last_updated > date)
    yield from (record_type(*entry) for entry in fresh)


# Cut-off for staleness: records last updated at or before midnight on
# 1 March 2017 are filtered out by current_records.
date = datetime(2017, 3, 1)
# Lazy: nothing is read until current_record is iterated.
current_record = current_records(date)
    

In [12]:
# end='\n\n' leaves a blank line after each record so the long namedtuple
# reprs stay readable.
for line in current_record:
    print(line, end = '\n\n')

current_info(ssn=100539824, first_name='Sebastiano', last_name='Tester', gender='Male', language='Icelandic', vehicle_make='Oldsmobile', vehicle_model='Bravada', model_year=1993, employer='Stiedemann-Bailey', department='Research and Development', employee_id=290890771, last_updated=datetime.datetime(2017, 10, 7, 0, 14, 42), created=datetime.datetime(2016, 1, 24, 21, 19, 30))

current_info(ssn=101840356, first_name='Nomi', last_name='Lipprose', gender='Female', language='Yiddish', vehicle_make='GMC', vehicle_model='Yukon', model_year=2005, employer='Connelly Group', department='Research and Development', employee_id=987952860, last_updated=datetime.datetime(2017, 10, 4, 11, 21, 30), created=datetime.datetime(2016, 9, 21, 23, 4, 7))

current_info(ssn=104220928, first_name='Justinian', last_name='Kunzelmann', gender='Male', language='Dhivehi', vehicle_make='Oldsmobile', vehicle_model='Intrigue', model_year=2000, employer='Upton LLC', department='Marketing', employee_id=569817552, last_up

### GOAL 4
***Find the largest group of car makes for each gender.***

In [13]:
def largest_car_maker():
    """
    Count, per gender, how many combined records list each vehicle make.

    RETURN: a ``(females, males)`` tuple of dicts, each mapping a
            vehicle_make to the number of records with that make whose
            gender is 'Female' / 'Male' respectively. Records carrying any
            other gender value are not counted in either dict.
    """
    counts = {'Female': {}, 'Male': {}}
    for data in Combines.combine_data(*files):
        # Look the gender up explicitly: a blank or unexpected value must not
        # be tallied as 'Male' merely because it is not 'Female'.
        by_make = counts.get(data.gender)
        if by_make is not None:
            by_make[data.vehicle_make] = by_make.get(data.vehicle_make, 0) + 1
    return counts['Female'], counts['Male']
            

In [14]:
# Each dict maps a vehicle make to its record count for that gender.
females, males = largest_car_maker()


In [15]:
def maximum(dictionary:'dict'):
    """
    Return every key of ``dictionary`` whose value equals the largest value.

    INPUT: dictionary : maps a key (here a car make) to a comparable count.
    RETURN: list of the keys tied for the maximum value, in the dict's
            iteration order; an empty list when ``dictionary`` is empty.
    """
    if not dictionary:
        # max() raises ValueError on an empty sequence; with no entries there
        # are simply no winners.
        return []
    highest = max(dictionary.values())
    return [key for key, value in dictionary.items() if value == highest]

***Largest group of car makes for FEMALE gender.***

In [16]:
# Makes tied for the highest count in the `females` dict.
maximum(females)

['Ford', 'Chevrolet']

***Largest group of car makes for MALE gender.***

In [17]:
# Makes tied for the highest count in the `males` dict.
maximum(males)

['Ford']