### Interacting with OS and filesystem

In [1]:
import os

- os.getcwd() $\rightarrow$ working directory

In [2]:
os.getcwd()

'C:\\Users\\deepa\\OneDrive\\Documents\\New folder\\DAnalysis'

- To get the list of files in the current working directory

In [4]:
os.listdir()

['.ipynb_checkpoints',
 'climate.txt',
 'climate_result',
 'Cost_to_rent.ipynb',
 'Data.txt',
 'Images',
 'numpy.ipynb',
 'part3.ipynb',
 'Untitled.ipynb',
 'venv']

In [11]:
os.listdir('.')    # relative path

['.ipynb_checkpoints',
 'climate.txt',
 'climate_result',
 'Cost_to_rent.ipynb',
 'Data.txt',
 'Images',
 'numpy.ipynb',
 'part3.ipynb',
 'Untitled.ipynb',
 'venv']

In [12]:
os.listdir('/users')   # absolute path

['All Users', 'deepa', 'Default', 'Default User', 'desktop.ini', 'Public']

#### Making new directory

In [14]:
os.makedirs('./data', exist_ok=True)

In [17]:
help(os.makedirs)

Help on function makedirs in module os:

makedirs(name, mode=511, exist_ok=False)
    makedirs(name [, mode=0o777][, exist_ok=False])

    Super-mkdir; create a leaf directory and all intermediate ones.  Works like
    mkdir, except that any intermediate path segment (not just the rightmost)
    will be created if it does not exist. If the target directory already
    exists, raise an OSError if exist_ok is False. Otherwise no exception is
    raised.  This is recursive.



In [19]:
# Demonstration: with exist_ok=False, makedirs raises FileExistsError because './data' already exists
os.makedirs('./data', exist_ok=False)

FileExistsError: [WinError 183] Cannot create a file when that file already exists: './data'

In [21]:
import urllib.request

In [26]:
# Download the USDA-NASS county list and save it locally as ./data/two.txt
urllib.request.urlretrieve('https://www.nass.usda.gov/Data_and_Statistics/County_Data_Files/Frequently_Asked_Questions/county_list.txt', './data/two.txt')

('./data/two.txt', <http.client.HTTPMessage at 0x22e66735f10>)

#### Reading from a file
- We use Python's built-in function `open`.

In [34]:
# Open the downloaded file for reading; the handle stays open until file1.close() is called
file1 = open('./data/two.txt', mode='r')

In [35]:
file_content = file1.read()

In [36]:
print(file_content)

Listing of Counties and Districts used by USDA-NASS, updated 08-15-2007

     D                              
     i                              
S    s    C                            History 
t    t    o     State                   flag
a    r    u       or                1 = currently being used
t    i    n     County              2 = historical 
e    c    t      Name               
     t    y                         


01   00   000   Alabama						1
01   10   033   Colbert						1
01   10   057   Fayette						2
01   10   059   Franklin					1
01   10   075   Lamar						2
01   10   077   Lauderdale					1
01   10   079   Lawrence					1
01   10   083   Limestone					1
01   10   089   Madison						1
01   10   093   Marion						1
01   10   103   Morgan						1
01   10   133   Winston						1
01   10   888   D10 Combined Counties				1
01   10   999   D10 Northern Valley				1
01   20   009   Blount						1
01   20   015   Calhoun						1
01   20   019   Cherokee					1
01   20   029   Cleburne

In [37]:
file1.close()

#### With statement
- The `with` statement opens the file and closes it automatically when the block ends

In [39]:
# The file is closed automatically as soon as the indented block finishes
with open('./data/two.txt', mode='r') as f:
    print(f.read())  # read the whole file as a single string and print it

Listing of Counties and Districts used by USDA-NASS, updated 08-15-2007

     D                              
     i                              
S    s    C                            History 
t    t    o     State                   flag
a    r    u       or                1 = currently being used
t    i    n     County              2 = historical 
e    c    t      Name               
     t    y                         


01   00   000   Alabama						1
01   10   033   Colbert						1
01   10   057   Fayette						2
01   10   059   Franklin					1
01   10   075   Lamar						2
01   10   077   Lauderdale					1
01   10   079   Lawrence					1
01   10   083   Limestone					1
01   10   089   Madison						1
01   10   093   Marion						1
01   10   103   Morgan						1
01   10   133   Winston						1
01   10   888   D10 Combined Counties				1
01   10   999   D10 Northern Valley				1
01   20   009   Blount						1
01   20   015   Calhoun						1
01   20   019   Cherokee					1
01   20   029   Cleburne

In [40]:
with open('./data/two.txt', mode='r') as f:
    # readlines() returns a list with one string per line, each keeping its trailing '\n'
    file = f.readlines()

In [41]:
file

['Listing of Counties and Districts used by USDA-NASS, updated 08-15-2007\n',
 '\n',
 '     D                              \n',
 '     i                              \n',
 'S    s    C                            History \n',
 't    t    o     State                   flag\n',
 'a    r    u       or                1 = currently being used\n',
 't    i    n     County              2 = historical \n',
 'e    c    t      Name               \n',
 '     t    y                         \n',
 '\n',
 '\n',
 '01   00   000   Alabama\t\t\t\t\t\t1\n',
 '01   10   033   Colbert\t\t\t\t\t\t1\n',
 '01   10   057   Fayette\t\t\t\t\t\t2\n',
 '01   10   059   Franklin\t\t\t\t\t1\n',
 '01   10   075   Lamar\t\t\t\t\t\t2\n',
 '01   10   077   Lauderdale\t\t\t\t\t1\n',
 '01   10   079   Lawrence\t\t\t\t\t1\n',
 '01   10   083   Limestone\t\t\t\t\t1\n',
 '01   10   089   Madison\t\t\t\t\t\t1\n',
 '01   10   093   Marion\t\t\t\t\t\t1\n',
 '01   10   103   Morgan\t\t\t\t\t\t1\n',
 '01   10   133   Winston\t\t\t

### Processing data from files

- Read the file line by line
- Parse the first line to get a list of the column names or headers
- Split each remaining line and convert each value into a float
- Create a dictionary for each loan using the headers as keys
- Create a list of dictionaries to keep track of all the loans

In [43]:
# Load the loan file as a list of raw lines; the first line holds the column names
with open('./data/loan.txt', mode='r') as f:
    file = f.readlines()

In [44]:
file

['amount,duration,rate,down_payment\n',
 '431840,11,6.22,37396\n',
 '394278,23,3.02,3278\n',
 '339930,57,3.46,17002\n',
 '343479,29,9.45,3353\n',
 '443019,25,2.14,23866\n',
 '41996,34,3.28,31997\n',
 '468381,40,2.59,1681\n',
 '386437,54,6.44,8016\n',
 '459304,14,6.82,49588\n',
 '64077,51,7.69,9884\n',
 '199348,27,5.22,2942\n',
 '138104,53,6.91,49964\n',
 '32107,60,2.18,18757\n',
 '87804,7,9.79,3165\n',
 '321609,27,7.73,20786\n',
 '162831,53,5.91,46571\n',
 '170657,58,2.06,6705\n',
 '78469,55,5.06,11189\n',
 '157647,45,6.05,41270\n',
 '476602,26,2.07,11168\n',
 '470327,14,4.25,2713\n',
 '74370,23,3.75,16305\n',
 '136866,49,2.88,2712\n',
 '454268,46,8.45,27703\n',
 '160329,51,2.02,7152\n',
 '120793,6,5.08,45665\n',
 '385793,9,4.98,47287\n',
 '234679,31,3.22,20635\n',
 '49191,8,8.75,28650\n',
 '419102,29,7.36,30754\n',
 '241823,42,4.29,34218\n',
 '61259,38,4.21,1403\n',
 '168699,37,5.61,5064\n',
 '381920,53,9.35,44308\n',
 '227776,45,6.85,12241\n',
 '337469,6,6.51,3664\n',
 '265391,11,3.4

Let's start with a function `parse_header`, which takes a line as input and returns a list of column headers

In [45]:
def parse_header(header_line):
    """Split a comma-separated header line into a list of column names.

    Whitespace at both ends of the line (including the trailing newline) is
    removed before the line is split on ','.
    """
    trimmed = header_line.strip()
    column_names = trimmed.split(',')
    return column_names

In [46]:
file[0]

'amount,duration,rate,down_payment\n'

In [69]:
headers = parse_header(file[0])

In [70]:
headers

['amount', 'duration', 'rate', 'down_payment']

Next, let's define a function `parse_values`, which takes a line containing some data and returns a list of floating-point numbers

In [56]:
def parse_values(data_line):
    """Convert a comma-separated line of numbers into a list of floats.

    The line is stripped of surrounding whitespace and split on ','. An empty
    field becomes 0.0; every other field is passed to float(), which raises
    ValueError for non-numeric text.
    """
    fields = data_line.strip().split(',')
    return [0.0 if field == '' else float(field) for field in fields]

In [57]:
file[1]

'431840,11,6.22,37396\n'

In [58]:
file[1].strip().split(',')

['431840', '11', '6.22', '37396']

In [60]:
values = parse_values(file[1])

In [61]:
values

[431840.0, 11.0, 6.22, 37396.0]

Next, let's define a function `create_dict`, which takes a list of values and a list of headers as input and returns a dict

In [62]:
def create_dict(values, headers):
    """Pair each value with the header at the same position.

    Returns a dict mapping header -> value. Pairing stops at the shorter of
    the two sequences, so surplus values or headers are ignored.
    """
    return {header: value for value, header in zip(values, headers)}

In [63]:
values

[431840.0, 11.0, 6.22, 37396.0]

In [71]:
headers

['amount', 'duration', 'rate', 'down_payment']

In [72]:
create_dict(values, headers)

{'amount': 431840.0, 'duration': 11.0, 'rate': 6.22, 'down_payment': 37396.0}

#### The values and headers are combined to create a dictionary with appropriate key-value pairs
- We are now ready to pull it all together and define a `read_csv` function

In [73]:
def read_csv(path):
    """Read a comma-separated text file into a list of dictionaries.

    The first line of the file is parsed with `parse_header` to obtain the
    column names. Every following non-blank line is parsed with
    `parse_values` and combined with those names by `create_dict`.

    Returns an empty list when the file is empty. Blank lines (such as an
    extra newline at the end of the file) are skipped rather than being
    turned into a record.
    """
    result = []
    with open(path, 'r') as f:
        # Get the list of lines
        lines = f.readlines()

    # An empty file has no header line, so there is nothing to parse
    if not lines:
        return result

    # Parse the Header
    header = parse_header(lines[0])

    # Loop over the remaining lines
    for line in lines[1:]:
        # A blank line carries no data; parsing it would yield a bogus record
        if not line.strip():
            continue

        # Parse the value
        value = parse_values(line)

        # create a dictionary
        item_dict = create_dict(value, header)

        # Append dict in result
        result.append(item_dict)

    return result

In [74]:
read_csv('./data/loan.txt')

[{'amount': 431840.0, 'duration': 11.0, 'rate': 6.22, 'down_payment': 37396.0},
 {'amount': 394278.0, 'duration': 23.0, 'rate': 3.02, 'down_payment': 3278.0},
 {'amount': 339930.0, 'duration': 57.0, 'rate': 3.46, 'down_payment': 17002.0},
 {'amount': 343479.0, 'duration': 29.0, 'rate': 9.45, 'down_payment': 3353.0},
 {'amount': 443019.0, 'duration': 25.0, 'rate': 2.14, 'down_payment': 23866.0},
 {'amount': 41996.0, 'duration': 34.0, 'rate': 3.28, 'down_payment': 31997.0},
 {'amount': 468381.0, 'duration': 40.0, 'rate': 2.59, 'down_payment': 1681.0},
 {'amount': 386437.0, 'duration': 54.0, 'rate': 6.44, 'down_payment': 8016.0},
 {'amount': 459304.0, 'duration': 14.0, 'rate': 6.82, 'down_payment': 49588.0},
 {'amount': 64077.0, 'duration': 51.0, 'rate': 7.69, 'down_payment': 9884.0},
 {'amount': 199348.0, 'duration': 27.0, 'rate': 5.22, 'down_payment': 2942.0},
 {'amount': 138104.0, 'duration': 53.0, 'rate': 6.91, 'down_payment': 49964.0},
 {'amount': 32107.0, 'duration': 60.0, 'rate': 2

#### Why did we do this?
- Now we have a list of dictionaries, which is much easier to work with than the raw text of a CSV file

### This `read_csv` parses any simple CSV file (no quoted fields), with any number of rows and columns

In [82]:
def parse_header(header_line):
    """Split a comma-separated header line into a list of column names.

    Each name is stripped of surrounding whitespace, the same way
    `parse_values` strips each field, so a header written as "a, b" yields
    the keys 'a' and 'b' rather than 'a' and ' b'.
    """
    return [name.strip() for name in header_line.strip().split(',')]

def parse_values(data_line):
    """Split a comma-separated data line into a list of values.

    Each field is stripped of surrounding whitespace. Fields that float()
    accepts are returned as floats; all other fields (including empty ones)
    are returned unchanged as strings.
    """
    def _number_or_text(text):
        # Convert numeric values to float; keep text values as strings
        try:
            return float(text)
        except ValueError:
            return text

    fields = data_line.strip().split(',')
    return [_number_or_text(field.strip()) for field in fields]

def create_dict(values, headers):
    """Build a dict that maps each header to the value at the same position.

    Pairing stops at the shorter of the two sequences, so surplus values or
    headers are ignored.
    """
    pairs = ((header, value) for value, header in zip(values, headers))
    return dict(pairs)

def read_csv(path):
    """Read a comma-separated text file into a list of dictionaries.

    The first line is parsed with `parse_header` to obtain the column names.
    Each following non-blank line is parsed with `parse_values` and combined
    with those names by `create_dict`.

    Returns an empty list when the file is empty. Blank lines (such as an
    extra newline at the end of the file) are skipped rather than being
    turned into a record.
    """
    result = []
    with open(path, 'r') as f:
        # Pull the header line off the file; None means the file is empty
        header_line = next(f, None)
        if header_line is None:
            return result
        header = parse_header(header_line)

        # Iterate over the remaining lines without loading them all at once
        for line in f:
            # A blank line carries no data; parsing it would yield a bogus record
            if not line.strip():
                continue
            value = parse_values(line)
            item_dict = create_dict(value, header)
            result.append(item_dict)

    return result

In [85]:
data = read_csv('./data/car_data.txt')

In [84]:
def write_csv(items, path):
    """Write a list of dictionaries to `path` as comma-separated text.

    The keys of the first item become the header line. Each item is then
    written as one line, with its values converted by str() in header order;
    a key missing from an item is written as an empty field. The file is
    opened (and therefore created or truncated) even when `items` is empty,
    in which case nothing is written to it.
    """
    with open(path, 'w') as out:
        if len(items) == 0:
            return

        columns = list(items[0].keys())
        out.write(','.join(columns) + '\n')

        for record in items:
            cells = [str(record.get(column, "")) for column in columns]
            out.write(','.join(cells) + '\n')

In [86]:
write_csv(data, './data/new_items.txt')