## How to import data

In [None]:
# For a CSV (comma-separated values) file, pandas can load it straight into a DataFrame
import pandas as pd
data = pd.read_csv('Popular_Baby_Names.csv')
# head() shows only the first rows instead of the whole table
data.head()

In [None]:
# Check the raw text inside the CSV: readlines() returns one string per line
with open('Popular_Baby_Names.csv') as f:
    lines=f.readlines()
# Show only the first 10 lines
lines[:10]

## Different raw data formats: the goal is to convert the data we need into a format pandas accepts

In [None]:
## To read a text file yourself, you can use the pattern below
with open("Popular_Baby_Names_space.txt") as f:   #change the file name as needed
    lines = f.readlines()         #read the file content into a list of lines

In [22]:
print(type(lines))                #readlines() returned a list (one string per line of the file)


<class 'list'>


In [23]:
# Total number of lines read, including the header line at index 0
len(lines)

57582

In [26]:
# The first line is the header with the column names; note the trailing newline character
lines[0]

"Year_of_Birth Gender Child's_First_Name Count Rank\n"

In [27]:
# Indexing is zero-based, so an index equal to len(lines) is out of range and raises IndexError (shown on purpose)
lines[57582]

IndexError: list index out of range

In [28]:
# The last valid index is len(lines) - 1
lines[57581]

'2014 MALE Zev 49 65\n'

In [29]:
# A negative index counts from the end: -1 is also the last line
lines[-1]

'2014 MALE Zev 49 65\n'

In [31]:
print(lines[0:10])             #print the first 10 lines (rows 0-9; the slice end is exclusive)

["Year_of_Birth Gender Child's_First_Name Count Rank\n", '2011 FEMALE GERALDINE 13 75\n', '2011 FEMALE GIA 21 67\n', '2011 FEMALE GIANNA 49 42\n', '2011 FEMALE GISELLE 38 51\n', '2011 FEMALE GRACE 36 53\n', '2011 FEMALE GUADALUPE 26 62\n', '2011 FEMALE HAILEY 126 8\n', '2011 FEMALE HALEY 14 74\n', '2011 FEMALE HANNAH 17 71\n']


### How a DataFrame can be constructed

In [32]:
# Let's make some data: each inner list is one row, and `columns` gives the column names
# NOTE: `data` is rebound here to a plain list of rows (earlier it held the DataFrame read from the CSV)
columns = ['a', 'b', 'c']
data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
df = pd.DataFrame(data=data, columns=columns)
df

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [None]:
# You can also add a new column by assigning a list with one value per existing row
df['d']=[10,11,12]
df

In [None]:
# Now look at the raw lines again (this shows `lines`, not the DataFrame)
lines[0:10]

In [33]:
# Each line holds Year_of_Birth, Gender, Child's_First_Name, Count and Rank, separated by spaces
# Python's built-in str.split() splits on whitespace by default; pass a separator, e.g. split(","), to split on something else
sent = 'This is a sentence sperate by space.'
print(f'We use default split function for "{sent}", \nresult is {sent.split()}')
sent_comma = 'Another sentence with apple,banana,cat'
print(f'\nWe use comma as seperator for "{sent_comma}", \nresult is {sent_comma.split(",")}')


We use default split function for "This is a sentence sperate by space.", 
result is ['This', 'is', 'a', 'sentence', 'sperate', 'by', 'space.']

We use comma as seperator for "Another sentence with apple,banana,cat", 
result is ['Another sentence with apple', 'banana', 'cat']


In [None]:
# The header line is space-separated, so split() turns it into the list of column names
columns=lines[0].split()
print(f'This will be the columns names: {columns}')


In [None]:
# Now handle the content: split every line after the header into its fields.
# A list comprehension builds the same list as appending inside a for loop.
content = [line.split() for line in lines[1:]]

print(f'Check the first 10 rows of the content: {content[0:10]}')

In [None]:
# Now we can assign both content and columns to a DataFrame
df = pd.DataFrame(data=content, columns=columns)
# End the cell with the expression itself so the notebook renders the frame as a table
df.head()

In [None]:
#that's great, now we try to write this to a function for future use
import pandas as pd
def to_df(lines:list)->pd.DataFrame:
    """Build a DataFrame from whitespace-separated text lines.

    The first line supplies the column names; every following line is split
    on whitespace into one row. Values are kept as strings (no type
    conversion is done).

    Args:
        lines: Text lines with the header first, for example the result of
            ``f.readlines()``.

    Returns:
        A DataFrame with one row per non-blank data line. An empty ``lines``
        list gives an empty DataFrame.
    """
    if not lines:
        # No header line to read, so there is nothing to build.
        return pd.DataFrame()
    columns = lines[0].split()
    # Skip blank lines: split() returns [] for them and pandas would turn
    # each one into a row of missing values.
    content = [fields for fields in (line.split() for line in lines[1:]) if fields]
    return pd.DataFrame(data=content, columns=columns)

In [None]:
# Try again with the function: it repeats the same steps, so you get the same DataFrame
data = to_df(lines)
data.head()

### What if extra text lines split the data into sections?

In [None]:
# What if extra text lines split the data into sections?
# The string below only illustrates the layout; it is not assigned or used by the code.
"""
Below is Data of Year 2011
============================================================
Year_of_Birth Gender Child's_First_Name Count Rank
2011 FEMALE GERALDINE 13 75
2011 MALE VINCENT 71 13

Below is Data of Year 2012
============================================================
Year_of_Birth Gender Child's_First_Name Count Rank
2012 FEMALE EMERSON 11 82
2012 FEMALE VICKY 25 30
"""
with open('Popular_Baby_Names_space_spl_year.txt') as f:
    lines=f.readlines()
lines[0:10]

In [None]:
# Looking at the data, each section has a divider line made of '=' characters.
# Collect the index of every line that contains '=' so we know where the sections are.
location = [idx for idx, line in enumerate(lines) if '=' in line]
print(f'Location of each "=" is: {location}')
    

In [None]:
## The header of a section is the line right after its '=' divider, i.e. location + 1.
## The section stops two lines before the next divider: this skips the blank line and the
## "Below is Data of Year ..." title line shown in the sample layout (assumed to hold for the whole file).
start = location[0]+1
end = location[1]-2
print(f'first loc is {start} and end will be {end}')
# `end` is meant as an exclusive slice bound, so the last line of the section is lines[end-1]
print(f'check it: Start {lines[start]}, end: {lines[end-1]}')

In [None]:
# lines[start:end] is the header plus the data rows of the first section, so we can pass it to the previous function
to_df(lines[start:end]).head()


In [None]:
# The data arrives in separate pieces, so we need a way to stack them into one frame.
# Two small frames with identical column names stand in for two sections of the file.
columns1 = ['a', 'b', 'c']
columns2 = ['a', 'b', 'c']
data1 = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
data2 = [[10, 11, 12], [13, 14, 15], [16, 17, 18]]

df1 = pd.DataFrame(data1, columns=columns1)
df2 = pd.DataFrame(data2, columns=columns2)
print(f'df1:\n{df1}')
print(f'df2:\n{df2}')

# pd.concat stacks the rows of both frames; ignore_index=True renumbers them from 0
df = pd.concat([df1, df2], ignore_index=True)
print(f'After merge, df:\n{df}')

In [None]:
# Walk over every divider and print the head of the DataFrame built from its section (one per year)
for idx, marker in enumerate(location):
    # The header sits on the line right after the divider
    start = marker + 1
    # Stop two lines before the next divider; the last section runs to the end of the file
    end = location[idx + 1] - 2 if idx < len(location) - 1 else None
    print(to_df(lines[start:end]).head())

In [None]:
# Now combine every section into one complete DataFrame.
# Collect the per-section frames in a list and concatenate once at the end:
# calling pd.concat inside the loop would re-copy all earlier rows on every pass.
frames = []
for idx, marker in enumerate(location):
    # The header sits on the line right after the divider
    start = marker + 1
    # Stop two lines before the next divider; the last section runs to the end of the file
    end = location[idx + 1] - 2 if idx + 1 < len(location) else None
    frames.append(to_df(lines[start:end]))
# pd.concat raises on an empty list, so fall back to an empty frame when no divider was found
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df.head()

### If the data we have is in an aligned format

In [None]:
# What if the columns are aligned with runs of whitespace?
# The string below only illustrates the layout; it is not assigned or used by the code.
"""
Year_of_Birth   Gender      Child's_First_Name  Count   Rank
2011            FEMALE      GERALDINE           13      75
2011            MALE        VINCENT             71      13
2011            MALE        VICTOR              17      49
2011            MALE        TYLER               31      35
"""

with open('Popular_Baby_Names_space_tab_lign.txt') as f:
    lines=f.readlines()
lines[0:10]

In [None]:
# The previous function also applies here: split() with no argument treats any run of whitespace as one separator
df = to_df(lines)
df.head()

### Save file

In [None]:
# Once you have a DataFrame, you can perform many calculations or joins (introduced later); for now, save df to a CSV file
# index=False leaves the row index out of the written file
df.to_csv('df.csv', index=False)