## How to import data

In [51]:
# For a CSV (comma-separated values) file, pandas can load it straight into a DataFrame
import pandas as pd
data = pd.read_csv('Popular_Baby_Names.csv')
# Show the first rows to confirm the file was parsed into the expected columns
data.head()

Unnamed: 0,Year of Birth,Gender,Child's First Name,Count,Rank
0,2011,FEMALE,GERALDINE,13,75
1,2011,MALE,VINCENT,71,13
2,2011,MALE,VICTOR,17,49
3,2011,MALE,TYLER,31,35
4,2011,MALE,TRAVIS,11,55


In [52]:
# Peek at the raw text of the CSV: each line is one record with comma-joined fields
with open('Popular_Baby_Names.csv') as f:
    lines = list(f)  # iterating a text file yields its lines, line endings included
lines[:10]

["Year of Birth,Gender,Child's First Name,Count,Rank\n",
 '2011,FEMALE,GERALDINE,13,75\n',
 '2011,MALE,VINCENT,71,13\n',
 '2011,MALE,VICTOR,17,49\n',
 '2011,MALE,TYLER,31,35\n',
 '2011,MALE,TRAVIS,11,55\n',
 '2011,MALE,TONY,16,50\n',
 '2011,MALE,TIMOTHY,16,50\n',
 '2011,MALE,THOMAS,18,48\n',
 '2011,MALE,TERRY,11,55\n']

## Different raw data formats: our goal is to convert the data we need into a format pandas accepts

In [None]:
## Template for reading any text file: replace the placeholder with a real path before running
with open("Your file name") as f:   # placeholder -- change to your file name
    content = f.readlines()         # read every line of the file into a list
print(type(content))                # print the type of content
print(content[0:10])                # print the first 10 lines (indices 0-9)

In [67]:
# Same kind of data, but the fields are separated by spaces instead of commas.
# Look at the raw lines of the text file first.
with open('Popular_Baby_Names_space.txt') as f:
    lines = [raw_line for raw_line in f]
lines[:10]

["Year_of_Birth Gender Child's_First_Name Count Rank\n",
 '2011 FEMALE GERALDINE 13 75\n',
 '2011 FEMALE GIA 21 67\n',
 '2011 FEMALE GIANNA 49 42\n',
 '2011 FEMALE GISELLE 38 51\n',
 '2011 FEMALE GRACE 36 53\n',
 '2011 FEMALE GUADALUPE 26 62\n',
 '2011 FEMALE HAILEY 126 8\n',
 '2011 FEMALE HALEY 14 74\n',
 '2011 FEMALE HANNAH 17 71\n']

### How a DataFrame can be created

In [68]:
# Build a small DataFrame by hand: a list of column labels plus one inner list per row
columns = ['a', 'b', 'c']
data = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]
df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [69]:
# A new column can be added by assigning a list with one value per existing row
df['d']=[10,11,12]
df

Unnamed: 0,a,b,c,d
0,1,2,3,10
1,4,5,6,11
2,7,8,9,12


In [70]:
# Now look again at the raw lines read from the space-separated file (a plain list of strings, not a DataFrame yet)
lines[0:10]

["Year_of_Birth Gender Child's_First_Name Count Rank\n",
 '2011 FEMALE GERALDINE 13 75\n',
 '2011 FEMALE GIA 21 67\n',
 '2011 FEMALE GIANNA 49 42\n',
 '2011 FEMALE GISELLE 38 51\n',
 '2011 FEMALE GRACE 36 53\n',
 '2011 FEMALE GUADALUPE 26 62\n',
 '2011 FEMALE HAILEY 126 8\n',
 '2011 FEMALE HALEY 14 74\n',
 '2011 FEMALE HANNAH 17 71\n']

In [71]:
# Each line holds Year_of_Birth, Gender, Child's_First_Name, Count and Rank, with the fields separated by spaces
# Python strings have a built-in method called split: with no argument it splits on whitespace, and you can choose the separator with split(",")
sent = 'This is a sentence sperate by space.'
print(sent)
print(f'We use default split function, the result is {sent.split()}')
sent_comma = 'Another sentence with apple,banana,cat'
print(sent_comma)
print(f'We use comma as seperator, the result is {sent_comma.split(",")}')


This is a sentence sperate by space.
We use default split function, the result is ['This', 'is', 'a', 'sentence', 'sperate', 'by', 'space.']
Another sentence with apple,banana,cat
We use comma as seperator, the result is ['Another sentence with apple', 'banana', 'cat']


In [72]:
# split() handles space-separated words, so splitting the header line (lines[0]) gives the list of all column names
columns=lines[0].split()
print(f'This will be the columns names: {columns}')


This will be the columns names: ['Year_of_Birth', 'Gender', "Child's_First_Name", 'Count', 'Rank']


In [73]:
# Next come the data rows. A list grows one item at a time with append(),
# so start from an empty list and add the split fields of every line after the header.
content = []
for row_text in lines[1:]:
    fields = row_text.split()
    content.append(fields)

print(f'Check the first 10 rows of the content: {content[0:10]}')

Check the first 10 rows of the content: [['2011', 'FEMALE', 'GERALDINE', '13', '75'], ['2011', 'FEMALE', 'GIA', '21', '67'], ['2011', 'FEMALE', 'GIANNA', '49', '42'], ['2011', 'FEMALE', 'GISELLE', '38', '51'], ['2011', 'FEMALE', 'GRACE', '36', '53'], ['2011', 'FEMALE', 'GUADALUPE', '26', '62'], ['2011', 'FEMALE', 'HAILEY', '126', '8'], ['2011', 'FEMALE', 'HALEY', '14', '74'], ['2011', 'FEMALE', 'HANNAH', '17', '71'], ['2011', 'FEMALE', 'HAYLEE', '17', '71']]


In [74]:
# Sanity check: list every row that split into more fields than there are column names
# (for example a name containing a space). An empty list means every row fits the header.
[row for row in content if len(row) > len(columns)]

[]

In [75]:
# Now we can pass both the content (rows) and the columns (header) to pd.DataFrame.
# Every value came from str.split(), so all columns hold strings at this point.
df = pd.DataFrame(data=content, columns=columns)
# End the cell with the expression itself so Jupyter renders the table, as the other cells do.
df.head()

  Year_of_Birth  Gender Child's_First_Name Count Rank
0          2011  FEMALE          GERALDINE    13   75
1          2011  FEMALE                GIA    21   67
2          2011  FEMALE             GIANNA    49   42
3          2011  FEMALE            GISELLE    38   51
4          2011  FEMALE              GRACE    36   53


In [76]:
#that's great, now we try to write this to a function for future use
import pandas as pd
def to_df(lines: list, sep=None) -> pd.DataFrame:
    """Turn the lines of a delimited text file into a DataFrame.

    The first non-blank line supplies the column names and every later
    non-blank line becomes one row. All values are kept as strings.

    Parameters
    ----------
    lines : list of str
        Raw lines, for example the result of ``f.readlines()``.
    sep : str, optional
        Field separator handed to ``str.split``. The default ``None``
        splits on any run of whitespace; pass ``","`` for comma-separated text.

    Returns
    -------
    pd.DataFrame
        One column per header field. Empty when ``lines`` holds no non-blank line.
    """
    # Blank lines would otherwise turn into rows of missing values, so drop them up front.
    # Only the line ending is stripped, so an explicit separator never sees a trailing newline.
    rows = [line.rstrip("\r\n").split(sep) for line in lines if line.strip()]
    if not rows:
        # Nothing to parse: return an empty frame instead of failing on lines[0].
        return pd.DataFrame()
    header, *records = rows
    return pd.DataFrame(data=records, columns=header)

In [78]:
# Run the function on the same lines; the result matches the DataFrame built step by step above
data = to_df(lines)
data.head()

Unnamed: 0,Year_of_Birth,Gender,Child's_First_Name,Count,Rank
0,2011,FEMALE,GERALDINE,13,75
1,2011,FEMALE,GIA,21,67
2,2011,FEMALE,GIANNA,49,42
3,2011,FEMALE,GISELLE,38,51
4,2011,FEMALE,GRACE,36,53


In [79]:
# What if the file mixes extra text (section titles, divider lines) in between the data?
# The string below only illustrates such a layout; it is not assigned to anything.
"""
Below is Data of Year 2011
============================================================
Year_of_Birth Gender Child's_First_Name Count Rank
2011 FEMALE GERALDINE 13 75
2011 MALE VINCENT 71 13

Below is Data of Year 2012
============================================================
Year_of_Birth Gender Child's_First_Name Count Rank
2012 FEMALE EMERSON 11 82
2012 FEMALE VICKY 25 30
"""
with open('Popular_Baby_Names_space_spl_year.txt') as f:
    lines = list(f)  # one list entry per line of the file, line endings included
lines[:10]

['Below is Data of Year 2011\n',
 "Year_of_Birth Gender Child's_First_Name Count Rank\n",
 '2011 FEMALE GERALDINE 13 75\n',
 '2011 MALE VINCENT 71 13\n',
 '2011 MALE VICTOR 17 49\n',
 '2011 MALE TYLER 31 35\n',
 '2011 MALE TRAVIS 11 55\n',
 '2011 MALE TONY 16 50\n',
 '2011 MALE TIMOTHY 16 50\n']

In [83]:
#In this case, observing data, you'll see there's a special character '=' inside
#So we want to get location of each '=' in list


1