In [20]:
# learn to use pandas, how to import pandas, and theory. 

import pandas as pd


In [21]:
df = pd.read_csv('names/yob1983.txt', names=['name','sex', 'birthcnt'])

In [22]:
df

Unnamed: 0,name,sex,birthcnt
0,Jennifer,F,54342
1,Jessica,F,45282
2,Amanda,F,33753
3,Ashley,F,33293
4,Sarah,F,27231
5,Melissa,F,23473
6,Nicole,F,22397
7,Stephanie,F,22327
8,Heather,F,20747
9,Elizabeth,F,19843


In [36]:
df['birthcnt'].sum()

3462826

In [23]:
def get_initial(s):
    return s[0]

df['initial'] = df['name'].apply(get_initial)
df

Unnamed: 0,name,sex,birthcnt,initial
0,Jennifer,F,54342,J
1,Jessica,F,45282,J
2,Amanda,F,33753,A
3,Ashley,F,33293,A
4,Sarah,F,27231,S
5,Melissa,F,23473,M
6,Nicole,F,22397,N
7,Stephanie,F,22327,S
8,Heather,F,20747,H
9,Elizabeth,F,19843,E


In [25]:
# to check how many M & F are in the data

df['sex'].value_counts()

F    12065
M     7338
Name: sex, dtype: int64

In [31]:
# now the same values as % with normalize=True

df['sex'].value_counts(normalize=True)

F    0.621811
M    0.378189
Name: sex, dtype: float64

In [33]:
df.birthcnt.mean()

178.4685873318559

In [34]:
df.groupby('sex').birthcnt.mean()

sex
F    138.447659
M    244.270237
Name: birthcnt, dtype: float64

In [35]:
df.groupby(['sex', 'initial']).birthcnt.mean()

sex  initial
F    A          219.429695
     B          138.230563
     C          144.002994
     D           82.292818
     E          236.558739
     F           49.269231
     G           67.140909
     H          267.657895
     I           42.614754
     J          219.456916
     K          151.079511
     L          117.015656
     M          163.508368
     N          125.209026
     O           33.794521
     P          100.936416
     Q           16.566038
     R          121.825312
     S          113.601732
     T           83.406863
     U           23.846154
     V          161.720497
     W           99.561404
     X           19.384615
     Y           45.537815
     Z           19.033898
M    A          227.590835
     B          352.325077
     C          299.754132
     D          227.471212
     E          192.311419
     F          117.753731
     G          163.288000
     H           65.474227
     I          118.941860
     J          496.398374
     K         

# How do I read a tabular data file into pandas?


In [37]:
# import pandas. 

import pandas as pd

In [41]:
# normally we have to use the path where we have the files that we need to work with.

pd.read_table('http://bit.ly/chiporders')

# save it as DataFrame:

orders = pd.read_table('http://bit.ly/chiporders')

In [43]:
orders.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [44]:
# check another example with a wrong table and how to fixed!

pd.read_table('http://bit.ly/movieusers')

# we can see in the table that everything is actually in one column
# we need to communicate to pandas and tell that use this pipe separator

Unnamed: 0,1|24|M|technician|85711
0,2|53|F|other|94043
1,3|23|M|writer|32067
2,4|24|M|technician|43537
3,5|33|F|other|15213
4,6|42|M|executive|98101
5,7|57|M|administrator|91344
6,8|36|M|administrator|05201
7,9|29|M|student|01002
8,10|53|M|lawyer|90703
9,11|39|F|other|30329


In [46]:
# how to fix a table in pandas? 
# How do I tell pandas to read the table with the correct separation?

# we need to tell pandas that the caracter '|' is our separator

pd.read_table('http://bit.ly/movieusers', sep='|')

# now each of the fields are in their own column.

Unnamed: 0,1,24,M,technician,85711
0,2,53,F,other,94043
1,3,23,M,writer,32067
2,4,24,M,technician,43537
3,5,33,F,other,15213
4,6,42,M,executive,98101
5,7,57,M,administrator,91344
6,8,36,M,administrator,05201
7,9,29,M,student,01002
8,10,53,M,lawyer,90703
9,11,39,F,other,30329


In [47]:
# we have another problem the first row is being interpreted as the header row
# and that is incorrect because it is a data row infact the first one.
# we nned to tell pandas that there is no header

pd.read_table('http://bit.ly/movieusers', sep='|', header=None)

# It will replace just the first row with integers but the data will be in the data section


Unnamed: 0,0,1,2,3,4
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
5,6,42,M,executive,98101
6,7,57,M,administrator,91344
7,8,36,M,administrator,05201
8,9,29,M,student,01002
9,10,53,M,lawyer,90703


In [49]:
# we probably want to have column names, so we can define them
# We need to create a python list called "user_cols" there are strings 
# and happen to know what each of these columns respresent 


user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
# once we have created the names of the columns we can added it next to "header=None"
# with the name of the list "user_cols"

# pd.read_table('http://bit.ly/movieusers', sep='|', header=None) to -->
pd.read_table('http://bit.ly/movieusers', sep='|', header=None, names=user_cols)

# as you can see once you run it you will get the list with the different names in the hearder
# and it is ready for as to work with

# now we can save it as DataFrame object like :

users = pd.read_table('http://bit.ly/movieusers', sep='|', header=None, names=user_cols)

# from now on every time that we want to check it, we can just print it out
# using the name "users".

In [50]:
users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [51]:
users.tail()

Unnamed: 0,user_id,age,gender,occupation,zip_code
938,939,26,F,student,33319
939,940,32,M,administrator,2215
940,941,20,M,student,97229
941,942,48,F,librarian,78209
942,943,22,M,student,77841


# How do I select a pandas Series from a DataDrame

In [53]:

#there are two basic object types in pandas that hold data and one is called
# the DATAFRAME and it is basically just a table of rows and columns and then each of those
# columns is known as a pandas series.

# YOU CAN HAVE A PANDA SERIES THAT IS NOT PART OF A DATAFRAME but mostly
# we work with series that are part of a data frame.

#The question is how do I select a series from it perhaps to do an analysis 
#on a particular series or to manipulate that series 

