In [1]:
import pandas as pd
#Pandas is the library we are going to use to manage our data

In [2]:
l = [1,2,3,4,5,6]
#In regular python we have a list

In [3]:
#In pandas we have a series
#The numbers on the left are the index (the identifiers for each value)
#The numbers on the right are our values
s = pd.Series(l)
print(s)

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64


In [4]:
#We could use other things for an index, however
#The default is a range starting at 0
s = pd.Series(l,index=["a","b","c","d","e","f"])
print(s)
#The index argument lets us set the index for our series

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64


In [5]:
s = s**2
#Pandas allows for application of math directly to our data. Here, we can square each value
#Note: this overwrites s, so running this cell a second time would square the values again
print(s)

a     1
b     4
c     9
d    16
e    25
f    36
dtype: int64


In [6]:
print(s.values)
#The values attribute of a series gives us back a numpy array (which we will cover more in these lessons)

[ 1  4  9 16 25 36]


In [7]:
#Now, let's create some data as a list of tuples
#Each tuple is one row: (name, height, weight, type), all stored as strings for now
data = [
    ("Ray Lewis", "6'1", "250", "Defense"),
    ("Tom Brady", "6'4", "225", "Offense"),
    ("Julio Jones", "6'3", "220", "Offense"),
    ("Richard Sherman", "6'3", "194", "Defense"),
]
print(data)

[('Ray Lewis', "6'1", '250', 'Defense'), ('Tom Brady', "6'4", '225', 'Offense'), ('Julio Jones', "6'3", '220', 'Offense'), ('Richard Sherman', "6'3", '194', 'Defense')]


In [8]:
df = pd.DataFrame(data)
#We can initialize a dataframe with a list of data
print(df)

                 0    1    2        3
0        Ray Lewis  6'1  250  Defense
1        Tom Brady  6'4  225  Offense
2      Julio Jones  6'3  220  Offense
3  Richard Sherman  6'3  194  Defense


In [9]:
df = pd.DataFrame(data,columns=["Name","Height","Weight","Type"])
#The columns argument lets us set the names for the columns in our dataframe
print(df)

              Name Height Weight     Type
0        Ray Lewis    6'1    250  Defense
1        Tom Brady    6'4    225  Offense
2      Julio Jones    6'3    220  Offense
3  Richard Sherman    6'3    194  Defense


In [10]:
print(df["Name"])
#To get a specific column, we index with the column name.

0          Ray Lewis
1          Tom Brady
2        Julio Jones
3    Richard Sherman
Name: Name, dtype: object


In [11]:
print(df["Height"])

0    6'1
1    6'4
2    6'3
3    6'3
Name: Height, dtype: object


In [12]:
print(df[["Name","Weight"]])
#If we index with a list of column names (note the double brackets), we can select multiple columns

              Name Weight
0        Ray Lewis    250
1        Tom Brady    225
2      Julio Jones    220
3  Richard Sherman    194


In [13]:
df["Retired"] = [True,False,False,False]
#We can also assign a new column by the above method if we give a list with the same length as the number of rows in the dataframe
print(df)

              Name Height Weight     Type  Retired
0        Ray Lewis    6'1    250  Defense     True
1        Tom Brady    6'4    225  Offense    False
2      Julio Jones    6'3    220  Offense    False
3  Richard Sherman    6'3    194  Defense    False


In [14]:
#Right now, we have height as a string representation. Let's create a function to convert it to inches
def heightConvert(x: str) -> int:
    """Convert a height string written as feet'inches into total inches.

    Besides the plain form ("6'1"), two common spellings that used to crash
    are accepted: a trailing inch mark (6'1") and a feet-only height (6'),
    whose inches are taken as 0. Plain feet'inches strings return the same
    value as before.

    Parameters
    ----------
    x : str
        Height as feet, an apostrophe, then optional inches, e.g. "6'1".

    Returns
    -------
    int
        Height in inches, i.e. feet * 12 + inches.

    Raises
    ------
    ValueError
        If x contains no apostrophe, or if the feet or inches part is not
        a whole number.
    """
    #Split the string into the feet part and the inches part at the first apostrophe
    ft, sep, inches = x.partition("'")
    if not sep:
        #Without an apostrophe we cannot tell feet from inches, so refuse to guess
        raise ValueError("height must look like 6'1, got %r" % (x,))
    #Drop surrounding whitespace and any trailing inch mark (" or '') before converting
    inches = inches.strip().rstrip("\"'").strip()
    #An empty inches part (e.g. "6'") means zero extra inches
    return int(ft)*12 + (int(inches) if inches else 0)

In [15]:
#It works!
heightConvert("6'1")

73

In [16]:
#We can use the apply() function to apply a function to every element of a series
#(called on a whole dataframe, apply() instead passes each column or row to the function)
df["Height"].apply(heightConvert)

0    73
1    76
2    75
3    75
Name: Height, dtype: int64

In [17]:
df["Height"] = df["Height"].apply(heightConvert)
#Let's assign the inches value over our current height column
print(df)

              Name  Height Weight     Type  Retired
0        Ray Lewis      73    250  Defense     True
1        Tom Brady      76    225  Offense    False
2      Julio Jones      75    220  Offense    False
3  Richard Sherman      75    194  Defense    False


In [18]:
#If we try to divide here we will run into an issue, weight is still a string representation
#We catch the TypeError and print it, so the error is still shown but the notebook can keep running top to bottom
try:
    print(df["Weight"]/df["Height"])
except TypeError as err:
    print("TypeError:", err)

TypeError: ufunc 'true_divide' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [19]:
#The dtypes attribute lets us see the data type of each column
print(df.dtypes)

Name       object
Height      int64
Weight     object
Type       object
Retired      bool
dtype: object


In [20]:
df["Weight"] = pd.to_numeric(df["Weight"])
#The pd.to_numeric function converts pandas data to numbers, look at the dtypes after it is executed
print(df.dtypes)

Name       object
Height      int64
Weight      int64
Type       object
Retired      bool
dtype: object


In [21]:
print(df["Weight"]/df["Height"])
#We can also use math involving columns. Pandas applies the division above for each row so that each row's weight value
#is divided by the height value of the row

0    3.424658
1    2.960526
2    2.933333
3    2.586667
dtype: float64


In [22]:
print(df.loc[0])
#We can use loc to get a row, where the argument is the row's index label

Name       Ray Lewis
Height            73
Weight           250
Type         Defense
Retired         True
Name: 0, dtype: object


In [23]:
print(df.loc[0,"Height"])
#Adding a second argument in will let us index both the row and column

73


In [24]:
print(df.loc[:,"Height"])
#if we wanted every row, but only the height column we would use : to say "all rows" and then height in the second argument 

0    73
1    76
2    75
3    75
Name: Height, dtype: int64


In [25]:
print(df.loc[:1,"Height"])
#This would return the height column and only the first two rows
#Unlike regular python slicing, a loc slice includes its end label, so :1 gives us rows 0 and 1

0    73
1    76
Name: Height, dtype: int64


In [26]:
print(df.loc[:1,["Height","Weight"]])
#This would return only the first two rows and the height and weight columns

   Height  Weight
0      73     250
1      76     225


In [27]:
print(df[[True,False,True,False]])
#If we give pandas a list of Trues and Falses, we can filter the data. Only True rows are shown
#The first row (index 0) has True, so it's shown, but the second row (index 1) has False so it is filtered out

          Name  Height  Weight     Type  Retired
0    Ray Lewis      73     250  Defense     True
2  Julio Jones      75     220  Offense    False


In [28]:
print(df["Weight"]>220)
#We can also check the truth of a statement such as which rows have weight values over 220

0     True
1     True
2    False
3    False
Name: Weight, dtype: bool


In [29]:
print(df[df["Weight"]>220])
#And this allows us to filter based on an argument. In this case we can print only rows with weights over 220

        Name  Height  Weight     Type  Retired
0  Ray Lewis      73     250  Defense     True
1  Tom Brady      76     225  Offense    False


In [30]:
#Let's create a second set of data
#This time heights and weights are numbers, and None marks a value we don't have
data = [
    ("Allen Robinson", 75, 250, "Offense", False),
    ("Alvin Kamara", None, 215, "Offense", False),
    ("Christian McCaffrey", 71, None, "Offense", False),
]
print(data)

[('Allen Robinson', 75, 250, 'Offense', False), ('Alvin Kamara', None, 215, 'Offense', False), ('Christian McCaffrey', 71, None, 'Offense', False)]


In [31]:
df2 = pd.DataFrame(data,columns=["Name","Height","Weight","Type","Retired"])
print(df2)
#And turn it into a second dataframe
#You'll notice some data is missing, this happens commonly working with real data
#The missing values show up as NaN, and Height and Weight are displayed as decimals (75.0) because NaN is a float

                  Name  Height  Weight     Type  Retired
0       Allen Robinson    75.0   250.0  Offense    False
1         Alvin Kamara     NaN   215.0  Offense    False
2  Christian McCaffrey    71.0     NaN  Offense    False


In [32]:
dfFinal = pd.concat([df,df2])
print(dfFinal)
#The pd.concat() function lets us put together two dataframes

                  Name  Height  Weight     Type  Retired
0            Ray Lewis    73.0   250.0  Defense     True
1            Tom Brady    76.0   225.0  Offense    False
2          Julio Jones    75.0   220.0  Offense    False
3      Richard Sherman    75.0   194.0  Defense    False
0       Allen Robinson    75.0   250.0  Offense    False
1         Alvin Kamara     NaN   215.0  Offense    False
2  Christian McCaffrey    71.0     NaN  Offense    False


In [33]:
#The two dataframes have indexes that overlap! We could fix this by resetting the index, as so....
#reset_index() gives back a new dataframe and keeps the old index as a column named "index"
#Passing drop=True would discard the old index instead of keeping it as a column
print(dfFinal.reset_index())

   index                 Name  Height  Weight     Type  Retired
0      0            Ray Lewis    73.0   250.0  Defense     True
1      1            Tom Brady    76.0   225.0  Offense    False
2      2          Julio Jones    75.0   220.0  Offense    False
3      3      Richard Sherman    75.0   194.0  Defense    False
4      0       Allen Robinson    75.0   250.0  Offense    False
5      1         Alvin Kamara     NaN   215.0  Offense    False
6      2  Christian McCaffrey    71.0     NaN  Offense    False


In [34]:
#Or we can give the argument ignore_index=True to reset it during the concat function
dfFinal = pd.concat([df,df2],ignore_index=True)
print(dfFinal)

                  Name  Height  Weight     Type  Retired
0            Ray Lewis    73.0   250.0  Defense     True
1            Tom Brady    76.0   225.0  Offense    False
2          Julio Jones    75.0   220.0  Offense    False
3      Richard Sherman    75.0   194.0  Defense    False
4       Allen Robinson    75.0   250.0  Offense    False
5         Alvin Kamara     NaN   215.0  Offense    False
6  Christian McCaffrey    71.0     NaN  Offense    False


In [35]:
#The dropna() function gets rid of any rows with missing values
print(dfFinal.dropna())

              Name  Height  Weight     Type  Retired
0        Ray Lewis    73.0   250.0  Defense     True
1        Tom Brady    76.0   225.0  Offense    False
2      Julio Jones    75.0   220.0  Offense    False
3  Richard Sherman    75.0   194.0  Defense    False
4   Allen Robinson    75.0   250.0  Offense    False


In [36]:
print(dfFinal.dropna(subset=["Height","Type"]))
#If given an argument subset, we can drop only rows with missing values from the subset 

                  Name  Height  Weight     Type  Retired
0            Ray Lewis    73.0   250.0  Defense     True
1            Tom Brady    76.0   225.0  Offense    False
2          Julio Jones    75.0   220.0  Offense    False
3      Richard Sherman    75.0   194.0  Defense    False
4       Allen Robinson    75.0   250.0  Offense    False
6  Christian McCaffrey    71.0     NaN  Offense    False


In [37]:
dfFinal = dfFinal.set_index("Name")
#Setting the index to name changes it from a column to index
print(dfFinal)

                     Height  Weight     Type  Retired
Name                                                 
Ray Lewis              73.0   250.0  Defense     True
Tom Brady              76.0   225.0  Offense    False
Julio Jones            75.0   220.0  Offense    False
Richard Sherman        75.0   194.0  Defense    False
Allen Robinson         75.0   250.0  Offense    False
Alvin Kamara            NaN   215.0  Offense    False
Christian McCaffrey    71.0     NaN  Offense    False


In [38]:
dfFinal.dropna(inplace=True,subset=["Height","Type"])
print(dfFinal)
#If we use inplace=True then the dropna function happens in place
#This means dfFinal itself is modified, so we don't need to assign the result back to a variable

                     Height  Weight     Type  Retired
Name                                                 
Ray Lewis              73.0   250.0  Defense     True
Tom Brady              76.0   225.0  Offense    False
Julio Jones            75.0   220.0  Offense    False
Richard Sherman        75.0   194.0  Defense    False
Allen Robinson         75.0   250.0  Offense    False
Christian McCaffrey    71.0     NaN  Offense    False


In [39]:
dfFinal.loc["Christian McCaffrey","Weight"] = 205
#Using loc we can overwrite data
print(dfFinal)

                     Height  Weight     Type  Retired
Name                                                 
Ray Lewis              73.0   250.0  Defense     True
Tom Brady              76.0   225.0  Offense    False
Julio Jones            75.0   220.0  Offense    False
Richard Sherman        75.0   194.0  Defense    False
Allen Robinson         75.0   250.0  Offense    False
Christian McCaffrey    71.0   205.0  Offense    False
