In [15]:
import pandas as pd

In [16]:
#series - 1 dimensional
colours = pd.Series(["red", "blue", "green"])
colours

0      red
1     blue
2    green
dtype: object

In [17]:
#another series 
makes = pd.Series(["BMW", "Toyota", "Honda"])

In [28]:
#dataframe is 2 dimensional
#dataframe can be made from combining or including multiple series'

car_data = pd.DataFrame({"Car mKe": makes, "Colour": colours})
car_data

Unnamed: 0,Car mKe,Colour
0,BMW,red
1,Toyota,blue
2,Honda,green


In [29]:
#instead of writing all of our data, we can import it
#pd.read_csv saves data into a dataframe or series

car_sales = pd.read_csv("car-sales.csv")
car_sales

#NOTE - you can also read data directly from a link; example:
#heart_disease = pd.read_csv("https://link-to-data/data/heart-disease.csv")

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [31]:
#NOTE - axis 0 = row or x axis, axis 1 = column, or y axis.

#Exporting dataframe to a .csv file
car_sales.to_csv("exported-car-sales.csv")

## Describing the data


In [33]:
#dtypes attribute shows information about the data
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price            object
dtype: object

In [36]:
# .columns returns a list of column names
car_sales.columns

Index(['Make', 'Colour', 'Odometer (KM)', 'Doors', 'Price'], dtype='object')

In [38]:
#.index returns the range of the indices, starts at 0, stops at 10
car_sales.index


RangeIndex(start=0, stop=10, step=1)

In [41]:
# .describe() returns statistical information about dataframe
car_sales.describe()
#notice how price is not on here, - because price has $ infront of number, making it an object/string instead of number


Unnamed: 0,Odometer (KM),Doors
count,10.0,10.0
mean,78601.4,4.0
std,61983.471735,0.471405
min,11179.0,3.0
25%,35836.25,4.0
50%,57369.0,4.0
75%,96384.5,4.0
max,213095.0,5.0


In [43]:
# .info similar to index + dtypes
car_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Make           10 non-null     object
 1   Colour         10 non-null     object
 2   Odometer (KM)  10 non-null     int64 
 3   Doors          10 non-null     int64 
 4   Price          10 non-null     object
dtypes: int64(2), object(3)
memory usage: 528.0+ bytes


In [45]:
#.mean() returns average of all numerical data
car_sales.mean()

Odometer (KM)    78601.4
Doors                4.0
dtype: float64

In [46]:
car_prices = pd.Series([3000,1500,111250])
#returns mean of the 3 columns
car_prices.mean()

38583.333333333336

In [48]:
#returns sum of doors in the doors column
car_sales["Doors"].sum()

40

In [49]:
#len() returns the length of the data
len(car_sales)

10

## Viewing and selecting Data

In [50]:
# .head() returns the first few rows of data
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"


In [51]:
#.tail() returns the last few rows of data
car_sales.tail()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [52]:
# .loc & .iloc
animals = pd.Series(["cat", "dog", "bird", "panda", "snake"],
                   index=[0,3,8,1,4])
animals

0      cat
3      dog
8     bird
1    panda
4    snake
dtype: object

In [53]:
# loc refers to the data of the index you choose
animals.loc[3]

'dog'

In [57]:
animals

0      cat
3      dog
8     bird
1    panda
4    snake
dtype: object

In [58]:
#iloc refers to data in position, instead of index
animals.iloc[3]

'panda'

In [60]:
#you can split dataframes using loc and iloc
#below grabs data in the first 3 positions of the dataframe/series
animals.iloc[:3]

0     cat
3     dog
8    bird
dtype: object

In [62]:
#you can select specific columns in a dataframe
car_sales["Make"]
#below is the same
#car_sales.Make

0    Toyota
1     Honda
2    Toyota
3       BMW
4    Nissan
5    Toyota
6     Honda
7     Honda
8    Toyota
9    Nissan
Name: Make, dtype: object

In [64]:
#you can use boolean logic to return only certain rows or columns
#similar to python array comprehension
#returns all makes of Toyota type
car_sales[car_sales["Make"] == "Toyota"]

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
5,Toyota,Green,99213,4,"$4,500.00"
8,Toyota,White,60000,4,"$6,250.00"
