# Storing data

In [25]:
dictonary = {"independent" : [1,2,3,4,5,6,7,8,9], "dependent" : [10,11,12,13,14,15,16,17,18,19]}

print(dictonary["independent"])

[1, 2, 3, 4, 5, 6, 7, 8, 9]


# Pandas

In [26]:
import pandas as pd

# DataFrames

First we can get data either by generating it or by reading it

In [27]:
data = pd.read_csv("data.csv") # This will read in a locally stored file called `data.csv`

# data = pd.DataFrame() # Used when you are generating data to use, there are a lot of different functions and methods for reading in data see the docs for more information 

The returned object now stored in `data` is a `DataFrame` - This is panda's equivalent of an excel sheet.

In [28]:
type(data)

pandas.core.frame.DataFrame

In [29]:
data # Juypter will display the frame for us

Unnamed: 0,x,y,z
0,123.154675,86.868598,a
1,83.966671,71.911116,a
2,86.307842,100.100997,a
3,135.927615,102.541512,a
4,147.046954,74.312553,a
...,...,...,...
107,554.121345,440.471173,a
108,540.367018,457.320256,a
109,573.524085,403.506937,a
110,523.626257,415.569872,a


A DataFrame has built in functions we can call to give us information about the data or technical information on itself

In [30]:
data.describe()

Unnamed: 0,x,y
count,112.0,112.0
mean,364.955075,287.293602
std,134.35807,114.785251
min,83.966671,66.526716
25%,284.300521,182.58759
50%,378.083601,302.85611
75%,478.875195,386.486757
max,604.143915,468.087672


In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       112 non-null    float64
 1   y       112 non-null    float64
 2   z       112 non-null    object 
dtypes: float64(2), object(1)
memory usage: 2.8+ KB


https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html

# Selecting columns

A DataFrame is made of multiple Series. We access a series like we would with a dictonary, with the column name as the key

In [32]:
data["z"]

0      a
1      a
2      a
3      a
4      a
      ..
107    a
108    a
109    a
110    a
111    a
Name: z, Length: 112, dtype: object

# Deleting/Dropping columns

We can delete columns completely or we can create a new dataframe with the columns missing

In [33]:
data.drop(columns="x") # We can also do index

Unnamed: 0,y,z
0,86.868598,a
1,71.911116,a
2,100.100997,a
3,102.541512,a
4,74.312553,a
...,...,...
107,440.471173,a
108,457.320256,a
109,403.506937,a
110,415.569872,a


In [34]:
# delete a column from memory

data = data.drop(columns="z")

# del data["z"] # this line will also work
# data.drop(columns="z", inplace=True) # as well as this one

data

Unnamed: 0,x,y
0,123.154675,86.868598
1,83.966671,71.911116
2,86.307842,100.100997
3,135.927615,102.541512
4,147.046954,74.312553
...,...,...
107,554.121345,440.471173
108,540.367018,457.320256
109,573.524085,403.506937
110,523.626257,415.569872


# Selecting specific rows

In [43]:
data.loc[4]

x    147.046954
y     74.312553
Name: 4, dtype: float64

In [46]:
pokemon = pd.DataFrame([[1, 2], [4, 5], [7, 8]], index=['cobra', 'viper', 'sidewinder'],columns=['max_speed', 'shield'])

pokemon.loc["cobra"]

max_speed    1
shield       2
Name: cobra, dtype: int64

`.loc` takes a key

single intergers treated as an index position

A truth table is interpeted as a mask

In [36]:
data.loc[[4,5]]

Unnamed: 0,x,y
4,147.046954,74.312553
5,139.894397,113.189774


In [37]:
pokemon.loc["viper"]

max_speed    4
shield       5
Name: viper, dtype: int64

Selecting specific rows + columns

In [38]:
data["x"].loc[4]

147.04695363855967

In [39]:
pokemon.loc["viper","max_speed"]

4

# Filtering and masks

In [53]:
filt = (data["x"] ==  300) # using 300 as it's the average value as seen above

filt

0       True
1       True
2       True
3       True
4       True
       ...  
107    False
108    False
109    False
110    False
111    False
Name: x, Length: 112, dtype: bool

In [41]:
data.loc[filt]

Unnamed: 0,x,y
27,304.835214,183.453381
29,311.548819,169.081712
32,314.818915,223.370782
36,341.991323,218.808220
37,324.141717,260.655199
...,...,...
107,554.121345,440.471173
108,540.367018,457.320256
109,573.524085,403.506937
110,523.626257,415.569872


# Correlation

In [42]:
data.corr()

Unnamed: 0,x,y
x,1.0,0.953712
y,0.953712,1.0
