In [1]:
import numpy as np
import pandas as pd

In [2]:
x = pd.Series([10,20,30,40,50])

In [3]:
x

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [4]:
x.dtype

dtype('int64')

In [5]:
data = [120,680,870]
# index = ['Don', 'mike', 'edwin']
index = [2010, 2020, 2030]

sales = pd.Series(data, index)
sales.index.name = "Year"

In [6]:
sales

Year
2010    120
2020    680
2030    870
dtype: int64

In [7]:
sales.index

Int64Index([2010, 2020, 2030], dtype='int64', name='Year')

In [8]:
sales[2010]


120

In [9]:
sales.loc[2010]

120

In [10]:
sales.iloc[0]

120

In [11]:
sales

Year
2010    120
2020    680
2030    870
dtype: int64

In [12]:
sales > 500

Year
2010    False
2020     True
2030     True
dtype: bool

In [13]:
sales[sales > 500]

Year
2020    680
2030    870
dtype: int64

In [14]:
sales.iloc[2]

870

In [15]:
sales[sales > 500].index

Int64Index([2020, 2030], dtype='int64', name='Year')

In [16]:
sales[sales > 500].values

array([680, 870], dtype=int64)

In [17]:
870 in sales

False

In [18]:
# change to dict
sales.to_dict()

{2010: 120, 2020: 680, 2030: 870}

In [19]:
sales_dict = {
    "Don": 534,
    "Mike":453,
    "Edwin": 412
}

sales_ser = pd.Series(sales_dict)
sales_ser

Don      534
Mike     453
Edwin    412
dtype: int64

In [20]:
new_sales = pd.Series(sales_dict, index=['Don', 'Mike', 'sally', 'Edwin'])

In [21]:
new_sales

Don      534.0
Mike     453.0
sally      NaN
Edwin    412.0
dtype: float64

In [22]:
np.isnan(new_sales)

Don      False
Mike     False
sally     True
Edwin    False
dtype: bool

In [23]:
pd.isnull(new_sales)

Don      False
Mike     False
sally     True
Edwin    False
dtype: bool

In [24]:
new_sales.loc['sally']

nan

In [25]:
new_sales.loc['sally'] = 548

In [26]:
new_sales.loc['sally']

548.0

In [27]:
new_sales.name = "Total tv sales"

In [28]:
new_sales

Don      534.0
Mike     453.0
sally    548.0
Edwin    412.0
Name: Total tv sales, dtype: float64

### Data frame

- Two dimensional 
- size-mutable
- heterpgenous
- rows and columns (records and series)


In [29]:
sales_df = pd.DataFrame(new_sales)

In [30]:
sales_df

Unnamed: 0,Total tv sales
Don,534.0
Mike,453.0
sally,548.0
Edwin,412.0


In [31]:
data = [['Adrian', 20], ['Bethany', 23], ['Chloe', 41]]
df = pd.DataFrame(data, columns=["Name", "Age"])

In [32]:
df

Unnamed: 0,Name,Age
0,Adrian,20
1,Bethany,23
2,Chloe,41


In [33]:
new_dict = {
    'Name':['Tom', 'Jane', 'steve', 'lucy'],
    "Sales": [250,300,350,420]
}

df_dict = pd.DataFrame(new_dict)
df_dict

Unnamed: 0,Name,Sales
0,Tom,250
1,Jane,300
2,steve,350
3,lucy,420


In [34]:
list_dict = [
    
    {'Name': 'Tom', 'Sales': 300},
    {"Name": 'Greg'}, 
    {"Name": 'Ahmed' }
]

In [35]:
list_df = pd.DataFrame(list_dict)

In [36]:
list_df

Unnamed: 0,Name,Sales
0,Tom,300.0
1,Greg,
2,Ahmed,


In [37]:
list_df.drop(1, axis = 0)

Unnamed: 0,Name,Sales
0,Tom,300.0
2,Ahmed,


In [38]:
list_df.fillna(5)

Unnamed: 0,Name,Sales
0,Tom,300.0
1,Greg,5.0
2,Ahmed,5.0


In [39]:
df_dict

Unnamed: 0,Name,Sales
0,Tom,250
1,Jane,300
2,steve,350
3,lucy,420


In [40]:
east = pd.Series([1000,1200,3400],index=['Q1','Q2','Q3'])
west = pd.Series([1100,1300,2400,3500],index=['Q1','Q2','Q3','Q4'])


In [41]:
df_region = pd.DataFrame({'East':east, 'West': west})
df_region['North'] = [2000,3000,2500,4000]
df_region['South'] = [1500,2000,1500,4000]

In [42]:
df_region

Unnamed: 0,East,West,North,South
Q1,1000.0,1100,2000,1500
Q2,1200.0,1300,3000,2000
Q3,3400.0,2400,2500,1500
Q4,,3500,4000,4000


In [43]:
df_region['years'] = ['2016', '2017','2018','2019']

In [44]:
df_region

Unnamed: 0,East,West,North,South,years
Q1,1000.0,1100,2000,1500,2016
Q2,1200.0,1300,3000,2000,2017
Q3,3400.0,2400,2500,1500,2018
Q4,,3500,4000,4000,2019


In [45]:
df_region.set_index('years')

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,1000.0,1100,2000,1500
2017,1200.0,1300,3000,2000
2018,3400.0,2400,2500,1500
2019,,3500,4000,4000


In [46]:
df_region.set_index('years', inplace=True)

In [47]:
# now it has changed the actual variable before it was only retuening the version 
df_region

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,1000.0,1100,2000,1500
2017,1200.0,1300,3000,2000
2018,3400.0,2400,2500,1500
2019,,3500,4000,4000


In [48]:
# reindex is to select subset of the rows.

new_region = df_region.reindex(['2018','2019','2020','2021'])

In [49]:
df_region

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,1000.0,1100,2000,1500
2017,1200.0,1300,3000,2000
2018,3400.0,2400,2500,1500
2019,,3500,4000,4000


In [50]:
new_region

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,3400.0,2400.0,2500.0,1500.0
2019,,3500.0,4000.0,4000.0
2020,,,,
2021,,,,


In [51]:
re_index = new_region.reindex(columns=['North', 'South', 'New'])

In [52]:
re_index


Unnamed: 0_level_0,North,South,New
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,2500.0,1500.0,
2019,4000.0,4000.0,
2020,,,
2021,,,


### Missing values 

In [53]:
re_index.fillna(0)

Unnamed: 0_level_0,North,South,New
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,2500.0,1500.0,0.0
2019,4000.0,4000.0,0.0
2020,0.0,0.0,0.0
2021,0.0,0.0,0.0


In [54]:
new_region

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,3400.0,2400.0,2500.0,1500.0
2019,,3500.0,4000.0,4000.0
2020,,,,
2021,,,,


In [55]:
new_region.fillna(method='ffill') # copying from the one above 

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,3400.0,2400.0,2500.0,1500.0
2019,3400.0,3500.0,4000.0,4000.0
2020,3400.0,3500.0,4000.0,4000.0
2021,3400.0,3500.0,4000.0,4000.0


In [56]:
new_region.fillna(method='') # copying from the one above 

ValueError: Invalid fill method. Expecting pad (ffill) or backfill (bfill). Got 

In [None]:
new_region.loc['2022'] = [6400,7400,5200,800]

In [None]:
new_region.interpolate()

In [None]:
new_region.mean()

In [None]:
new_region.dropna()

In [None]:
new_region.dropna(axis=1, thresh=3)

In [None]:
new_region.dropna(axis=0, how='any')

In [None]:
new_region.drop(['2020','2021'])

In [None]:
new_region.loc['2017'] = [3400,2400,2500,1500,np.nan]

In [None]:
new_region.duplicated()

In [None]:
new_region.drop_duplicates()

In [None]:
new_region.West

In [None]:
new_region['West'] # better way

In [None]:
new_region[['West', 'East']]# better way

In [57]:
new_region.loc['2022']

KeyError: '2022'

In [None]:
new_region.iloc[2]

In [None]:
new_region.iloc[2,3]

In [None]:
new_region.loc[['2019','2018']]

In [None]:
new_region[new_region <= 2000]

In [None]:
new_region["North"] + new_region['South']

In [None]:
new_region['East'].add(new_region['South'])

In [None]:
new_region


In [None]:
new_region['East'].add(new_region['South'])

In [58]:
new_region['East'].add(new_region['South'], fill_value=0)

years
2018    4900.0
2019    4000.0
2020       NaN
2021       NaN
dtype: float64

In [59]:
new_region['Total'] = new_region.drop(Total, axis=1).sum(axis=1)

NameError: name 'Total' is not defined

In [60]:
new_region.sort_values(ascending=0)

TypeError: DataFrame.sort_values() missing 1 required positional argument: 'by'

In [61]:
new_region['South'].rank(ascending=0)

years
2018    2.0
2019    1.0
2020    NaN
2021    NaN
Name: South, dtype: float64

In [62]:
new_region.describe()

Unnamed: 0,East,West,North,South
count,1.0,2.0,2.0,2.0
mean,3400.0,2950.0,3250.0,2750.0
std,,777.817459,1060.660172,1767.766953
min,3400.0,2400.0,2500.0,1500.0
25%,3400.0,2675.0,2875.0,2125.0
50%,3400.0,2950.0,3250.0,2750.0
75%,3400.0,3225.0,3625.0,3375.0
max,3400.0,3500.0,4000.0,4000.0


In [63]:
new_region.mean(axis=1)

years
2018    2450.000000
2019    3833.333333
2020            NaN
2021            NaN
dtype: float64

In [64]:
new_region.sort_index().cumsum()

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,3400.0,2400.0,2500.0,1500.0
2019,,5900.0,6500.0,5500.0
2020,,,,
2021,,,,


In [65]:
new_region.std()

East             NaN
West      777.817459
North    1060.660172
South    1767.766953
dtype: float64

In [66]:
new_region

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,3400.0,2400.0,2500.0,1500.0
2019,,3500.0,4000.0,4000.0
2020,,,,
2021,,,,


In [69]:
import matplotlib as plt

In [70]:
curl -L bit.ly/sparta_iris > iris.csv

SyntaxError: invalid syntax (1949984879.py, line 1)

In [73]:
import seaborn as sns

ModuleNotFoundError: No module named 'seaborn'