# Ramadan Abdunabi
## Week13: 04/15/2025 - 04/17/2025
### Python - Pandas Series and DataFrame

## What is Pandas?

- Pandas is an open-source Python library used for working with data sets.
- Used for analyzing, cleaning, exploring, and manipulating data.
- The name "Pandas" refers to both "Panel Data" and **"Python Data Analysis"**; the library was created by Wes McKinney in 2008.

- Pandas allows us to analyze large data sets and draw conclusions based on statistical theory.


In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
pd.__version__

'1.0.5'

In [3]:
# Pandas Series
# A Series behaves like a single column of a table:
# a one-dimensional array that can hold data of any type.

s = Series(data=[2, 5, -6, 7, 8, 1])
s

0    2
1    5
2   -6
3    7
4    8
5    1
dtype: int64

In [4]:
# Same Series as before, this time built through the `pd` namespace
s = pd.Series(data=[2, 5, -6, 7, 8, 1])
s

0    2
1    5
2   -6
3    7
4    8
5    1
dtype: int64

In [5]:
type(s)

pandas.core.series.Series

In [6]:
s.values

array([ 2,  5, -6,  7,  8,  1], dtype=int64)

In [7]:
s.index

RangeIndex(start=0, stop=6, step=1)

In [8]:
# Labels: pass `index` to attach one label per value
s = Series(data=[10, 20, 5, -12, 9], index=list('wxyzu'))
s

w    10
x    20
y     5
z   -12
u     9
dtype: int64

In [9]:
s['x']

20

In [10]:
# Positional access: .iloc makes it explicit that 0 is a position, not a label.
# (A bare s[0] on a label-indexed Series relies on a positional fallback.)
s.iloc[0]

10

In [11]:
# First two entries, selected by position
s.iloc[:2]

w    10
x    20
dtype: int64

In [12]:
# Values 0, 10, ..., 50 labelled with the letters a, b, c, d, e, o
s = Series(np.arange(0, 60, 10), index=list("abcdeo"))
s

a     0
b    10
c    20
d    30
e    40
o    50
dtype: int32

In [13]:
# A Series built from explicit values and matching index labels
summer_olympic = Series(
    data=[2520, 1122, 937, 847, 713],
    index=['USA', 'Russia', 'Germany', 'UK', 'France'],
)

In [14]:
summer_olympic

USA        2520
Russia     1122
Germany     937
UK          847
France      713
dtype: int64

In [15]:
summer_olympic['USA']

2520

In [16]:
summer_olympic[summer_olympic > 800]

USA        2520
Russia     1122
Germany     937
UK          847
dtype: int64

In [17]:
summer_olympic > 800

USA         True
Russia      True
Germany     True
UK          True
France     False
dtype: bool

In [18]:
'USA' in summer_olympic

True

In [19]:
# Convert a Series to a dictionary with to_dict(): the index labels become the keys
d = summer_olympic.to_dict()
d

{'USA': 2520, 'Russia': 1122, 'Germany': 937, 'UK': 847, 'France': 713}

In [20]:
# Convert the dictionary back to a Series: the keys become the index labels
s2 = Series(data=d)
s2

USA        2520
Russia     1122
Germany     937
UK          847
France      713
dtype: int64

In [21]:
countries = ['Russia','France','Spain','Germany','USA','UK','Italy']
countries

['Russia', 'France', 'Spain', 'Germany', 'USA', 'UK', 'Italy']

In [22]:
d

{'USA': 2520, 'Russia': 1122, 'Germany': 937, 'UK': 847, 'France': 713}

In [23]:
# Build the Series from `d`, but take the labels (and their order) from `countries`.
# Labels that have no entry in `d` are filled with NaN.
s3 = Series(data=d, index=countries)
s3

Russia     1122.0
France      713.0
Spain         NaN
Germany     937.0
USA        2520.0
UK          847.0
Italy         NaN
dtype: float64

In [24]:
# Boolean mask: True where the value is missing
s3.isnull()

Russia     False
France     False
Spain       True
Germany    False
USA        False
UK         False
Italy       True
dtype: bool

In [25]:
# Boolean mask: True where a value is present
s3.notnull()

Russia      True
France      True
Spain      False
Germany     True
USA         True
UK          True
Italy      False
dtype: bool

In [26]:
s2

USA        2520
Russia     1122
Germany     937
UK          847
France      713
dtype: int64

In [27]:
s3

Russia     1122.0
France      713.0
Spain         NaN
Germany     937.0
USA        2520.0
UK          847.0
Italy         NaN
dtype: float64

In [28]:
# Element-wise sum; values are matched by index label, not by position
s2.add(s3)

France     1426.0
Germany    1874.0
Italy         NaN
Russia     2244.0
Spain         NaN
UK         1694.0
USA        5040.0
dtype: float64

In [29]:
s3

Russia     1122.0
France      713.0
Spain         NaN
Germany     937.0
USA        2520.0
UK          847.0
Italy         NaN
dtype: float64

In [30]:
s3.index.name='Countries'

In [31]:
s3

Countries
Russia     1122.0
France      713.0
Spain         NaN
Germany     937.0
USA        2520.0
UK          847.0
Italy         NaN
dtype: float64

In [32]:
s3.name= "Summer Olympic"

In [33]:
s3

Countries
Russia     1122.0
France      713.0
Spain         NaN
Germany     937.0
USA        2520.0
UK          847.0
Italy         NaN
Name: Summer Olympic, dtype: float64

In [34]:
# What is a DataFrame?
# A DataFrame is a two-dimensional table.
# The dictionary below maps each column name to that column's values.
d = dict(
    cars=["GMC", "Toyota", "Ford"],
    seats=[8, 7, 5],
)
d

{'cars': ['GMC', 'Toyota', 'Ford'], 'seats': [8, 7, 5]}

In [35]:
df = DataFrame(d)
df

Unnamed: 0,cars,seats
0,GMC,8
1,Toyota,7
2,Ford,5


In [36]:
# No parentheses: this displays the bound method object instead of calling it
df.info

<bound method DataFrame.info of      cars  seats
0     GMC      8
1  Toyota      7
2    Ford      5>

In [37]:
# No parentheses: this displays the bound method object instead of calling it
df.describe

<bound method NDFrame.describe of      cars  seats
0     GMC      8
1  Toyota      7
2    Ford      5>

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cars    3 non-null      object
 1   seats   3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 176.0+ bytes


In [39]:
df.describe()

Unnamed: 0,seats
count,3.0
mean,6.666667
std,1.527525
min,5.0
25%,6.0
50%,7.0
75%,7.5
max,8.0


In [40]:
df['cars']

0       GMC
1    Toyota
2      Ford
Name: cars, dtype: object

In [41]:
type(df['cars'])

pandas.core.series.Series

In [42]:
df

Unnamed: 0,cars,seats
0,GMC,8
1,Toyota,7
2,Ford,5


In [43]:
df['seats']

0    8
1    7
2    5
Name: seats, dtype: int64

In [44]:
# Locate Row
df.loc[0]

cars     GMC
seats      8
Name: 0, dtype: object

In [45]:
df.iloc[0]

cars     GMC
seats      8
Name: 0, dtype: object

In [46]:
df.loc[[0,1]]

Unnamed: 0,cars,seats
0,GMC,8
1,Toyota,7


In [47]:
df.loc[[0,2]]

Unnamed: 0,cars,seats
0,GMC,8
2,Ford,5


In [48]:
type(df.loc[[0,2]])

pandas.core.frame.DataFrame

In [49]:
# Column-oriented data: every key is a column name, every list holds that column's rows
d2 = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
df2 = DataFrame(data=d2)

In [50]:
d2

{'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002, 2003],
 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [51]:
df2

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [52]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   state   6 non-null      object 
 1   year    6 non-null      int64  
 2   pop     6 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 272.0+ bytes


In [53]:
df2.describe()

Unnamed: 0,year,pop
count,6.0,6.0
mean,2001.5,2.55
std,1.048809,0.836062
min,2000.0,1.5
25%,2001.0,1.875
50%,2001.5,2.65
75%,2002.0,3.125
max,2003.0,3.6


In [54]:
df2.head(2)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7


In [55]:
df2.tail(3)

Unnamed: 0,state,year,pop
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [56]:
df2

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [57]:
# reorder the columns
df3 = DataFrame(d2, columns=['year','state','pop'])
df3

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [58]:
df3 = DataFrame(d2, columns=['year','state','pop'],
                    index=['one', 'two', 'three', 'four','five', 'six'])
df3

Unnamed: 0,year,state,pop
one,2000,Ohio,1.5
two,2001,Ohio,1.7
three,2002,Ohio,3.6
four,2001,Nevada,2.4
five,2002,Nevada,2.9
six,2003,Nevada,3.2


In [59]:
df3.columns

Index(['year', 'state', 'pop'], dtype='object')

In [60]:
df3.index

Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')

In [61]:
df3['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [62]:
df3[['state','pop']]

Unnamed: 0,state,pop
one,Ohio,1.5
two,Ohio,1.7
three,Ohio,3.6
four,Nevada,2.4
five,Nevada,2.9
six,Nevada,3.2


In [63]:
df3[['pop', 'state']]

Unnamed: 0,pop,state
one,1.5,Ohio
two,1.7,Ohio
three,3.6,Ohio
four,2.4,Nevada
five,2.9,Nevada
six,3.2,Nevada


In [64]:
df3.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [65]:
df3.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [66]:
df3

Unnamed: 0,year,state,pop
one,2000,Ohio,1.5
two,2001,Ohio,1.7
three,2002,Ohio,3.6
four,2001,Nevada,2.4
five,2002,Nevada,2.9
six,2003,Nevada,3.2


In [67]:
df3.loc['three']

year     2002
state    Ohio
pop       3.6
Name: three, dtype: object

In [68]:
type(df3.loc['three'])

pandas.core.series.Series

In [69]:
df3.loc[['three','five']]

Unnamed: 0,year,state,pop
three,2002,Ohio,3.6
five,2002,Nevada,2.9


In [70]:
type(df3.loc[['three','five']])

pandas.core.frame.DataFrame

In [71]:
df3['debt'] = 10

In [72]:
df3

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,10
two,2001,Ohio,1.7,10
three,2002,Ohio,3.6,10
four,2001,Nevada,2.4,10
five,2002,Nevada,2.9,10
six,2003,Nevada,3.2,10


In [73]:
len(df3['debt'])

6

In [74]:
np.rint(len(df3['debt']))

6.0

In [75]:
# Overwrite `debt` with 0, 1, 2, ... — one value per row
row_count = len(df3['debt'])
df3['debt'] = np.arange(row_count)
df3

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4
six,2003,Nevada,3.2,5


In [76]:
# Overwrite `debt` with pseudo-random values in [0, 10).
# Seeding the global generator first makes the column come out the same on
# every Restart & Run All; without a seed the values change on each run.
np.random.seed(42)
df3['debt'] = np.random.random(len(df3['debt'])) * 10
df3

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,4.71354
two,2001,Ohio,1.7,3.896107
three,2002,Ohio,3.6,6.308876
four,2001,Nevada,2.4,8.938152
five,2002,Nevada,2.9,9.670554
six,2003,Nevada,3.2,5.687088


In [77]:
np.random.random(len(df3['debt']))

array([0.29606758, 0.97341496, 0.52631406, 0.596847  , 0.15563869,
       0.55140582])

In [78]:
df3

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,4.71354
two,2001,Ohio,1.7,3.896107
three,2002,Ohio,3.6,6.308876
four,2001,Nevada,2.4,8.938152
five,2002,Nevada,2.9,9.670554
six,2003,Nevada,3.2,5.687088


In [79]:
df3.index

Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')

In [81]:
df3.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [82]:
# Assign a Series to one of the DataFrame's columns:
# first build a Series labelled with a subset of the row labels
s = Series(data=[-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
s

two    -1.2
four   -1.5
five   -1.7
dtype: float64

In [83]:
# Values are matched to rows by index label; rows whose label is not in `s`
# end up with a missing value in `debt`
df3['debt'] = s

In [84]:
df3

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [85]:
# Add a boolean column that is True for the Ohio rows.
# NOTE(review): 'eastren' looks like a typo for 'eastern'; the cell that deletes
# the column uses the same key, so both must be renamed together.
df3['eastren'] = df3['state'] == 'Ohio'
df3

Unnamed: 0,year,state,pop,debt,eastren
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [86]:
del df3['eastren']

In [87]:
df3

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [88]:
# Three records stored column-wise: one list per column
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
    City=['New York', 'San Francisco', 'Los Angeles'],
)
data

{'Name': ['Alice', 'Bob', 'Charlie'],
 'Age': [25, 30, 35],
 'City': ['New York', 'San Francisco', 'Los Angeles']}

In [89]:
df = DataFrame(data)
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Los Angeles


In [90]:
df['Age']

0    25
1    30
2    35
Name: Age, dtype: int64

In [93]:
df.loc[0]

Name       Alice
Age           25
City    New York
Name: 0, dtype: object

In [94]:
df[['Name','Age']]

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,Charlie,35


In [96]:
df.loc[[0,2]]

Unnamed: 0,Name,Age,City
0,Alice,25,New York
2,Charlie,35,Los Angeles


In [97]:
df[df['Age'] > 25]

Unnamed: 0,Name,Age,City
1,Bob,30,San Francisco
2,Charlie,35,Los Angeles


In [98]:
df['Age'] > 25

0    False
1     True
2     True
Name: Age, dtype: bool

In [99]:
# Nested dictionary: outer keys are state names, inner keys are years
data = dict(
    Nevada={2001: 2.4, 2002: 2.9},
    Ohio={2000: 1.5, 2001: 1.7, 2002: 3.6},
)
data

{'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [100]:
df = DataFrame(data)
df

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [101]:
df.index

Int64Index([2001, 2002, 2000], dtype='int64')

In [102]:
df.columns

Index(['Nevada', 'Ohio'], dtype='object')

In [103]:
df

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [104]:
df.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [105]:
df

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [107]:
# Re-label the rows: keep 2001 and 2002, and add 2003, which has no data in `df`
df1 = df.reindex([2001, 2002, 2003])
df1

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [108]:
df

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [109]:
df['Ohio']

2001    1.7
2002    3.6
2000    1.5
Name: Ohio, dtype: float64

In [110]:
# First two entries of the Ohio column, selected by position
df['Ohio'].iloc[:2]

2001    1.7
2002    3.6
Name: Ohio, dtype: float64

In [111]:
# Pick the two columns in the given order, then the first two rows by position
df[['Ohio', 'Nevada']].iloc[:2]

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [112]:
# Give the row index and the column index their own `name` attributes
df.index.name= 'year'
df.columns.name = 'state'
df

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [113]:
df.describe()

state,Nevada,Ohio
count,2.0,3.0
mean,2.65,2.266667
std,0.353553,1.159023
min,2.4,1.5
25%,2.525,1.6
50%,2.65,1.7
75%,2.775,2.65
max,2.9,3.6


In [114]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3 entries, 2001 to 2000
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Nevada  2 non-null      float64
 1   Ohio    3 non-null      float64
dtypes: float64(2)
memory usage: 152.0 bytes
