In [24]:
# Data Visualization
# BTech Computer Science Stream , January 2025
# Week 4 - Pandas Demonstration Notebook
# Sujithra,  Reg Number , Date: 06/01/2025

In [25]:
import numpy as np
import pandas as pd

# Why Pandas?
1. Pandas contains data structures and data manipulation tools designed to make data cleaning and analysis fast and convenient in Python. 
2. Pandas is often used with numerical computing tools like NumPy and SciPy, analytical libraries like statsmodels and scikit-learn, and data visualization libraries like matplotlib. 
3. Pandas adopts significant parts of NumPy's idiomatic style of array-based computing, especially array-based functions and a preference for data processing without for loops.
4. Pandas is designed for working with tabular or heterogeneous data. NumPy, by contrast, is best suited for working with homogeneously typed numerical array data.
5. Important data structures as Series and Dataframe

In [26]:
# A Series is a one-dimensional array-like object containing a sequence of values  of the same type 
# and an associated array of data labels, called its index.
ser_x=pd.Series([4,7,-5,-3])
ser_x

0    4
1    7
2   -5
3   -3
dtype: int64

In [27]:
#ser_x.array
ser_x.index

RangeIndex(start=0, stop=4, step=1)

In [28]:
#Series with an index identifying each data point with a labelcan be created
ser_y= pd.Series([4, 7, -5, 3], index=["d", "b", "a", "c"])
ser_y.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [29]:
# Labels in the index can be used when selecting single values or a set of values: 
ser_y["d"] = 6
ser_y[["c", "a", "d"]]

c    3
a   -5
d    6
dtype: int64

In [30]:
# Can use NumPy-like operations, such as filtering with a Boolean array, scalar multiplication, or applying math functions
# The index-value link will be preserved
#ser_y[ser_y > 0]
#ser_y * 2
#np.exp(ser_y)
"b" in ser_y

True

In [31]:
#Series can be thought og as a fixed-length, ordered dictionary, as it is a mapping of index values to data values. 
# Dictionaries can be converted to series too, for example , creating a dictionary of state and population.
dict_x = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
ser_cities = pd.Series(dict_x)
ser_cities

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [32]:
#The dictionary keys can be overriden by passing an index with the dictionary keys in the order we want
states = ["California", "Ohio", "Oregon", "Texas"]
ser_z = pd.Series(dict_x, index=states)
ser_z

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [33]:
# Values found in sdata were placed in the appropriate locations,since no value for "California" was found, it appears as NaN 
# (Not a Number), which is considered in pandas to mark missing or NA values. 
# Since "Utah" was not included in states, it is excluded from the resulting object.
# “missing,” “NA,” or “null” can be used interchangeably to refer to missing data. 
# The isna and notna functions in pandas should be used to detect missing data:
#pd.isna(ser_z)
pd.notna(ser_z)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [34]:
# A useful Series feature for many applications is that it automatically aligns by index label in arithmetic operations:
ser_cities + ser_z

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [35]:
# A DataFrame represents a rectangular table of data and contains an ordered, named collection of columns
# each of which can be a different value type (numeric, string, Boolean, etc.). 
# The DataFrame has both a row and column index; it can be thought of as a dictionary of Series all sharing the same index.
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
data_x = pd.DataFrame(data)
data_x

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [36]:
# In Jupyter notebook, pandas DataFrame objects will be displayed as a more browser-friendly HTML table.
# Head and tail to display first and last 5 rows.
# If you specify a sequence of columns, the DataFrame’s columns will be arranged in that order:
#data_x
#data_x.head()
data_x.tail()
#pd.DataFrame(data, columns=["year", "state", "pop"])

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [37]:
# Columns of data frames can be retrieved using :
#data_x["state"]
data_x.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [38]:
# data can be retrieved using loc or iloc. The difference is that 
#data_x.loc[1]
data_x.iloc[2]

state    Ohio
year     2002
pop       3.6
Name: 2, dtype: object

In [39]:
# Columns can be added to data frame, columns can be deleted using del
#data_x["debt"] = 16.5
#data_x
#data_x["eastern"] = data_x["state"] == "Ohio"
#del data_x["eastern"]
data_x


Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [40]:
val = pd.Series([-1.2, -1.5, -1.7], index=["two", "four", "five"])
#frame2["debt"] = val
#frame2

In [41]:
frame2["eastern"] = frame2["state"] == "Ohio"
frame2

NameError: name 'frame2' is not defined

In [None]:
del frame2["eastern"]
frame2.columns

In [None]:
populations = {"Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
               "Nevada": {2001: 2.4, 2002: 2.9}}

In [None]:
frame3 = pd.DataFrame(populations)
frame3

In [None]:
frame3.T

In [None]:
pd.DataFrame(populations, index=[2001, 2002, 2003])

In [None]:
pdata = {"Ohio": frame3["Ohio"][:-1],
         "Nevada": frame3["Nevada"][:2]}
pd.DataFrame(pdata)

In [None]:
frame3.index.name = "year"
frame3.columns.name = "state"
frame3

In [None]:
frame3.to_numpy()

In [None]:
frame2.to_numpy()

In [None]:
obj = pd.Series(np.arange(3), index=["a", "b", "c"])
index = obj.index
index
index[1:]

In [None]:
labels = pd.Index(np.arange(3))
labels
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
obj2
obj2.index is labels

In [None]:
frame3
frame3.columns
"Ohio" in frame3.columns
2003 in frame3.index

In [None]:
pd.Index(["foo", "foo", "bar", "bar"])

In [None]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=["d", "b", "a", "c"])
obj

In [None]:
obj2 = obj.reindex(["a", "b", "c", "d", "e"])
obj2

In [None]:
obj3 = pd.Series(["blue", "purple", "yellow"], index=[0, 2, 4])
obj3
obj3.reindex(np.arange(6), method="ffill")

In [None]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=["a", "c", "d"],
                     columns=["Ohio", "Texas", "California"])
frame
frame2 = frame.reindex(index=["a", "b", "c", "d"])
frame2

In [None]:
states = ["Texas", "Utah", "California"]
frame.reindex(columns=states)

In [None]:
frame.reindex(states, axis="columns")

In [None]:
frame.loc[["a", "d", "c"], ["California", "Texas"]]

In [None]:
obj = pd.Series(np.arange(5.), index=["a", "b", "c", "d", "e"])
obj
new_obj = obj.drop("c")
new_obj
obj.drop(["d", "c"])

In [None]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=["Ohio", "Colorado", "Utah", "New York"],
                    columns=["one", "two", "three", "four"])
data

In [None]:
data.drop(index=["Colorado", "Ohio"])

In [None]:
data.drop(columns=["two"])

In [None]:
data.drop("two", axis=1)
data.drop(["two", "four"], axis="columns")

In [None]:
obj = pd.Series(np.arange(4.), index=["a", "b", "c", "d"])
obj
obj["b"]
obj[1]
obj[2:4]
obj[["b", "a", "d"]]
obj[[1, 3]]
obj[obj < 2]

In [None]:
obj.loc[["b", "a", "d"]]

In [None]:
obj1 = pd.Series([1, 2, 3], index=[2, 0, 1])
obj2 = pd.Series([1, 2, 3], index=["a", "b", "c"])
obj1
obj2
obj1[[0, 1, 2]]
obj2[[0, 1, 2]]

In [None]:
obj1.iloc[[0, 1, 2]]
obj2.iloc[[0, 1, 2]]

In [None]:
obj2.loc["b":"c"]

In [None]:
obj2.loc["b":"c"] = 5
obj2

In [None]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=["Ohio", "Colorado", "Utah", "New York"],
                    columns=["one", "two", "three", "four"])
data
data["two"]
data[["three", "one"]]

In [None]:
data[:2]
data[data["three"] > 5]

In [None]:
data < 5

In [None]:
data[data < 5] = 0
data

In [None]:
data
data.loc["Colorado"]

In [None]:
data.loc[["Colorado", "New York"]]

In [None]:
data.loc["Colorado", ["two", "three"]]

In [None]:
data.iloc[2]
data.iloc[[2, 1]]
data.iloc[2, [3, 0, 1]]
data.iloc[[1, 2], [3, 0, 1]]

In [None]:
data.loc[:"Utah", "two"]
data.iloc[:, :3][data.three > 5]

In [None]:
data.loc[data.three >= 2]

In [None]:
ser = pd.Series(np.arange(3.))
ser
ser[-1]

In [None]:
ser

In [None]:
ser2 = pd.Series(np.arange(3.), index=["a", "b", "c"])
ser2[-1]

In [None]:
ser.iloc[-1]

In [None]:
ser[:2]

In [None]:
data.loc[:, "one"] = 1
data
data.iloc[2] = 5
data
data.loc[data["four"] > 5] = 3
data

In [None]:
data.loc[data.three == 5]["three"] = 6

In [None]:
data

In [None]:
data.loc[data.three == 5, "three"] = 6
data

In [None]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
               index=["a", "c", "e", "f", "g"])
s1
s2

In [None]:
s1 + s2

In [43]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list("bcd"),
                   index=["Ohio", "Texas", "Colorado"])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list("bde"),
                   index=["Utah", "Ohio", "Texas", "Oregon"])
df1
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [44]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [45]:
df1 = pd.DataFrame({"A": [1, 2]})
df2 = pd.DataFrame({"B": [3, 4]})
df1
df2
df1 + df2

Unnamed: 0,A,B
0,,
1,,


In [46]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list("abcd"))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                   columns=list("abcde"))
df2.loc[1, "b"] = np.nan
df1
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [None]:
df1 + df2

In [47]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [48]:
1 / df1
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [None]:
df1.reindex(columns=df2.columns, fill_value=0)

In [None]:
arr = np.arange(12.).reshape((3, 4))
arr
arr[0]
arr - arr[0]

In [None]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list("bde"),
                     index=["Utah", "Ohio", "Texas", "Oregon"])
series = frame.iloc[0]
frame
series

In [None]:
frame - series

In [None]:
series2 = pd.Series(np.arange(3), index=["b", "e", "f"])
series2
frame + series2

In [None]:
series3 = frame["d"]
frame
series3
frame.sub(series3, axis="index")

In [None]:
frame = pd.DataFrame(np.random.standard_normal((4, 3)),
                     columns=list("bde"),
                     index=["Utah", "Ohio", "Texas", "Oregon"])
frame
np.abs(frame)

In [None]:
def f1(x):
    return x.max() - x.min()

frame.apply(f1)

In [None]:
frame.apply(f1, axis="columns")

In [None]:
def f2(x):
    return pd.Series([x.min(), x.max()], index=["min", "max"])
frame.apply(f2)

In [None]:
def my_format(x):
    return f"{x:.2f}"

frame.applymap(my_format)

In [None]:
frame["e"].map(my_format)

In [None]:
obj = pd.Series(np.arange(4), index=["d", "a", "b", "c"])
obj
obj.sort_index()

In [None]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=["three", "one"],
                     columns=["d", "a", "b", "c"])
frame
frame.sort_index()
frame.sort_index(axis="columns")

In [None]:
frame.sort_index(axis="columns", ascending=False)

In [None]:
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()

In [None]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

In [None]:
obj.sort_values(na_position="first")

In [None]:
frame = pd.DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
frame
frame.sort_values("b")

In [None]:
frame.sort_values(["a", "b"])

In [None]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

In [None]:
obj.rank(method="first")

In [None]:
obj.rank(ascending=False)

In [None]:
frame = pd.DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1],
                      "c": [-2, 5, 8, -2.5]})
frame
frame.rank(axis="columns")

In [None]:
obj = pd.Series(np.arange(5), index=["a", "a", "b", "b", "c"])
obj

In [None]:
obj.index.is_unique

In [None]:
obj["a"]
obj["c"]

In [None]:
df = pd.DataFrame(np.random.standard_normal((5, 3)),
                  index=["a", "a", "b", "b", "c"])
df
df.loc["b"]
df.loc["c"]

In [None]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=["a", "b", "c", "d"],
                  columns=["one", "two"])
df

In [None]:
df.sum()

In [None]:
df.sum(axis="columns")

In [None]:
df.sum(axis="index", skipna=False)
df.sum(axis="columns", skipna=False)

In [None]:
df.mean(axis="columns")

In [None]:
df.idxmax()

In [None]:
df.cumsum()

In [None]:
df.describe()

In [None]:
obj = pd.Series(["a", "a", "b", "c"] * 4)
obj.describe()

In [None]:
price = pd.read_pickle("examples/yahoo_price.pkl")
volume = pd.read_pickle("examples/yahoo_volume.pkl")

In [None]:
returns = price.pct_change()
returns.tail()

In [None]:
returns["MSFT"].corr(returns["IBM"])
returns["MSFT"].cov(returns["IBM"])

In [None]:
returns.corr()
returns.cov()

In [None]:
returns.corrwith(returns["IBM"])

In [None]:
returns.corrwith(volume)

In [None]:
obj = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])

In [None]:
uniques = obj.unique()
uniques

In [None]:
obj.value_counts()

In [None]:
pd.value_counts(obj.to_numpy(), sort=False)

In [None]:
obj
mask = obj.isin(["b", "c"])
mask
obj[mask]

In [None]:
to_match = pd.Series(["c", "a", "b", "b", "c", "a"])
unique_vals = pd.Series(["c", "b", "a"])
indices = pd.Index(unique_vals).get_indexer(to_match)
indices

In [None]:
data = pd.DataFrame({"Qu1": [1, 3, 4, 3, 4],
                     "Qu2": [2, 3, 1, 2, 3],
                     "Qu3": [1, 5, 2, 4, 4]})
data

In [None]:
data["Qu1"].value_counts().sort_index()

In [None]:
result = data.apply(pd.value_counts).fillna(0)
result

In [None]:
data = pd.DataFrame({"a": [1, 1, 1, 2, 2], "b": [0, 0, 1, 0, 0]})
data
data.value_counts()

In [None]:
pd.options.display.max_rows = PREVIOUS_MAX_ROWS