<a href="https://colab.research.google.com/github/Chood16/DSCI222/blob/main/lectures/(4)_Pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pandas

* Open-source Python library designed for working with relational or labeled data in an easy and intuitive way.

* Built on top of NumPy, reusing and extending many of its data structures.

* Provides powerful data structures (like DataFrame and Series) for manipulating numerical data and time series.

* Offers high performance and productivity for data analysis tasks.

* Commonly used in data science workflows alongside:

  * Matplotlib for data visualization

  * SciPy for statistical analysis

  * Scikit-learn for machine learning

* Enables seamless data handling, making it easier to clean, transform, and analyze datasets.

In [None]:
# To import libraries
import numpy as np
import pandas as pd

# Series

In [None]:
# Build a labelled Series: the values and their index labels are given separately
values = [-5, 1.3, 21, 6, 3]
labels = list('abcde')
s = pd.Series(data=values, index=labels)

# Show it twice: plain-text print() versus the notebook's rich display()
print(s)
display(s)

In [None]:
# Access a single element through its index label
print(s['a'])

# Assigning through the label overwrites that element in place
s['a'] = 5
s  # last expression in the cell, so the notebook shows the updated Series


In [None]:
# Don't forget to use copy!!!

# Plain assignment does not copy: s2 is just a second name for the same Series
s2 = s
s2['a'] = -5
display('s series updates as well:', s)

# .copy() creates an independent Series, so writing to s2 leaves s untouched
s2 = s.copy()
s2['a'] = 1111
display("Now that s2 was made as a copy, s won't change", s)

In [None]:
# Convert to a NumPy array
s.to_numpy()

In [None]:
# How to GET specific elements (e.g., 'b', 'd', 'c')?
# Pass a list of labels; the result follows the order of that list
s[["b", "d", "c"]]

In [None]:
# How to show all numbers above the mean?
# The comparison yields a boolean Series that is then used as a mask on s
s[s > s.mean()]

In [None]:
# Can we perform operations (+, **, etc) on Series?
# Two Series whose labels only partly overlap: s1 covers A-D, s2 covers B-E
s1 = pd.Series(range(4), index=list("ABCD"), name='Series one')
s2 = pd.Series(range(10, 14), index=list("BCDE"))
display(s1)
display(s2)

In [None]:
# What happens when the two indexes only partly overlap?
# Arithmetic aligns on index labels: labels found in only one of the
# Series ('A' and 'E' here) produce NaN in the result
s1 + s2

In [None]:
# How to show the type of each instance in the Series ?
# A Series may hold arbitrary Python objects; type() is applied element-wise
mixed_values = ['A', 1, ["I", "AM", "A", "LIST"]]
x = pd.Series(mixed_values)
x.apply(type)

In [None]:
# Even more ...
# Compare dtype and memory footprint of three Series built from similar data
x1 = pd.Series([1, 2, 3])                    # integers only
x2 = pd.Series([1, 2, "3"])                  # mixed int / str content
x3 = pd.Series([1, 2, "3"]).astype('int8')   # same mixed data coerced to int8

summaries = [
    f"{label} dtype: {ser.dtype}\n"
    f"{label} memory usage: {ser.memory_usage(deep=True)} bytes"
    for label, ser in (("x1", x1), ("x2", x2), ("x3", x3))
]
# One blank line separates the three reports
print("\n\n".join(summaries))

# Dataframes

In [None]:
# Try the following five ways to create a DataFrame

In [None]:
# Dictionary of lists <-- Most common for df
df = pd.DataFrame({"Name": ['Tom', 'Mike', 'Tiffany'], "Age": [7, 15, 3]})
display(df)

# List of lists <-- Very common as well
df = pd.DataFrame([['Tom', 7], ['Mike', 15], ['Tiffany', 3]], columns=['Name', 'Age'])
display(df)

# ndarray
# A plain np.array([...]) of mixed str/int rows is coerced to an all-string
# array, which would silently turn Age into text. dtype=object keeps the
# original Python values, and infer_objects() then gives Age a numeric dtype
# like in the other constructions.
people = np.array([['Tom', 7], ['Mike', 15], ['Tiffany', 3]], dtype=object)
df = pd.DataFrame(people, columns=['Name', 'Age']).infer_objects()
display(df)

# List of tuples (zip() returns an iterator, so materialise it into a list)
df = pd.DataFrame(list(zip(['Tom', 'Mike', 'Tiffany'], [7, 15, 3])), columns=['Name', 'Age'])
display(df)

# Dictionary of Series
df = pd.DataFrame({"Name": pd.Series(['Tom', 'Mike', 'Tiffany']), "Age": pd.Series([7, 15, 3])})
display(df)

In [None]:
# A DataFrame carries both column names and a row index
records = {
    "Name": ['Tom', 'Mike', 'Tiffany'],
    "Language": ['R', 'Python', 'Python'],
    "Courses": [7, 15, 3],
}
row_labels = [777, 888, 999]
df = pd.DataFrame(records, index=row_labels)

display(df)

In [None]:
# It's always helpful to check the data types
# map(type) applies type() to every cell and shows the Python type stored there
# NOTE: DataFrame.map needs a recent pandas (2.1+); older releases call it applymap
df.map(type)

In [None]:
# Cast Name and Courses to the pandas 'string' dtype (Language is not touched),
# then check the per-cell Python types again
df = df.astype({'Name': 'string', 'Courses': 'string'})
df.map(type)

In [None]:
# Show all observations/instances where their
# number of courses is greater than 5
# Courses was cast to 'string' above, so a direct comparison such as
#   df[df['Courses'] > 5]
# does not work; convert the column to an integer type for the test
df[df['Courses'].astype('int16') > 5]

In [None]:
# Add the next row (Name, Language, Courses):
# Kate, Golang, 6
# with index: 111
# NOTE(review): the task text originally listed a fourth value (4.0) that has
# no matching column in df - confirm whether another column was intended

# Build a one-row DataFrame carrying the new index label, then append it with concat
new_row = pd.DataFrame([{'Name':'Kate', 'Language': 'Golang', 'Courses': 6}], index = [111])
df = pd.concat([df, new_row])
display(df)

In [None]:
# There's always more than one way to do it
# Assigning to a label that is not yet in the index appends a new row;
# len(df) serves as that (so far unused) label
new_row = {'Name':'Zefran', 'Language': 'Python', 'Courses': 5}
next_label = len(df)
df.loc[next_label] = new_row
display(df)

# To change the index value: relabel the last row as 222
last_label = df.index[-1]
df = df.rename(index={last_label: 222})
display(df)

In [None]:
# Insert a column 'Credits' where Credits = Courses * 3

# It doesn't work as expected!!
# Courses was cast to 'string' earlier, so for those values `* 3` repeats the
# text instead of multiplying a number. Also note that assign() returns a new
# DataFrame: the result is only displayed here, df itself is not changed.
df.assign(Credits = lambda x: x.Courses * 3)


In [None]:
# Turn Courses back into an integer column so arithmetic on it is numeric
df['Courses'] = df['Courses'].astype('int16')
df.map(type)

In [None]:
# Attempt 2: assign() returns a new frame, so bind the result back to df
df = df.assign(Credits=lambda frame: frame['Courses'] * 3)
display(df)

# Let's remove the column and try a different way
df = df.drop('Credits', axis=1)
display(df)

# Attempt 3: plain column assignment
df['Credits'] = df['Courses'].mul(3)
display(df)

In [None]:
# Show all observations/instances where their
# number of courses is greater than 4
# when the language is Python
more_than_four = df['Courses'].gt(4)
uses_python = df['Language'].eq('Python')
df[more_than_four & uses_python]


In [None]:
# What if we want just the names of these students?
# Select the rows and the column in a single .loc call rather than chaining
# df[mask]['Name'], which first builds an intermediate DataFrame

df.loc[(df['Courses'] > 4) & (df['Language'] == 'Python'), 'Name']

In [None]:
# Create a dataset, fill out with random integer values [-7, 13),
# with 270 rows and 50 columns.
# The column names will follow the format:
# Col_1, Col_2, ... , Col_50
# The index values will follow the format:
# 2029-01-01, 2029-01-02, ..., 2029-09-27
n_rows, n_cols = 270, 50

# Random data (the upper bound 13 is exclusive)
rnd_data = np.random.randint(low=-7, high=13, size=(n_rows, n_cols))
print(rnd_data.shape)

# Column names Col_1 ... Col_<n_cols>, built with a list comprehension
dt_col = [f'Col_{i}' for i in range(1, n_cols + 1)]

# Index: one calendar day per row starting on 2029-01-01
# (the same range could be requested with end='2029-09-27' instead of periods)
dt_idx = pd.date_range(start='2029-01-01', periods=n_rows, freq='D')

df_rnd = pd.DataFrame(data=rnd_data, index=dt_idx, columns=dt_col)
display(df_rnd)
display(df_rnd.head())

In [None]:
# Sort DataFrame by the values in Column 3 using .sort_values
# ascending=False puts the largest Col_3 values first; head() keeps the first rows
df_rnd.sort_values(by = 'Col_3', ascending = False).head()

In [None]:
# Create a copy of the df
# Replace all negative numbers  by 'Not a Number' (np.nan)
# Replace all numbers > 10  by 'Infinite' (np.inf)

# np.nan and np.inf are floats and cannot be stored in integer columns.
# Casting to float up front (astype returns a new DataFrame, so df_rnd is
# left untouched) avoids relying on pandas implicitly upcasting the columns
# during assignment, a behaviour recent pandas releases have deprecated.
df_cpy = df_rnd.astype('float64')

# Nan
df_cpy[df_cpy < 0] = np.nan

# Inf (cells already set to NaN compare as False and are left alone)
df_cpy[df_cpy > 10] = np.inf
display(df_cpy.head())

In [None]:
# Verify the data type of the original dataframe
# Verify the data type of the new dataframe
# (head() limits each listing to the first few columns)

display(df_rnd.dtypes.head())  # original random-integer frame
display(df_cpy.dtypes.head())  # copy that received np.nan / np.inf

In [None]:
# Histogram of how many NaN values each column contains
import matplotlib.pyplot as plt

nan_mask = df_cpy.isna()
a = nan_mask.sum()  # NaN count per column
plt.hist(a, bins=16)
plt.show()