In [1]:
# Install Pandas
%pip install pandas

Collecting pandas
  Downloading pandas-2.2.2-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.0.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.9 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.9 kB ? eta -:--:--
     ------------------------- ------------ 41.0/60.9 kB 393.8 kB/s eta 0:00:01
     -------------------------------------- 60.9/60.9 kB 405.3 kB/s eta 0:00:00
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.2-cp312-cp312-win_amd64.whl (11.5 MB)
   ---------------------------------------- 0.0/11.5 MB ? eta -:--:--
   ---------------------------------------- 0.1/11.5 MB 1.3 MB/s eta 0:00:09
    --------------------------------------- 0.2/11.5 MB 2.8 MB/s eta 0:00:05
   - 

In [1]:
# Import Pandas
import pandas as pd

## DataFrames
A DataFrame is a two-dimensional labeled data structure with columns of potentially different data types, similar to a spreadsheet or SQL table. 
It provides a powerful and flexible way to manipulate and analyze structured data in Python, offering functionalities for data analysis.

In [2]:
# Creating an Empty Data Frame
df = pd.DataFrame()

In [3]:
# Creating a dataframe using a list of lists
data = [['Alice', 23], ['Ernest', 28], ['Troy', 25]]
df = pd.DataFrame(data, columns=['Name', 'Age'])
df

Unnamed: 0,Name,Age
0,Alice,23
1,Ernest,28
2,Troy,25


In [4]:
# Creating Dataframe using a dictionary of lists
data = {
    'Name' : ['Alice', 'Ernest', 'Troy'],
    'Age' : [23, 28, 25]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age
0,Alice,23
1,Ernest,28
2,Troy,25


In [5]:
# Creating a Dataframe using a list of dictionary
data = [{'Name': 'Alice', 'Age': 23}, {'Name': 'Ernest', 'Age': 28}, {'Name': 'Troy', 'Age': 25}]
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age
0,Alice,23
1,Ernest,28
2,Troy,25


## Series
A pandas Series is a one-dimensional labeled array capable of holding data of any type (integer, string, float, etc.). It's similar to a one-column table or an array with associated labels, providing powerful indexing and manipulation capabilities in Python.

In [6]:
s = pd.Series([1, 3, 5, 6, 7, 8])
s

0    1
1    3
2    5
3    6
4    7
5    8
dtype: int64

### **Pandas Data Types**
Numeric:
- Integer (int64): Represents whole numbers (e.g., 10, -5). This is the default integer type in pandas.
- Float (float64): Represents numbers with decimals (e.g., 3.14, -12.5).
- Boolean (bool): Represents logical True or False values.
- Object: This is a versatile but less efficient type that can store various data types like strings, lists, or custom objects. Pandas uses this type when it cannot infer a more specific data type.

In [7]:
# Float (float64)
float_series = pd.Series([3.14, 34.1, 1.02])
float_series

0     3.14
1    34.10
2     1.02
dtype: float64

In [8]:
# Boolean (bool)
bool_series = pd.Series([True, False, True])
bool_series

0     True
1    False
2     True
dtype: bool

In [9]:
# Object (object) - mixed data types
obj_series = pd.Series([1, 3.14, 'Test1', True])
obj_series

0        1
1     3.14
2    Test1
3     True
dtype: object

Specialized Data Types:
- Datetime (datetime64[ns]): Represents dates and times with nanosecond precision. Useful for time-series data analysis.
- Timedelta (timedelta64[ns]): Represents durations between timestamps.
- Categorical: Represents categorical data with predefined categories. Efficient for storing limited sets of categories.
- Sparse: Represents sparse data with many missing values. Stores data efficiently by only keeping non-zero values.

In [11]:
# DateTime (datetime64)
pd.to_datetime('2024-06-18')

Timestamp('2024-06-18 00:00:00')

In [12]:
# Datetime Series
datetime_series = pd.Series([pd.to_datetime('2024-06-18'), pd.to_datetime('2024-12-25'), pd.to_datetime('2025-01-01')])
datetime_series

0   2024-06-18
1   2024-12-25
2   2025-01-01
dtype: datetime64[ns]

In [13]:
# Timedelta (timedelta64)
pd.Timedelta(days=8, hours=13, minutes=30)
# Timedelta Series
timedelta_series = pd.Series([pd.Timedelta(days=8, hours=13, minutes=30), pd.Timedelta(hours=13, minutes=60), pd.Timedelta(hours=13, minutes=59) ])
timedelta_series

0   8 days 13:30:00
1   0 days 14:00:00
2   0 days 13:59:00
dtype: timedelta64[ns]

In [14]:
# Categorical (category)
pd.Categorical(["Sales", "Marketing", "Operations", "HR"])
# Categorical Series
category_series = pd.Series(pd.Categorical(["Sales", "Marketing", "Operations", "HR"]))
category_series

0         Sales
1     Marketing
2    Operations
3            HR
dtype: category
Categories (4, object): ['HR', 'Marketing', 'Operations', 'Sales']

In [15]:
# Sparse (Sparse[data type]) - array with missing values
sparse_series = pd.Series(pd.arrays.SparseArray([2, pd.NA, 4, pd.NA, 6, 7]))
sparse_series

0      2
1    NaN
2      4
3    NaN
4      6
5      7
dtype: Sparse[object, nan]

# Changing Datatypes
# Step 1: Check datatype
int_series.dtype

In [17]:
# Integer (int64)
int_series = pd.Series([1, 3, 5, 6, 7, 8])
int_series

0    1
1    3
2    5
3    6
4    7
5    8
dtype: int64

In [18]:
# Changing Datatypes
# Step 1: Check datatype
int_series.dtype
# Step 2: Change the datatype (float)
int_series = int_series.astype('float')
int_series
# Changing to String
int_series = int_series.astype('string')
int_series

0    1.0
1    3.0
2    5.0
3    6.0
4    7.0
5    8.0
dtype: string

**Example: Sales Data Analysis**
You have a dataset of sales transactions that includes the product name, quantity sold, and sale price. 
You want to analyze the data to find the total revenue per product.

In [21]:
data = {
    'Product Name':['A','B','C','A','B','A'],
    'Quantity Sold':[3,2,5,4,1,2],
    'Sale Price':[10,20,10,15,20,15]
}

In [23]:
# Step 1: Create a dataframe
sales_df = pd.DataFrame(data)
sales_df

Unnamed: 0,Product Name,Quantity Sold,Sale Price
0,A,3,10
1,B,2,20
2,C,5,10
3,A,4,15
4,B,1,20
5,A,2,15


# Step 2: Get the Product Name column
sales_df['Product Name']

In [22]:
data = {
    'Product Name':['A','B','C','A','B','A'],
    'Quantity Sold':[3,2,5,4,1,2],
    'Sale Price':[10,20,10,15,20,15]
}


In [25]:
# Step 3: Create a new table for Total Revenue for each row
sales_df['Total Revenue'] = sales_df['Quantity Sold'] * sales_df['Sale Price']
sales_df

Unnamed: 0,Product Name,Quantity Sold,Sale Price,Total Revenue
0,A,3,10,30
1,B,2,20,40
2,C,5,10,50
3,A,4,15,60
4,B,1,20,20
5,A,2,15,30


In [26]:
# Step 4: Group column values (groupby)
total_revenue = sales_df.groupby('Product Name')['Total Revenue'].sum()
total_revenue

Product Name
A    120
B     60
C     50
Name: Total Revenue, dtype: int64

In [27]:
sales_df['Total Revenue'] = sales_df['Quantity Sold'] * sales_df['Sale Price']
sales_df

Unnamed: 0,Product Name,Quantity Sold,Sale Price,Total Revenue
0,A,3,10,30
1,B,2,20,40
2,C,5,10,50
3,A,4,15,60
4,B,1,20,20
5,A,2,15,30


In [29]:
# Step 5: Create a new dataframe to show the data in tabular format
results_df = pd.DataFrame()
results_df['Total Revenue'] = sales_df.groupby('Product Name')['Total Revenue'].sum()
results_df

Unnamed: 0_level_0,Total Revenue
Product Name,Unnamed: 1_level_1
A,120
B,60
C,50


### **Data Selection**
Pandas provides numerous methods for selecting and indexing data in Series and DataFrames, including label-based indexing with .loc, integer-position based indexing with .iloc, and conditional selection.

In [30]:
# Check the First Two Rows of Product Names
# [start:end] - similar to slicing of list
sales_df['Product Name'][0:2]

0    A
1    B
Name: Product Name, dtype: object

In [31]:
# Check the custom amount of rows for Quantity Sold
sales_df['Quantity Sold'][2:5]

2    5
3    4
4    1
Name: Quantity Sold, dtype: int64

In [32]:
# Check in twos
sales_df['Sale Price'][::2]

0    10
2    10
4    20
Name: Sale Price, dtype: int64

"""
Index Location (.iloc)
- Will get rows based on a number/index.
- Will output into a DataFrame instead of a Series.
"""

In [33]:
# Turn the first 3 rows into a new DataFrame
sales_df.iloc[0:3]

Unnamed: 0,Product Name,Quantity Sold,Sale Price,Total Revenue
0,A,3,10,30
1,B,2,20,40
2,C,5,10,50


"""
Location (.loc)
- Access a group of rows and columns by label(s) or a boolean array.
"""

In [34]:
"""
Location (.loc)
- Access a group of rows and columns by label(s) or a boolean array.
"""
# Get only specific columns (Product Name and Sale Price)
sales_df.loc[0:3, ['Product Name', 'Sale Price']]

Unnamed: 0,Product Name,Sale Price
0,A,10
1,B,20
2,C,10
3,A,15


.iloc = using indexes or numbers
.loc = accessing via labels

They look similar right now because 0, 1, 2 and 3 are the labels for the rows 😆

If the row labels are changed, we can call it by those instead.

It explains why: iloc[0:3] and loc[0:3] are different.

iloc[0:3] = get the first 3 rows
loc[0:3] = get the rows from label "0" to label "3"

Oh…The first column of numbers are called labels! Do the labels always have to integers that looks like index?By default, they are labeled this way as it represents the order of rows in the dataframe.

If you want to change their labels...df.index = ['Row_1', 'Row_2', 'Row_3', 'Row_4']

In [36]:
# Conditional Filtering 
# Check for Total Revenues that are greater than or equal to 40
sales_df[sales_df['Total Revenue'] >= 40]

Unnamed: 0,Product Name,Quantity Sold,Sale Price,Total Revenue
1,B,2,20,40
2,C,5,10,50
3,A,4,15,60


In [37]:
# Check for Products that are eqaul to A
sales_df[sales_df['Product Name'] == 'A']

Unnamed: 0,Product Name,Quantity Sold,Sale Price,Total Revenue
0,A,3,10,30
3,A,4,15,60
5,A,2,15,30


Can we also filter strings using something similar to SQL ‘LIKE’ keywords instead of exact match?
you can try .contains!
i.e. sales_df[sales_df['Product Name'].contains("A")]

In [38]:
# Check for Products that are eqaul to A
#dataframe[dataframe[column name from the datframe] operator 'condition']
sales_df[sales_df['Product Name'] == 'A']

Unnamed: 0,Product Name,Quantity Sold,Sale Price,Total Revenue
0,A,3,10,30
3,A,4,15,60
5,A,2,15,30


Pandas == Python
SQL is it's own language

Maybe, we can associate it more with Python programming, so it's easier to understand? 🤔**Example: Filtering Customer Reviews**
A DataFrame contains customer reviews for different products, including a numeric rating. You need to filter reviews to find all reviews of a specific product with a rating of 4 or higher.

In [40]:
reviews_data = {
    'ProductID': ['P1','P2','P3','P4','P5','P6','P7','P8','P9','P10'],
    'Rating': [5,3,2,1,4,3,2,4,6,1]
}

Pandas == Python
SQL is it's own language

Maybe, we can associate it more with Python programming, so it's easier to understand? 🤔
Purpose for the 2 are different.
Python is a programming language.  Pandas is python module for data manipulation from different data sources (database, API results in json format, etc).  SQL is a query language for database (data storage).

In [41]:
# Step 1: Create a dataframe
reviews_df = pd.DataFrame(reviews_data)
reviews_df

Unnamed: 0,ProductID,Rating
0,P1,5
1,P2,3
2,P3,2
3,P4,1
4,P5,4
5,P6,3
6,P7,2
7,P8,4
8,P9,6
9,P10,1


In [42]:
# Step 2: Ratings of Prodcuts that are 4 or higher
reviews_df[reviews_df['Rating'] >= 4]

Unnamed: 0,ProductID,Rating
0,P1,5
4,P5,4
7,P8,4
8,P9,6


## Pandas Operators
Data Loading and Exploration:
- head(): Shows the first few rows of a DataFrame
- tail(): Shows the last few rows of a DataFrame
- describe(): Generates summary statistics for each column (mean, standard deviation, etc.)
- info(): Displays information about the DataFrame, including data types and memory usage
Data Analysis:
- sum(): Calculates the sum of a Series or DataFrame
- mean(): Calculates the mean of a Series or DataFrame
- median(): Calculates the median of a Series or DataFrame
- std(): Calculates the standard deviation of a Series or DataFrame
- var(): Calculates the variance of a Series or DataFrame

In [43]:
# Recalling the Reviews DataFrame
reviews_df

Unnamed: 0,ProductID,Rating
0,P1,5
1,P2,3
2,P3,2
3,P4,1
4,P5,4
5,P6,3
6,P7,2
7,P8,4
8,P9,6
9,P10,1


In [44]:
# Recalling the Reviews DataFrame
reviews_df

Unnamed: 0,ProductID,Rating
0,P1,5
1,P2,3
2,P3,2
3,P4,1
4,P5,4
5,P6,3
6,P7,2
7,P8,4
8,P9,6
9,P10,1


In [45]:
# head(n)
# first 3 rows
reviews_df.head(3)
# default - gets the first 5 rows
reviews_df.head()

Unnamed: 0,ProductID,Rating
0,P1,5
1,P2,3
2,P3,2
3,P4,1
4,P5,4


In [46]:
# tail(n)
reviews_df.tail(4)

Unnamed: 0,ProductID,Rating
6,P7,2
7,P8,4
8,P9,6
9,P10,1


# describe() - count, std, min, max, percentiles
# low STD - data are close to the mean
# high STD - data are far to the mean


In [47]:
reviews_df.describe()


Unnamed: 0,Rating
count,10.0
mean,3.1
std,1.66333
min,1.0
25%,2.0
50%,3.0
75%,4.0
max,6.0


reviews_df.describe(percentiles=[.1, .5, .9])

In [48]:
reviews_df.describe(percentiles=[.1, .5, .9])

Unnamed: 0,Rating
count,10.0
mean,3.1
std,1.66333
min,1.0
10%,1.0
50%,3.0
90%,5.1
max,6.0


In [49]:
# info()
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ProductID  10 non-null     object
 1   Rating     10 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 292.0+ bytes


In [50]:
# Mean
reviews_df['Rating'].mean()

np.float64(3.1)

In [51]:
# sum
reviews_df['Rating'].sum()

np.int64(31)

In [52]:
# Median
reviews_df['Rating'].median()

np.float64(3.0)

In [53]:
# Standard Deviation ( how far apart our ratings are from the mean )
reviews_df['Rating'].std()

np.float64(1.66332999331662)

In [54]:
# Variance (This value indicate how much the data points in average, deviate from the mean squared)
reviews_df['Rating'].var()

np.float64(2.766666666666667)