# Extensions:
- Jupyter (Microsoft).

# Libraries:
Install Automatically:
```bash
chmod +x ./install.sh && ./install.sh
```

Install Manually:
```bash
pip install ipykernel  
pip install pandas
```

#### NumPy vs Pandas:
- NumPy is a package for scientific computing in Python, built around fast n-dimensional arrays, while Pandas is a package for data analysis in Python, built on top of NumPy and providing labeled, tabular data structures.

#### NumPy Arrays vs Pandas DataFrames:
- NumPy Arrays are used to store a collection of elements of the same data type, while Pandas DataFrames are used to store rows and columns of heterogeneous data.

#### Pandas DataFrames vs Pandas Series:
- Pandas Series are used to store a column of data, while Pandas DataFrames are used to store rows and columns of data.
- Pandas Series are similar to NumPy arrays, but a Series can have axis labels, which means it can be indexed by label instead of only by integer position.

In [5]:
# Import Pandas and Numpy:

import pandas as pd
import numpy as np

In [None]:
# Build a Pandas DataFrame from a dict that maps column names to value lists:

df = pd.DataFrame(
    {
        'integers': [1, 2, 3, 4],
        'floats': [1.00, 2.00, 3.00, 4.00],
        'strings': ['Breno', 'Amanda', 'Manoel', 'Marcia'],
    }
)
# Render the frame as a table in the cell output.
display(df)

In [None]:
# Create a Pandas DataFrame from a JSON file:

# Save the DataFrame `df` to 'data.json' (a relative path, so the file is
# written to the current working directory):
df.to_json('data.json')

# Read the same file back into a new DataFrame and show it:
json_df = pd.read_json('data.json')
display(json_df)

In [None]:
# Create a range of dates:
# NOTE: pd.date_range returns a DatetimeIndex, not a DataFrame.

# 10 periods starting at 2019-01-01 (no freq is passed, so the pandas default applies):
dates = pd.date_range('20190101', periods=10)
display(dates)

In [None]:
# Create a Pandas DataFrame from a CSV file:

# The CSV is fetched from a URL here, so this cell needs network access.
df_csv = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_apple_stock.csv')
display(df_csv)

In [None]:
# Quick ways to inspect a DataFrame. Only the value of the last expression in
# a cell is rendered automatically, so uncomment one alternative at a time.
df_csv.head() # first 5 rows
# df_csv.tail() # last 5 rows
# df_csv.describe() # summary statistics
# df_csv.info() # data types
# df_csv.sample(5) # random sample of 5 rows


In [None]:
# Common operations on a single column of a Pandas DataFrame:

df_csv["AAPL_y"].plot() # plot the column

# Look up the row label of the maximum once and reuse it below.
max_index = df_csv['AAPL_y'].idxmax()

display(f"df_csv['AAPL_y'].max(): {df_csv['AAPL_y'].max()}") # largest value in the column
display(f"df_csv['AAPL_y'].idxmax(): {max_index}") # row label where that value occurs
display(f"df_csv.loc[max_index]: {df_csv.loc[max_index]}") # the full row at that label

In [None]:
# Drop a column:
# inplace=True modifies df_csv itself instead of returning a new DataFrame.
# NOTE(review): re-running this cell will presumably raise a KeyError because
# 'AAPL_x' no longer exists after the first run — confirm before relying on it.
df_csv.drop(columns=['AAPL_x'], inplace=True)
display(df_csv)

In [None]:
# Create a Pandas DataFrame from an Excel file (xlsx):

# Read the Excel file named "name.xlsx". No sheet_name is passed, so pandas
# reads the first sheet (the default, sheet_name=0); pass sheet_name="sheet1"
# to select the tab named "sheet1" explicitly.
df_xlsx = pd.read_excel("name.xlsx")

# Openpyxl is a Python library to read/write Excel 2010 xlsx/xlsm/xltx/xltm files.
# LXLS is a Python library to read/write Excel 2007 xlsx/xlsm files.
# NOTE(review): "LXLS" does not look like a real package name — confirm which
# library was meant here.

In [8]:
# Create Pandas DataFrames from the tables of an HTML page:
# Requires an HTML parser to be installed, otherwise read_html raises
# "ImportError: lxml not found, please install it":
# pip install lxml

# Read the HTML page from a URL (needs network access):
url = "https://www.gov.br/receitafederal/pt-br/assuntos/meu-imposto-de-renda/tabelas/2023/"
# read_html returns a list of DataFrames, one per table it finds on the page.
df_html = pd.read_html(url)
# Show the first table found.
df_html[0]

ImportError: lxml not found, please install it

In [None]:
# Get the unique values of a Pandas DataFrame column:

# unique() returns the distinct values themselves (left commented out here):
# display(f"df_csv['AAPL_y'].unique(): {df_csv['AAPL_y'].unique()}")
# nunique() returns how many distinct values there are:
display(f"df_csv['AAPL_y'].nunique(): {df_csv['AAPL_y'].nunique()}")
# dtype is the data type of this single column:
display(f"df_csv['AAPL_y'].dtype: {df_csv['AAPL_y'].dtype}")

# Difference between dtype and dtypes:
# dtype is a property of a single Series or column of a DataFrame.
# dtypes is a property of a DataFrame and returns the dtype of each column.

In [None]:
# Count how many times each distinct value occurs in a DataFrame column:

df_csv['AAPL_y'].value_counts()

In [None]:
# Apply a function to a Pandas DataFrame:

def times2(x):
    """Return *x* multiplied by 2, using whatever ``*`` means for x's type."""
    return x * 2

# The source data is named `numbers` rather than `list` so that the built-in
# `list` type is not shadowed for the rest of the notebook session.
numbers = [1, 2, 3, 4, 5]
# apply() calls times2 on each column and returns a new DataFrame; the
# `numbers` list itself is left untouched.
df_list = pd.DataFrame(numbers).apply(times2)
display(df_list)
# Last expression of the cell: shows that the original list is unchanged.
numbers

In [None]:
# Applying a lambda function to a Pandas DataFrame:
# df_list
# apply() returns a new DataFrame. The result is not assigned here, so it is
# thrown away and df_list keeps its previous values.
df_list.apply(lambda x: x*2) # result is discarded; df_list itself is not modified
display(df_list)

In [None]:
# Keep the doubled values by rebinding df_list to the result of apply():

df_list = df_list.apply(lambda column: column * 2)
display(df_list)

In [None]:
# DataFrame columns and values:

# .columns holds the column labels of the DataFrame.
display(df_csv.columns)
# .values exposes the underlying data without the row/column labels.
display(df_csv.values)

In [None]:
# Check whether there are null values in a Pandas DataFrame:

# isnull() marks every missing cell as True; sum() then adds those up per
# column, giving the number of missing values in each column.
display(df_csv.isnull().sum())

In [12]:
# Merge two Pandas DataFrames side by side:

first_df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]})
second_df = pd.DataFrame({'C': [9, 10, 11, 12], 'D': [13, 14, 15, 16]})

display(first_df)
display(second_df)

# Join on the row index of both frames instead of on a shared column.
merged_df = first_df.merge(second_df, left_index=True, right_index=True)
display(merged_df)

Unnamed: 0,A,B
0,1,5
1,2,6
2,3,7
3,4,8


Unnamed: 0,C,D
0,9,13
1,10,14
2,11,15
3,12,16


Unnamed: 0,A,B,C,D
0,1,5,9,13
1,2,6,10,14
2,3,7,11,15
3,4,8,12,16


In [14]:
# Concatenate two Pandas DataFrames:

# Redefine second_df with 6 rows so the two frames differ in length.
second_df = pd.DataFrame({'C':[9, 10, 11, 12, 13, 14], 'D':[13, 14, 15, 16, 17, 18]})

# concat stacks the frames on top of each other. Columns that exist in only
# one frame are filled with NaN, and each frame keeps its own row labels, so
# the labels 0-3 appear twice in the result (see the output below).
concat_df = pd.concat([first_df, second_df])
display(concat_df)

Unnamed: 0,A,B,C,D
0,1.0,5.0,,
1,2.0,6.0,,
2,3.0,7.0,,
3,4.0,8.0,,
0,,,9.0,13.0
1,,,10.0,14.0
2,,,11.0,15.0
3,,,12.0,16.0
4,,,13.0,17.0
5,,,14.0,18.0


In [18]:
# GroupBy in Pandas:
# groupby collects rows that share a key into groups, so that aggregate
# functions can then be called on each group.

data = {
    'Company': ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
    'Person': ['Sam', 'Ricardo', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
    'Sales': [200, 120, 340, 124, 243, 350],
}
df = pd.DataFrame(data)

display(df)

# Group the rows by their 'Company' value; no aggregate is applied in this cell.
by_company = df.groupby('Company')

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Ricardo,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


In [26]:
# Filtering the rows of a Pandas DataFrame by value:

display(f"df.columns:{df.columns}")
display(f"df.values: {df.values}")

# Keep only the rows whose 'Sales' value is below 200, using a boolean mask:
display(df.loc[df['Sales'] < 200])


"df.columns:Index(['Company', 'Person', 'Sales'], dtype='object')"

"df.values: [['GOOG' 'Sam' 200]\n ['GOOG' 'Ricardo' 120]\n ['MSFT' 'Amy' 340]\n ['MSFT' 'Vanessa' 124]\n ['FB' 'Carl' 243]\n ['FB' 'Sarah' 350]]"

Unnamed: 0,Company,Person,Sales
1,GOOG,Ricardo,120
3,MSFT,Vanessa,124
