In [1]:
""" Pandas
- Pandas is a Python library for working with tabular data.
- Pandas is used to analyze, clean, explore, and manipulate tabular data.
- Pandas allows us to analyze big data and draw conclusions based on statistical theories.
- Installation: pip install pandas

Objectives
==========
- Pandas dataframe
- Create, read, write pandas dataframe
- Dataframe methods and attributes: head, tail,  shape, info, describe,  values, columns
- Sorting
- Selecting columns
- Selecting rows
- Creating new columns
- Data frame Query
"""

import os

import pandas as pd

# Show the installed pandas version ('2.2.2' in the recorded run).
pd.__version__

'2.2.2'

In [None]:
""" Types of data 
- tabular data (*)
- image 
- video => a set of frames
- audio => 1D array of numbers
- text => 1D array of numbers

ML Steps:
1. Data preparation: raw data --> data processed for the statistical model
2. Statistical Algorithm
3. Experimentation
"""

In [2]:
""" Pandas DataFrame
- A data structure for tabular data
"""

# Column-oriented input: each key becomes a column label and each list holds
# that column's values, one entry per row.
employee_data_table = dict(
    EmployeeId=[1, 2, 3, 4, 5],
    Name=["John", "Doe", "Jane", "Smith", "Jack"],
    Salary=[20000, 30000, 40000, 50000, 60000],
)

employee_df = pd.DataFrame(data=employee_data_table)

# Preview only the first three of the five rows.
employee_df.head(n=3)

Unnamed: 0,EmployeeId,Name,Salary
0,1,John,20000
1,2,Doe,30000
2,3,Jane,40000


In [3]:
# Show the trailing rows; n=5 is tail()'s default, spelled out here for clarity.
employee_df.tail(n=5)

Unnamed: 0,EmployeeId,Name,Salary
0,1,John,20000
1,2,Doe,30000
2,3,Jane,40000
3,4,Smith,50000
4,5,Jack,60000


In [4]:
# Dimensions of the frame as a (rows, columns) tuple — (5, 3) in the recorded run.
employee_df.shape

(5, 3)

In [28]:
# Print a structural summary: index range, per-column non-null counts and dtypes, and memory usage.
employee_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   EmployeeId  5 non-null      int64 
 1   Name        5 non-null      object
 2   Salary      5 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 252.0+ bytes


In [30]:
# Summary statistics (count, mean, std, min, quartiles, max).
# In the recorded output only the numeric columns (EmployeeId, Salary) are summarized.
employee_df.describe()

Unnamed: 0,EmployeeId,Salary
count,5.0,5.0
mean,3.0,40000.0
std,1.581139,15811.388301
min,1.0,20000.0
25%,2.0,30000.0
50%,3.0,40000.0
75%,4.0,50000.0
max,5.0,60000.0


In [5]:
# .values exposes the frame's contents as a NumPy array; the second print confirms the type.
# NOTE(review): the pandas documentation recommends DataFrame.to_numpy() over .values for new code.
print(employee_df.values)
print(type(employee_df.values))

[[1 'John' 20000]
 [2 'Doe' 30000]
 [3 'Jane' 40000]
 [4 'Smith' 50000]
 [5 'Jack' 60000]]
<class 'numpy.ndarray'>


In [33]:
# NOTE(review): identical to the second print in the `.values` cell above; this cell looks redundant.
print(type(employee_df.values))

<class 'numpy.ndarray'>


In [35]:
# Relabel the columns positionally: the first label becomes "Id", the other two are unchanged.
# This mutates employee_df in place, so outputs displayed by earlier cells still show "EmployeeId".
employee_df.columns = ["Id", "Name", "Salary"]

In [36]:
# Display the leading rows (n=5 is head()'s default) to confirm the new column labels.
employee_df.head(n=5)

Unnamed: 0,Id,Name,Salary
0,1,John,20000
1,2,Doe,30000
2,3,Jane,40000
3,4,Smith,50000
4,5,Jack,60000


In [41]:
""" Read data from file """

# `os` comes from the notebook's import cell; imports are kept together at the
# top so that Restart & Run All does not depend on this cell for them.

# The dataset is expected at <parent of the current working directory>/data/homelessness.csv,
# so the resolved path depends on the directory the kernel was started in.
root_dir = os.path.dirname(os.getcwd())
data_dir = os.path.join(root_dir, "data")
dataset_path = os.path.join(data_dir, "homelessness.csv")

# Echo the resolved locations so a wrong working directory is easy to spot.
print("root_dir", root_dir)
print("data_dir", data_dir)
print("dataset_path", dataset_path)

# NOTE(review): the loaded frame shows an "Unnamed: 0" column that mirrors the row
# index — presumably the CSV was written with its index. Consider
# pd.read_csv(dataset_path, index_col=0); confirm before changing, since every
# output below would lose that column.
df = pd.read_csv(dataset_path)
df.head()

root_dir E:\PyCharmProjects\pythonProject
data_dir E:\PyCharmProjects\pythonProject\data
dataset_path E:\PyCharmProjects\pythonProject\data\homelessness.csv


Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop
0,0,East South Central,Alabama,2570.0,864.0,4887681
1,1,Pacific,Alaska,1434.0,582.0,735139
2,2,Mountain,Arizona,7259.0,2606.0,7158024
3,3,West South Central,Arkansas,2280.0,432.0,3009733
4,4,Pacific,California,109008.0,20964.0,39461588


In [40]:
# Parent of the current working directory — the same expression used for root_dir in the file-loading cell.
os.path.dirname(os.getcwd())

'E:\\PyCharmProjects\\pythonProject'

In [43]:
""" Sorting data """

# Order the rows from the largest to the smallest "individuals" value;
# sort_values returns a new frame and leaves df untouched.
sorted_df_by_individuals = df.sort_values("individuals", ascending=False)
sorted_df_by_individuals.head(n=5)

Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop
4,4,Pacific,California,109008.0,20964.0,39461588
32,32,Mid-Atlantic,New York,39827.0,52070.0,19530351
9,9,South Atlantic,Florida,21443.0,9587.0,21244317
43,43,West South Central,Texas,19199.0,6111.0,28628666
47,47,Pacific,Washington,16424.0,5880.0,7523869


In [53]:
# Sort by region in ascending order, then within each region by
# family_members from largest to smallest (one ascending flag per sort key).
sorted_df_by_multi_cols = df.sort_values(
    by=["region", "family_members"],
    ascending=[True, False],
)
sorted_df_by_multi_cols.head(n=10)

Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop
13,13,East North Central,Illinois,6752.0,3891.0,12723071
35,35,East North Central,Ohio,6929.0,3320.0,11676341
22,22,East North Central,Michigan,5209.0,3142.0,9984072
49,49,East North Central,Wisconsin,2740.0,2167.0,5807406
14,14,East North Central,Indiana,3776.0,1482.0,6695497
42,42,East South Central,Tennessee,6139.0,1744.0,6771631
17,17,East South Central,Kentucky,2735.0,953.0,4461153
0,0,East South Central,Alabama,2570.0,864.0,4887681
24,24,East South Central,Mississippi,1024.0,328.0,2981020
32,32,Mid-Atlantic,New York,39827.0,52070.0,19530351


In [47]:
""" Select a column
- Feature engineering: selecting features
- Every column is a feature in ML
- Every row is an example / sample
"""

# Indexing the frame with a single label yields that one column
# (printed below with its name and dtype).
individuals = df["individuals"]
print(individuals)

0       2570.0
1       1434.0
2       7259.0
3       2280.0
4     109008.0
5       7607.0
6       2280.0
7        708.0
8       3770.0
9      21443.0
10      6943.0
11      4131.0
12      1297.0
13      6752.0
14      3776.0
15      1711.0
16      1443.0
17      2735.0
18      2540.0
19      1450.0
20      4914.0
21      6811.0
22      5209.0
23      3993.0
24      1024.0
25      3776.0
26       983.0
27      1745.0
28      7058.0
29       835.0
30      6048.0
31      1949.0
32     39827.0
33      6451.0
34       467.0
35      6929.0
36      2823.0
37     11139.0
38      8163.0
39       747.0
40      3082.0
41       836.0
42      6139.0
43     19199.0
44      1904.0
45       780.0
46      3928.0
47     16424.0
48      1021.0
49      2740.0
50       434.0
Name: individuals, dtype: float64


In [50]:
# Indexing with a list of labels returns a frame holding just those columns,
# in the order listed.
new_df = df[["region", "family_members"]]
new_df.head(n=5)

Unnamed: 0,region,family_members
0,East South Central,864.0
1,Pacific,582.0
2,Mountain,2606.0
3,West South Central,432.0
4,Pacific,20964.0


In [51]:
# NOTE(review): this cell repeats the previous cell verbatim (same selection, same output); it looks redundant.
new_df = df[ ['region', 'family_members'] ]
new_df.head()

Unnamed: 0,region,family_members
0,East South Central,864.0
1,Pacific,582.0
2,Mountain,2606.0
3,West South Central,432.0
4,Pacific,20964.0


In [55]:
# Boolean-mask row selection: keep only the rows whose region equals "Mountain".
df_mountain = df.loc[df["region"] == "Mountain"]
df_mountain.head(n=5)

Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop
2,2,Mountain,Arizona,7259.0,2606.0,7158024
5,5,Mountain,Colorado,7607.0,3250.0,5691287
12,12,Mountain,Idaho,1297.0,715.0,1750536
26,26,Mountain,Montana,983.0,422.0,1060665
28,28,Mountain,Nevada,7058.0,486.0,3027341


In [56]:
# Summary statistics for the numeric columns of the homelessness data.
# The recorded output also summarizes "Unnamed: 0", which just mirrors the row index (0..50).
df.describe()

Unnamed: 0.1,Unnamed: 0,individuals,family_members,state_pop
count,51.0,51.0,51.0,51.0
mean,25.0,7225.784314,3504.882353,6405637.0
std,14.866069,15991.025083,7805.411811,7327258.0
min,0.0,434.0,75.0,577601.0
25%,12.5,1446.5,592.0,1777414.0
50%,25.0,3082.0,1482.0,4461153.0
75%,37.5,6781.5,3196.0,7340946.0
max,50.0,109008.0,52070.0,39461590.0


In [57]:
# Keep the rows with more than 10000 individuals (strict comparison).
df_gt_10k = df.loc[df["individuals"] > 10000]
df_gt_10k.head(n=5)

Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop
4,4,Pacific,California,109008.0,20964.0,39461588
9,9,South Atlantic,Florida,21443.0,9587.0,21244317
32,32,Mid-Atlantic,New York,39827.0,52070.0,19530351
37,37,Pacific,Oregon,11139.0,3337.0,4181886
43,43,West South Central,Texas,19199.0,6111.0,28628666


In [58]:
# Build one boolean mask per condition, then combine them with & (element-wise AND)
# so only rows satisfying both conditions are kept.
# The threshold is 10000, hence "lt_10k" in the mask name.
is_family_lt_10k_mask = df['family_members'] < 10000
is_region_pacific_mask = df['region'] == 'Pacific'

new_df = df[ is_family_lt_10k_mask & is_region_pacific_mask ]
new_df.head()

Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop
1,1,Pacific,Alaska,1434.0,582.0,735139
11,11,Pacific,Hawaii,4131.0,2399.0,1420593
37,37,Pacific,Oregon,11139.0,3337.0,4181886
47,47,Pacific,Washington,16424.0,5880.0,7523869


In [59]:
""" Creating new columns
- Example columns: H, W, is_Healthy
- BMI can be calculated from H, W
- You can add a new feature / column
"""

# Element-wise sum of two existing columns, stored under a new label;
# assigning to a new key adds the column to df itself.
# NOTE(review): if state_pop already counts the homeless individuals, this sum
# double-counts them — confirm the intended definition of total_pop.
df["total_pop"] = df["individuals"] + df["state_pop"]

In [60]:
# Display the leading rows (n=5 is head()'s default) to confirm the total_pop column was added.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop,total_pop
0,0,East South Central,Alabama,2570.0,864.0,4887681,4890251.0
1,1,Pacific,Alaska,1434.0,582.0,735139,736573.0
2,2,Mountain,Arizona,7259.0,2606.0,7158024,7165283.0
3,3,West South Central,Arkansas,2280.0,432.0,3009733,3012013.0
4,4,Pacific,California,109008.0,20964.0,39461588,39570596.0


In [64]:
""" Query """

# Row filter written as an expression string instead of boolean masks:
# Pacific-region rows with at least 10000 family members.
new_df = df.query("region == 'Pacific' and family_members >= 10000")

new_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop,total_pop
4,4,Pacific,California,109008.0,20964.0,39461588,39570596.0
