# Pandas Tutorial

Following [[Pandas Tutorial 2021]] by [[Derek Banas]] on [[Derek Banas (YT channel)]]

(The code is written in Hy (hy-lang) rather than Python.)

In [75]:
; Imports
(import numpy :as np)
(import pandas :as pd)

## Series

In [76]:
; Build a Series from two parallel lists: `list1` supplies the data and
; `labels` supplies the index
(setv list1 ["a" "b" "c" "d"]
      labels [1 2 3 4])

(setv ser_1 (pd.Series list1 :index labels))

ser_1

1    a
2    b
3    c
4    d
dtype: object

In [77]:
; A Series built straight from a NumPy array gets the default 0-based index
(setv arr_1 (np.array [1 2 3 4])
      ser_2 (pd.Series :data arr_1))

ser_2

0    1
1    2
2    3
3    4
dtype: int64

In [78]:
; Arithmetic between Series is element-wise, as with arrays
(setv ser_3 (pd.Series (np.array [5 6 7 8])))

(.add ser_2 ser_3)

0     6
1     8
2    10
3    12
dtype: int64

In [79]:
; Series arithmetic aligns on index labels, not on position
; - Only label 3 appears in both Series, so it is the only one that gets a sum;
;   every other label has nothing to match up with and comes out as NaN
(setv ser_4 (pd.Series [5 6 7 8] :index [3 4 5 6])
      ser_5 (pd.Series [1 2 3 4] :index [0 1 2 3]))

(print ser_4)
(print ser_5)

(.add ser_4 ser_5)

3    5
4    6
5    7
6    8
dtype: int64
0    1
1    2
2    3
3    4
dtype: int64


0    NaN
1    NaN
2    NaN
3    9.0
4    NaN
5    NaN
6    NaN
dtype: float64

In [80]:
; A dict can seed a Series directly: keys become the index, values the data
(setv dict_1 {:f_name "Caleb"
              :l_name "Figgers"
              :age 31})

(setv ser_6 (pd.Series :data dict_1))
ser_6

:f_name      Caleb
:l_name    Figgers
:age            31
dtype: object

In [81]:
; Accessing data from a Series
; - ser_6's index holds labels rather than integers, so positional access goes
;   through .iloc; a bare integer key on such a Series is only a deprecated
;   positional fallback in pandas 2.x
(get ser_6.iloc 0)

; (get ser_3 :l_name)
; => Doesn't work.
; NOTE(review): this probably meant ser_6 -- ser_3 has the default integer
; index, so a keyword label can never match it. Confirm the intended Series.

'Caleb'

In [82]:
; A Series can carry a name; elements are looked up by their index label
(setv dict2 {"a" 1
             "b" 2}
      ser_7 (pd.Series dict2 :name "a_named_series"))

(print ser_7)
(. ser_7 ["a"])

a    1
b    2
Name: a_named_series, dtype: int64


1

## DataFrames

### Creating DataFrames

In [83]:
; Start from a random 2x3 NumPy array of integers drawn from [10, 50)
(setv arr_2 (np.random.randint :low 10 :high 50 :size #(2 3)))

; Wrap it in a DataFrame, labelling the rows (index) and the columns
(setv df_1 (pd.DataFrame arr_2
                         :index ["A" "B"]
                         :columns ["C" "D" "E"]))

df_1

Unnamed: 0,C,D,E
A,26,41,19
B,11,10,28


In [84]:
; A dict of Series becomes a DataFrame: one column per key, with rows aligned
; on the union of the Series indexes (cells with no value are NaN)
(setv dict_3
      {"one" (pd.Series [1.0 2.0 3.0] :index ["a" "b" "c"])
       "two" (pd.Series [1.0 2.0 3.0 4.0] :index ["a" "b" "c" "d"])})

(setv df_2 (pd.DataFrame :data dict_3))
df_2

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [85]:
; Build a DataFrame from a plain dict with .from_dict: each key becomes a column
(setv df_3 (pd.DataFrame.from_dict {"A" [1 2 3]
                                    "B" [4 5 6]}))
df_3

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [86]:
; With :orient "index" the dict keys become row labels instead of columns;
; :columns then supplies the names of the resulting columns
(setv df_4 (pd.DataFrame.from_dict {"A" [1 2 3]
                                    "B" [4 5 6]}
                                   :columns ["one" "two" "three"]
                                   :orient "index"))
df_4

Unnamed: 0,one,two,three
A,1,2,3
B,4,5,6


In [87]:
; [DataFrame].shape is a (rows, columns) tuple
(. df_1 shape)

(2, 3)

### Retrieving Data

In [88]:
; Display df_1 again as a reference point for the retrieval examples
df_1


Unnamed: 0,C,D,E
A,26,41,19
B,11,10,28


In [89]:
; Subscripting a DataFrame with one column key returns that column as a Series
(. df_1 ["C"])


A    26
B    11
Name: C, dtype: int64

In [90]:
; Subscripting with a list of column keys returns a DataFrame of those columns
(. df_1 [["C" "E"]])

Unnamed: 0,C,E
A,26,19
B,11,28


In [91]:
; [DataFrame].loc looks a row up by its label
(. df_1 loc ["A"])


C    26
D    41
E    19
Name: A, dtype: int64

In [92]:
; [DataFrame].iloc looks a row up by its integer position
(. df_1 iloc [1])


C    11
D    10
E    28
Name: B, dtype: int64

In [93]:
; Get a specific cell by passing a (row, column) pair to [DataFrame].loc
; - Hy's `get` with several keys chains the lookups (df_1.loc["A"]["C"]);
;   a tuple key makes it a single .loc[row, column] lookup instead
(get df_1.loc #("A" "C"))

26

In [94]:
; Get multiple cells by passing a (row labels, column labels) pair of lists
; - The result is the cross product: every listed row at every listed column
; - The tuple key makes this one .loc[rows, columns] lookup; passing the two
;   lists as separate `get` keys would chain two lookups instead
(get df_1.loc #(["A" "B"] ["C" "D"]))

Unnamed: 0,C,D
A,26,41
B,11,10


### Editing Data

In [95]:
; Assigning to a column key that does not exist yet creates the column.
; Here "Total" is the element-wise sum of columns C, D and E.
(setv (get df_1 "Total")
      (.add (.add (get df_1 "C") (get df_1 "D"))
            (get df_1 "E")))

df_1

Unnamed: 0,C,D,E,Total
A,26,41,19,86
B,11,10,28,49


In [96]:
; Add a row by concatenating a one-row frame built from a named Series
; - DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
; - The Series name ("F") becomes the new row's label
; - Columns missing from the new row (here "Total") are filled with NaN
(setv dict_2 {"C" 55 "D" 78 "E" 69})
(setv new_row (pd.Series dict_2 :name "F"))

; to_frame gives a single column named "F"; transposing turns it into a row
(setv df_1 (pd.concat [df_1 (. (new_row.to_frame) T)]))
df_1


Unnamed: 0,C,D,E,Total
A,26,41,19,86.0
B,11,10,28,49.0
F,55,78,69,


In [97]:
; Remove a column with [DataFrame].drop; :columns is the keyword form of
; passing the label together with :axis 1
(setv df_1 (df_1.drop :columns "Total"))
df_1

Unnamed: 0,C,D,E
A,26,41,19
B,11,10,28
F,55,78,69


In [98]:
; Remove a row with [DataFrame].drop; :index is the keyword form of passing
; the label together with :axis 0
(setv df_1 (df_1.drop :index "B"))
df_1

Unnamed: 0,C,D,E
A,26,41,19
F,55,78,69


In [99]:
; First step towards a new index: add it as an ordinary column
; (a later cell marks this column as the index)
(setv (get df_1 "Sex")
      ["Men" "Women"])
df_1

Unnamed: 0,C,D,E,Sex
A,26,41,19,Men
F,55,78,69,Women


In [100]:

; [DataFrame].assign is another way to create a column; it returns a new frame
; instead of modifying df_1
(df_1.assign #** {"Sex" ["Men" "Women"]})


Unnamed: 0,C,D,E,Sex
A,26,41,19,Men
F,55,78,69,Women


In [101]:
; [DataFrame].assign also accepts a callable as the column value
; - The callable receives the whole DataFrame (not a single row) and returns
;   the values for the new column
; - The parameter is named `frame` so it does not shadow Hy's `in` operator
(df_1.assign :example (fn [frame] (+ (get frame "C")
                                     (get frame "D"))))

Unnamed: 0,C,D,E,Sex,example
A,26,41,19,Men,67
F,55,78,69,Women,133


In [102]:

; Promote the "Sex" column to be the row index
(setv df_1 (df_1.set_index :keys "Sex"))
df_1

Unnamed: 0_level_0,C,D,E
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Men,26,41,19
Women,55,78,69


In [103]:
; [DataFrame].reset_index turns the index back into an ordinary column and
; restores the default integer index; it returns a new frame
(.reset_index df_1)

Unnamed: 0,Sex,C,D,E
0,Men,26,41,19
1,Women,55,78,69


In [104]:
; [DataFrame].combine_first keeps the caller's values and fills its NaN or
; missing cells from the other frame
(setv df_5 (pd.DataFrame {"A" [np.nan 3.0 np.nan]})
      df_6 (pd.DataFrame {"A" [8.0 9.0 2.0 4.0]}))

(setv df_7 (.combine_first df_5 df_6))
df_7

Unnamed: 0,A
0,8.0
1,3.0
2,2.0
3,4.0


### Conditional Selection

In [105]:
; Example
; Initialize a random 4x3 NumPy array of integers drawn from [10, 50)
(setv arr_2 (np.random.randint 10 50 :size #(4 3)))

; Create a DataFrame out of the np.array, labelling rows and columns
(setv df_8 (pd.DataFrame arr_2
                         :index ["A" "B" "F" "G"]
                         :columns ["C" "D" "E"]))
(print df_8)
(print "\n")

; Which values are greater than 30?
; - Comparing a DataFrame with a scalar gives a same-shaped frame of booleans
(> df_8 30)

    C   D   E
A  44  41  38
B  19  21  17
F  32  33  48
G  45  15  28




Unnamed: 0,C,D,E
A,True,True,True
B,False,False,False
F,True,True,True
G,True,False,False


In [106]:
; Subscripting with a boolean frame keeps the cells where it is True
; - Cells where it is False are replaced with NaN

(. df_8 [(df_8.gt 30)])

Unnamed: 0,C,D,E
A,44.0,41.0,38.0
B,,,
F,32.0,33.0,48.0
G,45.0,,


In [110]:
; Return the rows whose cell in one column passes a predicate
;
; Equivalent to:
; SELECT * FROM df_8 WHERE E > 30;
;
; Intuition:
; - `>` applied to a column returns a boolean Series, one entry per row
; - That boolean Series tells df_8 which rows to return

(print df_8 "\n")
(print (> (get df_8 "E") 30) "\n")

(get df_8 (> (get df_8 "E") 30))

    C   D   E
A  44  41  38
B  19  21  17
F  32  33  48
G  45  15  28 

A     True
B    False
F     True
G    False
Name: E, dtype: bool 



Unnamed: 0,C,D,E
A,44,41,38
F,32,33,48


In [123]:
; Filtering on several conditions at once
; - Each comparison gives a boolean Series; `&` combines them element-wise
(setv arr_3 (np.array [[1 2 3]
                       [4 5 6]
                       [7 8 9]]))
(setv df_9 (pd.DataFrame arr_3
                         :index ["A" "B" "C"]
                         :columns ["X" "Y" "Z"]))
(print df_9 "\n")

(print (& (.gt (get df_9 "X") 3)
          (.lt (get df_9 "X") 7))
       "\n")

(get df_9 (& (.gt (get df_9 "X") 3)
             (.lt (get df_9 "X") 7)))

   X  Y  Z
A  1  2  3
B  4  5  6
C  7  8  9 

A    False
B     True
C    False
Name: X, dtype: bool 



Unnamed: 0,X,Y,Z
B,4,5,6


### File Input/Output

In [130]:
(import pymysql)

In [131]:
; Load a .csv file into a DataFrame and preview its first rows
(setv cs_df (pd.read_csv :filepath_or_buffer "ComputerSales.csv"))
(.head cs_df)

Unnamed: 0,Sale ID,Contact,Sex,Age,State,Product ID,Product Type,Sale Price,Profit,Lead,Month,Year
0,1,Paul Thomas,M,43,OH,M01-F0024,Desktop,479.99,143.39,Website,January,2018
1,2,Margo Simms,F,37,WV,GT13-0024,Desktop,1249.99,230.89,Flyer 4,January,2018
2,3,Sam Stine,M,26,PA,I3670,Desktop,649.99,118.64,Website,February,2018
3,4,Moe Eggert,M,35,PA,I3593,Laptop,399.99,72.09,Website,March,2018
4,5,Jessica Elk,F,55,PA,15M-ED,Laptop,699.99,98.09,Flyer 4,March,2018


In [154]:
; Read only a few columns from a .csv with :usecols
; - To turn a single-column result into a Series, call
;   (.squeeze df "columns") on it; read_csv's own `squeeze` keyword was
;   removed in pandas 2.0
(setv cs_df_st_age (pd.read_csv "ComputerSales.csv"
                                :usecols ["State" "Age"]))
(cs_df_st_age.head)

Unnamed: 0,Age,State
0,43,OH
1,37,WV
2,26,PA
3,35,PA
4,55,PA


In [132]:
; Export a .csv
; - :index False leaves out the row index, so the backup has the same columns
;   as the file that was read (otherwise the index is written as an extra
;   unnamed first column)
(cs_df.to_csv "ComputerSales_bak.csv" :index False)

In [129]:
; Load the first sheet of a .xlsx workbook and preview its first rows
; - Depends on openpyxl
(setv xlsx_df (pd.read_excel "Financial Sample.xlsx" :sheet_name 0))
(.head xlsx_df)

Unnamed: 0,Segment,Country,Product,Discount Band,Units Sold,Manufacturing Price,Sale Price,Gross Sales,Discounts,Sales,COGS,Profit,Date,Month Number,Month Name,Year
0,Government,Canada,Carretera,,1618.5,3,20,32370.0,0.0,32370.0,16185.0,16185.0,2014-01-01,1,January,2014
1,Government,Germany,Carretera,,1321.0,3,20,26420.0,0.0,26420.0,13210.0,13210.0,2014-01-01,1,January,2014
2,Midmarket,France,Carretera,,2178.0,3,15,32670.0,0.0,32670.0,21780.0,10890.0,2014-06-01,6,June,2014
3,Midmarket,Germany,Carretera,,888.0,3,15,13320.0,0.0,13320.0,8880.0,4440.0,2014-06-01,6,June,2014
4,Midmarket,Mexico,Carretera,,2470.0,3,15,37050.0,0.0,37050.0,24700.0,12350.0,2014-06-01,6,June,2014


In [133]:
; Save to a .xlsx
; - :index False leaves out the row index, so the backup has the same columns
;   as the sheet that was read
(xlsx_df.to_excel "Financial Sample_bak.xlsx" :index False)

In [139]:
; Connect to a MySQL database and load a table into a DataFrame
; - Any failure (connection or query) is reported, not raised
; - The connection is closed in `finally`, so it is released even when the
;   query fails after a successful connect
; NOTE(review): credentials are hard-coded; consider reading them from the
; environment before sharing this notebook.
(setv db_connection None)
(try
  (setv db_connection (pymysql.connect :db "students"
                                       :user "studentadmin"
                                       :passwd "TurtleDove"
                                       :host "localhost"
                                       :port 3306))
  (setv student_df (pd.read_sql "SELECT * FROM students"
                                :con db_connection))
  (except [e Exception]
    (print (.format "Exception : {}" e)))
  (finally
    ; db_connection is still None when connect itself failed
    (when (is-not db_connection None)
      (db_connection.close))))

Exception : (2003, "Can't connect to MySQL server on 'localhost' ([Errno 111] Connection refused)")


In [138]:
; Insert into a MySQL database, then reload the table into a DataFrame
; - The statement is bound to `insert_stmt`, the same name that is executed
;   (it was previously bound to a misspelled name, which raised a NameError)
; - The connection is closed in `finally`, so it is released even when a later
;   step fails after a successful connect
; NOTE(review): "INSERT INTO students VALUES()" looks like a placeholder and
; the credentials are hard-coded -- confirm both before real use.
(setv db_connection None)
(try
  (setv db_connection (pymysql.connect :db "students"
                                       :user "studentadmin"
                                       :passwd "TurtleDove"
                                       :host "localhost"
                                       :port 3306))
  (setv cursor (db_connection.cursor))
  (setv insert_stmt "INSERT INTO students VALUES()")
  (cursor.execute insert_stmt)
  (db_connection.commit)
  (setv student_df (pd.read_sql "SELECT * FROM students"
                                :con db_connection))
  (except [e Exception]
    (print (.format "Exception : {}" e)))
  (finally
    ; db_connection is still None when connect itself failed
    (when (is-not db_connection None)
      (db_connection.close))))

Exception : (2003, "Can't connect to MySQL server on 'localhost' ([Errno 111] Connection refused)")
