# Pandas 
## Pandas is an open-source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.


## Agenda
### - What are DataFrames?
### - What is a Series?
### - Different operations in Pandas

In [5]:
## first step is to import pandas
import pandas as pd 
import numpy as np

In [7]:
## Playing with DataFrames -- arrange the data in labelled rows and columns

df = pd.DataFrame(
    np.arange(20).reshape(5, 4),                  # the values 0..19 laid out as 5 rows x 4 columns
    index=[f'Row{n}' for n in range(1, 6)],       # Row1 .. Row5
    columns=[f'column{n}' for n in range(1, 5)],  # column1 .. column4
)

In [5]:
# Preview the first rows of the frame (all five rows fit in this preview).
df.head()

Unnamed: 0,column1,column2,column3,column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [42]:
# Select several columns at once by passing a list of column labels.
df[['column1','column2']]

Unnamed: 0,column1,column2
Row1,0,1
Row2,4,5
Row3,8,9
Row4,12,13
Row5,16,17


In [7]:
df.to_csv('Test1.csv')  # save the frame as a CSV (comma-separated values) file; the row labels are written as the first column

In [9]:
## Accessing the elements
## 1. .loc - select by label   2. .iloc - select by integer position ("index location")

# Look up one row by its label; the values come back keyed by column name.
df.loc['Row1']

column1    0
column2    1
column3    2
column4    3
Name: Row1, dtype: int32

In [10]:
# A single row selected with .loc is a Series, not a DataFrame.
type(df.loc['Row1'])

pandas.core.series.Series

In [13]:
df.iloc[3:,2:]  # rows from position 3 onward, columns from position 2 onward (positions start at 0)

Unnamed: 0,column3,column4
Row4,14,15
Row5,18,19


In [14]:
# Scalar access: the value at row position 0, column position 0.
df.iloc[0,0]

0

In [18]:
df.iloc[0:2,0:2]  # first two rows and first two columns (the stop position 2 is excluded)

Unnamed: 0,column1,column2
Row1,0,1
Row2,4,5


In [20]:
# A two-dimensional slice taken with .iloc is still a DataFrame.
type(df.iloc[0:2,0:2])

pandas.core.frame.DataFrame

In [25]:
# Convert part of the DataFrame (every row, all columns after the first) into a NumPy array.
# .to_numpy() is the accessor the pandas documentation recommends in place of .values.
df.iloc[:, 1:].to_numpy()

array([[ 1,  2,  3],
       [ 5,  6,  7],
       [ 9, 10, 11],
       [13, 14, 15],
       [17, 18, 19]])

In [28]:
# Important: count the missing (null) values in each column.
df.isnull().sum()

column1    0
column2    0
column3    0
column4    0
dtype: int64

In [38]:
# Count how many times each distinct value occurs in column1.
df['column1'].value_counts()

0     1
4     1
8     1
12    1
16    1
Name: column1, dtype: int64

In [10]:
# df['column1'].unique()

In [8]:
# Print a summary of the frame: index range, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, Row1 to Row5
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   column1  5 non-null      int32
 1   column2  5 non-null      int32
 2   column3  5 non-null      int32
 3   column4  5 non-null      int32
dtypes: int32(4)
memory usage: 120.0+ bytes


In [None]:
# CSV: comma-separated values file

In [11]:
# Load the CSV saved earlier in this notebook. The name has to match 'Test1.csv'
# exactly, including letter case, or the read fails on case-sensitive filesystems.
test_df = pd.read_csv('Test1.csv')

In [12]:
# Preview the loaded data; the saved row labels come back as an ordinary column.
test_df.head()

Unnamed: 0.1,Unnamed: 0,column1,column2,column3,column4
0,Row1,0,1,2,3
1,Row2,4,5,6,7
2,Row3,8,9,10,11
3,Row4,12,13,14,15
4,Row5,16,17,18,19


In [17]:
# making another csv file

In [14]:
# Write the frame using ';' as the field separator so the file matches the
# sep=';' that is used when test2.csv is read back.
df.to_csv('test2.csv', sep=';')

In [23]:
test1_df= pd.read_csv('test2.csv',sep=';')  # parse test2.csv using ';' instead of ',' as the field separator

In [24]:
# Preview the first rows of test1_df.
test1_df.head()

Unnamed: 0.1,Unnamed: 0,column1,column2,column3,column4
0,Row1,0,1,2,3
1,Row2,4,5,6,7
2,Row3,8,9,10,11
3,Row4,12,13,14,15
4,Row5,16,17,18,19


In [25]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
test1_df.describe()

Unnamed: 0,column1,column2,column3,column4
count,5.0,5.0,5.0,5.0
mean,8.0,9.0,10.0,11.0
std,6.324555,6.324555,6.324555,6.324555
min,0.0,1.0,2.0,3.0
25%,4.0,5.0,6.0,7.0
50%,8.0,9.0,10.0,11.0
75%,12.0,13.0,14.0,15.0
max,16.0,17.0,18.0,19.0


## CSV

In [26]:
from io import StringIO, BytesIO

In [27]:
# CSV text kept in memory: a header line followed by three records.
data = '\n'.join([
    'col1,col2,col3',
    'x,y,1',
    'a,b,2',
    'c,d,3',
    '',  # empty last item so the text ends with a newline
])

In [28]:
# The CSV text is an ordinary Python string.
type(data)

str

In [32]:
# Called with no argument, StringIO() creates an in-memory text buffer object with no initial content.
StringIO()

<_io.StringIO at 0x15d80034ca0>

In [35]:
# Parse the CSV text by handing read_csv a file-like StringIO wrapper around it.
df = pd.read_csv(filepath_or_buffer=StringIO(data))

In [36]:
# Display the parsed frame.
df

Unnamed: 0,col1,col2,col3
0,x,y,1
1,a,b,2
2,c,d,3


In [37]:
#read from specific columns

In [39]:
# Load only the listed columns from the CSV text.
df = pd.read_csv(
    StringIO(data),
    usecols=['col1', 'col2'],
)

In [40]:
# Display the frame; only col1 and col2 were loaded.
df

Unnamed: 0,col1,col2
0,x,y
1,a,b
2,c,d


In [41]:
# Numeric CSV text: header a,b,c,d and three rows (no newline after the last row).
data = '\n'.join([
    'a,b,c,d',
    '1,2,3,4',
    '5,6,7,8',
    '9,10,11,12',
])

In [43]:
# Print the raw CSV text to check its layout.
print(data)

a,b,c,d
1,2,3,4
5,6,7,8
9,10,11,12


In [49]:
# Parse the CSV text with every column read as float.
df = pd.read_csv(
    StringIO(data),
    dtype=float,
)

In [50]:
# Display the frame; all values are now floats.
df

Unnamed: 0,a,b,c,d
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0
2,9.0,10.0,11.0,12.0


In [51]:
# Select a single column by its label; the result reports dtype float64.
df['a']

0    1.0
1    5.0
2    9.0
Name: a, dtype: float64

In [58]:
# Request a dtype per column; 'd' is not listed, so its dtype is left for pandas to infer.
df = pd.read_csv(
    StringIO(data),
    dtype={
        'a': 'Int64',
        'b': int,
        'c': float,
    },
)

In [59]:
# Display the frame parsed with per-column dtypes.
df

Unnamed: 0,a,b,c,d
0,1,2,3.0,4
1,5,6,7.0,8
2,9,10,11.0,12


In [61]:
# Type of one element of column 'a' (requested as 'Int64').
type(df['a'][1])

numpy.int64

In [63]:
# Type of one element of column 'c' (requested as float).
type(df['c'][2])

numpy.float64

In [64]:
# Type of one element of column 'b' (requested as plain int).
# NOTE(review): the recorded int32 is presumably platform-specific - confirm on the target system.
type(df['b'][0])

numpy.int32

In [None]:
#check the datatype of all the columns

In [66]:
# List the dtype of every column in the frame.
df.dtypes

a      Int64
b      int32
c    float64
d      int64
dtype: object

In [67]:
# CSV text whose first column is named "index"; two records, no trailing newline.
data = '\n'.join([
    'index,a,b,c',
    '4,apple,bat,3.5',
    '5,banana,cat,3.6',
])

In [68]:
# Print the raw CSV text to check its layout.
print(data)

index,a,b,c
4,apple,bat,3.5
5,banana,cat,3.6


In [75]:
# Use the column at position 0 as the row index instead of keeping it as a data column.
df = pd.read_csv(
    StringIO(data),
    index_col=0,
)

In [76]:
# Display the frame; the "index" column now labels the rows.
df

Unnamed: 0_level_0,a,b,c
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,apple,bat,3.5
5,banana,cat,3.6


In [77]:
# CSV text with header a,b,c and two records; the text ends with a newline.
data = '\n'.join([
    'a,b,c',
    '4,corona,bat',
    '6,milk,cow',
    '',  # empty last item so the text ends with a newline
])

In [78]:
# Parse with the defaults: every column is kept and rows are numbered from 0.
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
0,4,corona,bat
1,6,milk,cow


In [79]:
# Keep only columns 'b' and 'c'; index_col=False asks pandas not to use any column as the index.
pd.read_csv(StringIO(data),usecols=['b','c'],index_col=False)

Unnamed: 0,b,c
0,corona,bat
1,milk,cow


In [80]:
# CSV text whose quoted field contains backslash-escaped double quotes.
# The record is a raw string so each backslash is kept literally.
data = 'a,b\n' + r'"hello, \"Bob\",nice to see you",5'

In [84]:
# escapechar='\\' makes the parser treat a backslash as the escape character, so the
# escaped quotes inside the quoted field are read as part of the value.
pd.read_csv(StringIO(data),escapechar='\\')

Unnamed: 0,a,b
0,"hello, ""Bob"",nice to see you",5


# Read JSON to CSV

In [9]:
# Imported here so that this cell also runs on its own.
from io import StringIO

# A JSON document held in a string: two scalar fields plus a one-element list holding a dict.
data = '{"student_name":"Kavita","email":"chaubey@gmail.com","learning":[{"title1":"HTML","title2":"PYTHON"}]}'

# Wrap the text in StringIO: passing literal JSON text straight to read_json is
# deprecated in recent pandas releases, which expect a path or a file-like object.
df1 = pd.read_json(StringIO(data))

In [10]:
#json to csv == df.to_csv('')
#json to different formats == df.to_json(orient='')   --convert the object to a json string

In [11]:
# Display the frame built from the JSON text.
df1

Unnamed: 0,student_name,email,learning
0,Kavita,chaubey@gmail.com,"{'title1': 'HTML', 'title2': 'PYTHON'}"


In [12]:
# Serialise the frame to a JSON string using the default layout: {column -> {index -> value}}.
df1.to_json()

'{"student_name":{"0":"Kavita"},"email":{"0":"chaubey@gmail.com"},"learning":{"0":{"title1":"HTML","title2":"PYTHON"}}}'

In [16]:
# orient="split": separate "columns", "index" and "data" entries.
df1.to_json(orient="split")

'{"columns":["student_name","email","learning"],"index":[0],"data":[["Kavita","chaubey@gmail.com",{"title1":"HTML","title2":"PYTHON"}]]}'

In [17]:
# orient="records": a list with one {column: value} object per row.
df1.to_json(orient="records")

'[{"student_name":"Kavita","email":"chaubey@gmail.com","learning":{"title1":"HTML","title2":"PYTHON"}}]'

In [20]:
# orient="index": {index -> {column -> value}}.
df1.to_json(orient="index")

'{"0":{"student_name":"Kavita","email":"chaubey@gmail.com","learning":{"title1":"HTML","title2":"PYTHON"}}}'

In [21]:
# orient="table": a "schema" section describing the fields plus a "data" list of rows.
df1.to_json(orient="table")

'{"schema":{"fields":[{"name":"index","type":"integer"},{"name":"student_name","type":"string"},{"name":"email","type":"string"},{"name":"learning","type":"string"}],"primaryKey":["index"],"pandas_version":"1.4.0"},"data":[{"index":0,"student_name":"Kavita","email":"chaubey@gmail.com","learning":{"title1":"HTML","title2":"PYTHON"}}]}'

In [22]:
# orient="columns": {column -> {index -> value}}, the same text the call without orient produced.
df1.to_json(orient="columns")

'{"student_name":{"0":"Kavita"},"email":{"0":"chaubey@gmail.com"},"learning":{"0":{"title1":"HTML","title2":"PYTHON"}}}'

In [23]:
# orient="values": only the nested list of row values, without row or column labels.
df1.to_json(orient="values")

'[["Kavita","chaubey@gmail.com",{"title1":"HTML","title2":"PYTHON"}]]'

# Reading HTML content