### Pandas Tutorial

Pandas is an open-source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.

Agenda

- What is a DataFrame?
- What is a Series?
- Different operations in Pandas

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Demo frame: the integers 0..19 laid out as 5 rows x 4 columns,
# labelled row1..row5 and column1..column4.
df = pd.DataFrame(
    data=np.arange(20).reshape(5, 4),
    index=[f'row{n}' for n in range(1, 6)],
    columns=[f'column{n}' for n in range(1, 5)],
)
df

Unnamed: 0,column1,column2,column3,column4
row1,0,1,2,3
row2,4,5,6,7
row3,8,9,10,11
row4,12,13,14,15
row5,16,17,18,19


In [3]:
# Accessing elements: there are two indexers
#   1. .loc  -> label-based
#   2. .iloc -> position-based
# Picking a single row by its label yields a Series.
type(df.loc['row1', :])

pandas.core.series.Series

In [4]:
# A positional slice of several rows and columns is still a DataFrame.
type(df.iloc[0:3, 0:2])

pandas.core.frame.DataFrame

In [5]:
# Select one column by label; the result is a Series named after the column.
df.loc[:, 'column1']

row1     0
row2     4
row3     8
row4    12
row5    16
Name: column1, dtype: int64

In [6]:
# First three rows and first two columns as a plain NumPy array.
df.iloc[:3, :2].to_numpy()

array([[0, 1],
       [4, 5],
       [8, 9]])

In [7]:
# How often each distinct value occurs in column1.
df.loc[:, 'column1'].value_counts()

column1
0     1
4     1
8     1
12    1
16    1
Name: count, dtype: int64

In [8]:
# Number of missing values per column.
df.isna().sum()

column1    0
column2    0
column3    0
column4    0
dtype: int64

In [9]:
# Replace the demo frame with the dataset read from the CSV file in the
# working directory, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer="mercedesbenz.csv")
df.head(n=5)

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Print a concise summary of the frame: index range, number of columns,
# count of columns per dtype, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
df.describe()

Unnamed: 0,ID,y,X10,X11,X12,X13,X14,X15,X16,X17,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
count,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,...,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0
mean,4205.960798,100.669318,0.013305,0.0,0.075077,0.057971,0.42813,0.000475,0.002613,0.007603,...,0.318841,0.057258,0.314802,0.02067,0.009503,0.008078,0.007603,0.001663,0.000475,0.001426
std,2437.608688,12.679381,0.11459,0.0,0.263547,0.233716,0.494867,0.021796,0.051061,0.086872,...,0.466082,0.232363,0.464492,0.142294,0.097033,0.089524,0.086872,0.040752,0.021796,0.037734
min,0.0,72.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2095.0,90.82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4220.0,99.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6314.0,109.01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8417.0,265.32,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# Summarise only the object-dtype columns: count, number of unique values,
# most frequent value and its frequency.
df.describe(include=['object'])

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
count,4209,4209,4209,4209,4209,4209,4209,4209
unique,47,27,44,7,4,29,12,25
top,z,aa,as,c,d,w,g,j
freq,360,833,1659,1942,4205,231,1042,277


In [13]:
# Top five most frequent X0 categories, as a two-column frame.
(
    df['X0']
    .value_counts()
    .reset_index()
    .head()
)

Unnamed: 0,X0,count
0,z,360
1,ak,349
2,y,324
3,ay,313
4,t,306


#### CSV

In [14]:
from io import StringIO, BytesIO

In [15]:
# In-memory CSV text: a header line followed by three records.
data = ''.join([
    'col1,col2,col3\n',
    'x,y,1\n',
    'a,b,2\n',
    'c,d,3',
])

In [16]:
# Confirm that `data` is a plain Python string, not a file object.
type(data)

str

In [17]:
# Wrap the string in a file-like StringIO buffer so read_csv can parse it.
pd.read_csv(filepath_or_buffer=StringIO(data))

Unnamed: 0,col1,col2,col3
0,x,y,1
1,a,b,2
2,c,d,3


In [18]:
# Parse only the listed columns from the CSV text; col3 is skipped.
df = pd.read_csv(
    StringIO(data),
    usecols=['col1', 'col2'],
)
df

Unnamed: 0,col1,col2
0,x,y
1,a,b
2,c,d


In [1]:
import pandas as pd

# Inline, JSON-shaped sample data: every employee record carries a nested
# "address" object and a "skills" list.
json_data = {
    "employees": [
        {
            "name": "John",
            "age": 30,
            "address": {"city": "New York", "street": "123 Main St"},
            "skills": ["Python", "SQL"],
        },
        {
            "name": "Jane",
            "age": 28,
            "address": {"city": "Los Angeles", "street": "456 Elm St"},
            "skills": ["Java", "C#"],
        },
    ]
}

# Flatten the records stored under "employees" into one row each. Nested
# address fields become dotted column names; the skills lists stay as lists.
df = pd.json_normalize(data=json_data, record_path='employees')

# Show the flattened frame as plain text.
print(df)


   name  age         skills address.city address.street
0  John   30  [Python, SQL]     New York    123 Main St
1  Jane   28     [Java, C#]  Los Angeles     456 Elm St
