# Pandas - DataFrame and Series

Pandas is a powerful data manipulation library in Python, widely used for data analysis and data cleaning. It provides two primary data structures: Series and DataFrame. A Series is a one-dimensional array-like object, while a DataFrame is a two-dimensional, size-mutable, and potentially heterogeneous tabular data structure with labeled axes (rows and columns).

In [1]:
# Series
# A pandas Series is a one-dimensional, array-like object that can hold any data type,
# much like a single column of a table.

import pandas as pd

data = [1, 2, 3, 4, 5]
series = pd.Series(data)

# Show the values alongside their default integer index, then the object's type.
print("Series \n", series)
print(type(series))

Series 
 0    1
1    2
2    3
3    4
4    5
dtype: int64
<class 'pandas.core.series.Series'>


In [9]:
# Create a Series from a dictionary: the keys become the index labels and the
# values become the data.
# The variable is deliberately not named `dict`: doing so would shadow the built-in
# `dict` type for every later cell in the same kernel session.
data_dict = {'a': 1, 'b': 2, 'c': 3}
series_dict = pd.Series(data_dict)
print(series_dict)

a    1
b    2
c    3
dtype: int64


In [10]:
# Build a Series with explicit index labels instead of the default integer index.
data = [10, 20, 30]
index = ['a', 'b', 'c']
pd.Series(data=data, index=index)

a    10
b    20
c    30
dtype: int64

In [11]:
# DataFrame
# Create a DataFrame from a dictionary of lists: each key becomes a column name and
# each list supplies that column's values.

data = {
    'Name': ['Krish', 'John', 'Jack'],
    'Age': [25, 30, 40],
    'City': ['Bangalore', 'New York', 'Florida'],
}
df = pd.DataFrame(data=data)

# Print the table, then confirm the object's type.
print(df)
print(type(df))

    Name  Age       City
0  Krish   25  Bangalore
1   John   30   New York
2   Jack   40    Florida
<class 'pandas.core.frame.DataFrame'>


In [26]:
# Create a DataFrame from a list of dictionaries: each dictionary is one row and its
# keys become the column names.
data = [
    {'Name': 'Krish', 'Age': 35, 'City': 'Bangalore'},
    {'Name': 'Peter', 'Age': 30, 'City': 'New York'},
    {'Name': 'Jack', 'Age': 45, 'City': 'Las Vegas'},
    {'Name': 'John', 'Age': 39, 'City': 'Mumbai'},
]

df1 = pd.DataFrame(data=data)

# Print the table, then confirm the object's type.
print(df1)
print(type(df1))

    Name  Age       City
0  Krish   35  Bangalore
1  Peter   30   New York
2   Jack   45  Las Vegas
3   John   39     Mumbai
<class 'pandas.core.frame.DataFrame'>


In [22]:
# Load the sales data from a CSV file (the path is relative to the notebook's working
# directory) and preview the first five rows.
# Note: this rebinds `df`, replacing the small example frame created earlier.
df = pd.read_csv('sales_data.csv')
df.head()

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
0,10107,30,95.7,2,2871.0,2/24/2003 0:00,Shipped,1,2,2003,...,897 Long Airport Avenue,,NYC,NY,10022.0,USA,,Yu,Kwai,Small
1,10121,34,81.35,5,2765.9,5/7/2003 0:00,Shipped,2,5,2003,...,59 rue de l'Abbaye,,Reims,,51100.0,France,EMEA,Henriot,Paul,Small
2,10134,41,94.74,2,3884.34,7/1/2003 0:00,Shipped,3,7,2003,...,27 rue du Colonel Pierre Avia,,Paris,,75508.0,France,EMEA,Da Cunha,Daniel,Medium
3,10145,45,83.26,6,3746.7,8/25/2003 0:00,Shipped,3,8,2003,...,78934 Hillside Dr.,,Pasadena,CA,90003.0,USA,,Young,Julie,Medium
4,10159,49,100.0,14,5205.27,10/10/2003 0:00,Shipped,4,10,2003,...,7734 Strong St.,,San Francisco,CA,,USA,,Brown,Julie,Medium


In [23]:
# Preview the last five rows of the frame read from sales_data.csv.
df.tail(5)

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
68,10275,22,100.0,4,2904.44,7/23/2004 0:00,Shipped,3,7,2004,...,"67, rue des Cinquante Otages",,Nantes,,44000,France,EMEA,Labrune,Janine,Small
69,10285,47,100.0,9,6484.59,8/27/2004 0:00,Shipped,3,8,2004,...,39323 Spinnaker Dr.,,Cambridge,MA,51247,USA,,Hernandez,Marta,Medium
70,10298,39,96.34,1,3757.26,9/27/2004 0:00,Shipped,3,9,2004,...,"54, rue Royale",,Nantes,,44000,France,EMEA,Schmitt,Carine,Medium
71,10308,34,100.0,2,4043.96,10/15/2004 0:00,Shipped,4,10,2004,...,3758 North Pendale Street,,White Plains,NY,24067,USA,,Frick,Steve,Medium
72,10318,45,100.0,4,5566.5,11/2/2004 0:00,Shipped,4,11,2004,...,7586 Pompton St.,,Allentown,PA,70267,USA,,Yu,Kyung,Medium


In [27]:
# Accessing data from a DataFrame: display df1 first to see what will be indexed.
df1

Unnamed: 0,Name,Age,City
0,Krish,35,Bangalore
1,Peter,30,New York
2,Jack,45,Las Vegas
3,John,39,Mumbai


In [28]:
# Select a single column by its name.
df1['Name']

0    Krish
1    Peter
2     Jack
3     John
Name: Name, dtype: object

In [30]:
# Check the type of a single-column selection (a Series, not a DataFrame).
type(df1['Name'])

pandas.core.series.Series

In [None]:
# .loc is label-based: select the row whose index label is 0.
df1.loc[0]

Name        Krish
Age            35
City    Bangalore
Name: 0, dtype: object

In [None]:
# .iloc is integer-position based: select the row at position 0 (the first row).
df1.iloc[0]

Name        Krish
Age            35
City    Bangalore
Name: 0, dtype: object

In [39]:
# Read one cell by integer position: row 0, column 2.
# A single .iloc[row, col] lookup replaces the chained form df1.iloc[0][2]. The chained
# form indexed the intermediate row Series (whose labels are the column names) with an
# integer, relying on a positional fallback that pandas has deprecated.
df1.iloc[0, 2]

  df1.iloc[0][2]


'Bangalore'

In [43]:
# Display df1 again as a reference for the single-element lookups that follow.
df1

Unnamed: 0,Name,Age,City
0,Krish,35,Bangalore
1,Peter,30,New York
2,Jack,45,Las Vegas
3,John,39,Mumbai


In [42]:
# Access a single element with .at, using the row label (1) and column name ('Age').
df1.at[1,'Age']

np.int64(30)

In [44]:
# Same label-based scalar lookup: row label 2, column 'Name'.
df1.at[2,'Name']

'Jack'

In [46]:
# Access a single element with .iat, using integer positions (row 2, column 2).
df1.iat[2,2]

'Las Vegas'

In [47]:
# Data manipulation with a DataFrame: display df1 as the starting point.
df1

Unnamed: 0,Name,Age,City
0,Krish,35,Bangalore
1,Peter,30,New York
2,Jack,45,Las Vegas
3,John,39,Mumbai


In [49]:
# Add a new column by assigning a list with one value per existing row.
salaries = [50000, 60000, 70000, 80000]
df1['Salary'] = salaries
df1

Unnamed: 0,Name,Age,City,Salary
0,Krish,35,Bangalore,50000
1,Peter,30,New York,60000
2,Jack,45,Las Vegas,70000
3,John,39,Mumbai,80000


In [None]:
# Remove a column. Without inplace=True, drop() returns a new DataFrame and leaves
# df1 itself unchanged.
df1.drop(columns='Salary')

Unnamed: 0,Name,Age,City
0,Krish,35,Bangalore
1,Peter,30,New York
2,Jack,45,Las Vegas
3,John,39,Mumbai


In [53]:
# df1 still contains the Salary column: the previous drop() did not modify it.
df1

Unnamed: 0,Name,Age,City,Salary
0,Krish,35,Bangalore,50000
1,Peter,30,New York,60000
2,Jack,45,Las Vegas,70000
3,John,39,Mumbai,80000


In [54]:
# inplace=True removes the column from df1 itself instead of returning a new frame.
df1.drop(columns='Salary', inplace=True)

In [55]:
# Display df1 to confirm that the Salary column is gone.
df1

Unnamed: 0,Name,Age,City
0,Krish,35,Bangalore
1,Peter,30,New York
2,Jack,45,Las Vegas
3,John,39,Mumbai


In [56]:
# Increase every value in the Age column by 1 (a vectorized, column-wide operation).
# Each run of this cell adds another year to the stored ages.
df1['Age'] = df1['Age'].add(1)
df1

Unnamed: 0,Name,Age,City
0,Krish,36,Bangalore
1,Peter,31,New York
2,Jack,46,Las Vegas
3,John,40,Mumbai


In [57]:
# Remove the row whose index label is 0 from df1 itself, then display the result.
df1.drop(index=0, inplace=True)
df1

Unnamed: 0,Name,Age,City
1,Peter,31,New York
2,Jack,46,Las Vegas
3,John,40,Mumbai


In [60]:
# Data type of each column in the sales DataFrame.
column_types = df.dtypes
print("Data types:\n", column_types)

# Summary statistics (count, mean, std, min, quartiles, max) from describe().
summary_stats = df.describe()
print("Statistical summary:\n", summary_stats)

# Example left disabled: group by a column and aggregate another one.
# 'Category' and 'Value' are placeholder names that do not appear among this frame's
# columns; substitute real column names before enabling it.
# grouped=df.groupby('Category')['Value'].mean()
# print("Mean value by category:\n",grouped)

Data types:
 ORDERNUMBER           int64
QUANTITYORDERED       int64
PRICEEACH           float64
ORDERLINENUMBER       int64
SALES               float64
ORDERDATE            object
STATUS               object
QTR_ID                int64
MONTH_ID              int64
YEAR_ID               int64
PRODUCTLINE          object
MSRP                  int64
PRODUCTCODE          object
CUSTOMERNAME         object
PHONE                object
ADDRESSLINE1         object
ADDRESSLINE2         object
CITY                 object
STATE                object
POSTALCODE           object
COUNTRY              object
TERRITORY            object
CONTACTLASTNAME      object
CONTACTFIRSTNAME     object
DEALSIZE             object
dtype: object
Statistical summary:
         ORDERNUMBER  QUANTITYORDERED   PRICEEACH  ORDERLINENUMBER  \
count     73.000000        73.000000   73.000000        73.000000   
mean   10247.205479        35.178082   97.215479         5.493151   
std       89.024647         9.394653    9.11

In [61]:
# Same summary as a bare last expression, so the notebook renders it as a table
# rather than printed text.
df.describe()

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,QTR_ID,MONTH_ID,YEAR_ID,MSRP
count,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0
mean,10247.205479,35.178082,97.215479,5.493151,5036.988082,2.767123,7.191781,2003.739726,146.630137
std,89.024647,9.394653,9.115088,3.969058,2338.431073,1.172847,3.561591,0.667237,54.258258
min,10103.0,20.0,34.91,1.0,733.11,1.0,1.0,2003.0,95.0
25%,10174.0,27.0,100.0,2.0,3307.77,2.0,4.0,2003.0,95.0
50%,10245.0,36.0,100.0,5.0,4472.0,3.0,8.0,2004.0,118.0
75%,10312.0,42.0,100.0,9.0,6120.34,4.0,11.0,2004.0,214.0
max,10424.0,66.0,100.0,17.0,12001.0,4.0,12.0,2005.0,214.0
