# **Pandas**
There are two data structures in pandas

**Series**
1. A Series holds a single column of data
2. Every Series carries an index

**DataFrame**
1. Data is represented in rows and columns
2. Dataframe will also bear index
3. A DataFrame can also have just one column, but a Series can never have more than one column

In [1]:
import numpy as np
import pandas as pd

In [4]:
# Series: a one-dimensional column of values, each labelled by an index entry

s1 = pd.Series(data=[10, 20, 30, 40, 45, 56, 67])
# Show the Series itself, then its Python type, one per line.
print(s1, type(s1), sep="\n")

0    10
1    20
2    30
3    40
4    45
5    56
6    67
dtype: int64
<class 'pandas.core.series.Series'>


In [3]:
# Index labels as a plain Python list.
print(s1.index.tolist())
# Underlying values as a NumPy array; to_numpy() is the accessor the pandas
# docs recommend over the older .values attribute.
print(s1.to_numpy())

[0, 1, 2, 3, 4, 5, 6]
[10 20 30 40 45 56 67]


In [6]:
# Positional access made explicit with .iloc: plain s1[4] is a label lookup
# while s1[2:6] is a positional slice, which only agree here because the
# index is the default 0..n-1 range.
print(s1.iloc[4])  # element at position 4
print(s1.iloc[2:6])  # positions 2, 3, 4 and 5 (the end position is excluded)

45
2    30
3    40
4    45
5    56
dtype: int64


In [13]:
# Aggregates that reduce the whole Series to a single value.
print("Sum : ", s1.sum())
print("Mean : ", s1.mean())
print("Median : ", s1.median())
print("Minimum : ", s1.min())
print("Maximum : ", s1.max())
# Running (cumulative) results: each call returns a Series with the same index as s1.
print("\nCumulative Sum : \n", s1.cumsum())
print("\nCumulative Product : \n", s1.cumprod())

Sum :  268
Mean :  38.285714285714285
Median :  40.0
Minimum :  10
Maximum :  67

Cumulative Sum : 
 0     10
1     30
2     60
3    100
4    145
5    201
6    268
dtype: int64

Cumulative Product : 
 0             10
1            200
2           6000
3         240000
4       10800000
5      604800000
6    40521600000
dtype: int64


In [33]:
# Column-oriented records: each key is a column name and each list holds
# that column's values (one entry per person).
data = dict(
    Name=["Ankit", "Raj", "Kartik", "Rishabh", "Anurag", "Shivam", "Hitesh"],
    Age=[22, 24, 25, 23, 24, 25, 26],
    City=["Delhi", "Noida", "Mumbai", "Delhi", "Noida", "Delhi", "Delhi"],
)

# Show the dict, then its Python type, one per line.
print(data, type(data), sep="\n")

{'Name': ['Ankit', 'Raj', 'Kartik', 'Rishabh', 'Anurag', 'Shivam', 'Hitesh'], 'Age': [22, 24, 25, 23, 24, 25, 26], 'City': ['Delhi', 'Noida', 'Mumbai', 'Delhi', 'Noida', 'Delhi', 'Delhi']}
<class 'dict'>


In [34]:
# Build a DataFrame from the dict: keys become column names, lists become column values.
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City
0,Ankit,22,Delhi
1,Raj,24,Noida
2,Kartik,25,Mumbai
3,Rishabh,23,Delhi
4,Anurag,24,Noida
5,Shivam,25,Delhi
6,Hitesh,26,Delhi


In [35]:
# Shape and data types of the dataframe

print(df.shape)  # (number of rows, number of columns)
print(df.dtypes)  # dtype of each column

(7, 3)
Name    object
Age      int64
City    object
dtype: object


In [36]:
# Row index of the dataframe (a RangeIndex here, since no explicit index was supplied)

df.index

RangeIndex(start=0, stop=7, step=1)

In [37]:
# Converting the row index to a plain Python list

df.index.tolist()

[0, 1, 2, 3, 4, 5, 6]

In [38]:
# Column labels of the dataframe

df.columns

Index(['Name', 'Age', 'City'], dtype='object')

In [39]:
# Cell values as a 2-D NumPy array, one inner row per record; to_numpy() is the
# accessor the pandas docs recommend over the older .values attribute.
print(df.to_numpy())

[['Ankit' 22 'Delhi']
 ['Raj' 24 'Noida']
 ['Kartik' 25 'Mumbai']
 ['Rishabh' 23 'Delhi']
 ['Anurag' 24 'Noida']
 ['Shivam' 25 'Delhi']
 ['Hitesh' 26 'Delhi']]


In [40]:
# info() prints metadata about the dataframe: index range, column names,
# non-null counts, dtypes and memory usage.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    7 non-null      object
 1   Age     7 non-null      int64 
 2   City    7 non-null      object
dtypes: int64(1), object(2)
memory usage: 296.0+ bytes


In [41]:
# describe --> Returns summary statistics (count, mean, std, min, quartiles, max).
# By default only numeric columns are summarised, which is why just Age appears.

df.describe()

Unnamed: 0,Age
count,7.0
mean,24.142857
std,1.345185
min,22.0
25%,23.5
50%,24.0
75%,25.0
max,26.0


In [42]:
# head --> Returns the first n rows (5 when n is omitted)

df.head(4)

Unnamed: 0,Name,Age,City
0,Ankit,22,Delhi
1,Raj,24,Noida
2,Kartik,25,Mumbai
3,Rishabh,23,Delhi


In [43]:
# tail --> Returns the last n rows (5 when n is omitted)

df.tail(4)

Unnamed: 0,Name,Age,City
3,Rishabh,23,Delhi
4,Anurag,24,Noida
5,Shivam,25,Delhi
6,Hitesh,26,Delhi


In [44]:
# sample --> Returns n randomly chosen rows (1 when n is omitted).
# No random_state is passed, so the rows shown change on every run.

df.sample(4)

Unnamed: 0,Name,Age,City
0,Ankit,22,Delhi
4,Anurag,24,Noida
6,Hitesh,26,Delhi
2,Kartik,25,Mumbai


In [45]:
# Fetching a single column with single brackets --> Series

print(type(df["Name"]))
print(df["Name"])

<class 'pandas.core.series.Series'>
0      Ankit
1        Raj
2     Kartik
3    Rishabh
4     Anurag
5     Shivam
6     Hitesh
Name: Name, dtype: object


In [46]:
# Fetching a single column with a list of labels (double brackets) --> DataFrame

print(type(df[["Name"]]))
print(df[["Name"]])

<class 'pandas.core.frame.DataFrame'>
      Name
0    Ankit
1      Raj
2   Kartik
3  Rishabh
4   Anurag
5   Shivam
6   Hitesh


In [48]:
# Fetching multiple columns with a list of labels --> DataFrame

print(type(df[["Name", "Age", "City"]]))
print(df[["Name", "Age", "City"]])

<class 'pandas.core.frame.DataFrame'>
      Name  Age    City
0    Ankit   22   Delhi
1      Raj   24   Noida
2   Kartik   25  Mumbai
3  Rishabh   23   Delhi
4   Anurag   24   Noida
5   Shivam   25   Delhi
6   Hitesh   26   Delhi


In [49]:
# Adding a new column to the dataframe by assignment.
# The list must contain exactly one value per existing row of the dataframe.

df["Department"] = ["IT", "HR", "Admin", "IT", "Sales", "IT", "Sales"]

In [50]:
# Check the shape again: the column count reflects the newly added column.
df.shape

(7, 4)

In [51]:
# Draw 7 random integer scores from [60, 80); randint's upper bound is exclusive.
# numpy is imported as np in the notebook's import cell at the top.
# NOTE: no seed is set, so the scores (and every output derived from them)
# differ from run to run; call np.random.seed(<int>) first for repeatable values.
scores = np.random.randint(60, 80, 7)
print(scores)

[68 74 79 65 74 69 66]


In [54]:
# Insert "scores" as the third column (position 2, zero-based).
# DataFrame.insert mutates df in place and raises ValueError when the column
# already exists, so guard the call to keep this cell safe to re-run.
if "scores" not in df.columns:
    df.insert(2, "scores", scores)
df

Unnamed: 0,Name,Age,scores,City,Department
0,Ankit,22,68,Delhi,IT
1,Raj,24,74,Noida,HR
2,Kartik,25,79,Mumbai,Admin
3,Rishabh,23,65,Delhi,IT
4,Anurag,24,74,Noida,Sales
5,Shivam,25,69,Delhi,IT
6,Hitesh,26,66,Delhi,Sales


In [55]:
# value_counts --> Number of rows holding each distinct value, most frequent first

df["Department"].value_counts()

IT       3
Sales    2
HR       1
Admin    1
Name: Department, dtype: int64

In [56]:
# Distinct values of the column, in order of first appearance, as a NumPy array.
# Left as the cell's last expression so the notebook displays the array's repr.

df["Department"].unique()

array(['IT', 'HR', 'Admin', 'Sales'], dtype=object)

In [57]:
# Prints the number of distinct values in the column

print(df["Department"].nunique())

4


In [58]:
# Sorting the dataframe by a column --> ascending order (the default)

df.sort_values("Name")

Unnamed: 0,Name,Age,scores,City,Department
0,Ankit,22,68,Delhi,IT
4,Anurag,24,74,Noida,Sales
6,Hitesh,26,66,Delhi,Sales
2,Kartik,25,79,Mumbai,Admin
1,Raj,24,74,Noida,HR
3,Rishabh,23,65,Delhi,IT
5,Shivam,25,69,Delhi,IT


In [59]:
# Sorting the dataframe by a column --> descending order

df.sort_values("scores",ascending=False)

Unnamed: 0,Name,Age,scores,City,Department
2,Kartik,25,79,Mumbai,Admin
1,Raj,24,74,Noida,HR
4,Anurag,24,74,Noida,Sales
5,Shivam,25,69,Delhi,IT
0,Ankit,22,68,Delhi,IT
6,Hitesh,26,66,Delhi,Sales
3,Rishabh,23,65,Delhi,IT


## Filtering rows

In [60]:
# Extract all records where Age > 23: the comparison yields a boolean mask,
# and indexing with that mask keeps only the rows where it is True.

df[df["Age"]>23]

Unnamed: 0,Name,Age,scores,City,Department
1,Raj,24,74,Noida,HR
2,Kartik,25,79,Mumbai,Admin
4,Anurag,24,74,Noida,Sales
5,Shivam,25,69,Delhi,IT
6,Hitesh,26,66,Delhi,Sales


In [61]:
# Extract all records where Age > 23 and City is Noida.
# Each condition needs its own parentheses because & binds tighter than > and ==.

df[(df["Age"]>23) & (df["City"]=="Noida")]

Unnamed: 0,Name,Age,scores,City,Department
1,Raj,24,74,Noida,HR
4,Anurag,24,74,Noida,Sales


In [64]:
# Extract all records where scores > 70 or Department is not Sales.
# Each condition needs its own parentheses because | binds tighter than > and !=.

df[(df["scores"]>70) | (df["Department"]!="Sales")]

Unnamed: 0,Name,Age,scores,City,Department
0,Ankit,22,68,Delhi,IT
1,Raj,24,74,Noida,HR
2,Kartik,25,79,Mumbai,Admin
3,Rishabh,23,65,Delhi,IT
4,Anurag,24,74,Noida,Sales
5,Shivam,25,69,Delhi,IT


#### **Group-by operations**

In [65]:
# Department-wise sum of scores: rows are grouped by Department, then the
# scores column is summed within each group.

df.groupby(["Department"])["scores"].sum()

Department
Admin     79
HR        74
IT       202
Sales    140
Name: scores, dtype: int64

In [66]:
# Sum of scores for each (Department, City) combination; the result is a
# Series indexed by both grouping keys.

df.groupby(["Department", "City"])["scores"].sum()

Department  City  
Admin       Mumbai     79
HR          Noida      74
IT          Delhi     202
Sales       Delhi      66
            Noida      74
Name: scores, dtype: int64

In [67]:
# Number of names dir() reports for the dataframe, the pandas module and the
# numpy module -- a rough measure of how many attributes/methods each exposes.
print(len(dir(df)))
print(len(dir(pd)))
print(len(dir(np)))

443
144
603


In [68]:
# Writing the dataframe to a CSV file.
# to_csv also writes the row index as an unnamed first column unless
# index=False is passed.

df.to_csv("demo1.csv")

In [69]:
# Reading a CSV file back into a dataframe

# General form: pd.read_csv("Path_to_the_file")
# The "Unnamed: 0" column in the result is the row index that was saved into
# the file; pass index_col=0 to read it back as the index instead.
df1 = pd.read_csv("demo1.csv")
df1.head()

Unnamed: 0.1,Unnamed: 0,Name,Age,scores,City,Department
0,0,Ankit,22,68,Delhi,IT
1,1,Raj,24,74,Noida,HR
2,2,Kartik,25,79,Mumbai,Admin
3,3,Rishabh,23,65,Delhi,IT
4,4,Anurag,24,74,Noida,Sales
