# Data Manipulation and Analysis with Pandas
## Data manipulation and analysis are key tasks in any data science or data analysis project. Pandas provides a wide range of functions for data manipulation and analysis, making it easier to clean, transform, and extract insights from data. In this lesson, we will cover various data manipulation and analysis techniques using Pandas.

In [19]:
import pandas as pd
df = pd.read_csv('data.csv')
df.head(5)

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,2023-01-01,A,28.0,Product1,754.0,East
1,2023-01-02,B,39.0,Product3,110.0,North
2,2023-01-03,C,32.0,Product2,398.0,East
3,2023-01-04,B,8.0,Product1,522.0,East
4,2023-01-05,B,26.0,Product3,869.0,North


In [20]:
df.tail(5)

Unnamed: 0,Date,Category,Value,Product,Sales,Region
45,2023-02-15,B,99.0,Product2,599.0,West
46,2023-02-16,B,6.0,Product1,938.0,South
47,2023-02-17,B,69.0,Product3,143.0,West
48,2023-02-18,C,65.0,Product3,182.0,North
49,2023-02-19,C,11.0,Product3,708.0,North


In [21]:
df.describe()

Unnamed: 0,Value,Sales
count,47.0,46.0
mean,51.744681,557.130435
std,29.050532,274.598584
min,2.0,108.0
25%,27.5,339.0
50%,54.0,591.5
75%,70.0,767.5
max,99.0,992.0


In [22]:
df.dtypes

Date         object
Category     object
Value       float64
Product      object
Sales       float64
Region       object
dtype: object

In [23]:
## Handling missing values
df.isnull().any()

Date        False
Category    False
Value        True
Product     False
Sales        True
Region      False
dtype: bool

In [24]:
df.isnull().sum()

Date        0
Category    0
Value       3
Product     0
Sales       4
Region      0
dtype: int64

In [25]:
df_filled = df.fillna(0)

In [26]:
## filling the missing value with mean of the column
df['Sales_fillNA'] = df['Sales'].fillna(df['Sales'].mean())
df

Unnamed: 0,Date,Category,Value,Product,Sales,Region,Sales_fillNA
0,2023-01-01,A,28.0,Product1,754.0,East,754.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0
5,2023-01-06,B,54.0,Product3,192.0,West,192.0
6,2023-01-07,A,16.0,Product1,936.0,East,936.0
7,2023-01-08,C,89.0,Product1,488.0,West,488.0
8,2023-01-09,C,37.0,Product3,772.0,West,772.0
9,2023-01-10,A,22.0,Product2,834.0,West,834.0


In [27]:
df.dtypes

Date             object
Category         object
Value           float64
Product          object
Sales           float64
Region           object
Sales_fillNA    float64
dtype: object

In [29]:
## Renaming columns
df = df.rename(columns={'Date': 'Sales Date'})
df.head()

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sales_fillNA
0,2023-01-01,A,28.0,Product1,754.0,East,754.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0


In [32]:
## change data type
df['Value_new'] = df['Value'].fillna(df['Value'].mean()).astype(int)
df.head()

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sales_fillNA,Value_new
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26


In [35]:
df['New Value'] = df['Value'].apply(lambda x:x*2)
df.head()

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sales_fillNA,Value_new,New Value
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28,56.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39,78.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32,64.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8,16.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26,52.0


In [36]:
## Data Aggregating and Grouping
df.head()

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sales_fillNA,Value_new,New Value
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28,56.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39,78.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32,64.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8,16.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26,52.0


In [37]:
grouped_mean = df.groupby('Product')['Value'].mean()
print(grouped_mean)

Product
Product1    46.214286
Product2    52.800000
Product3    55.166667
Name: Value, dtype: float64


In [38]:
grouped_sum = df.groupby(['Product', 'Region'])['Value'].sum()
print(grouped_sum)

Product   Region
Product1  East      292.0
          North       9.0
          South     100.0
          West      246.0
Product2  East       56.0
          North     127.0
          South     181.0
          West      428.0
Product3  East      202.0
          North     203.0
          South     215.0
          West      373.0
Name: Value, dtype: float64


In [39]:
df.groupby(['Product', 'Region'])['Value'].mean()

Product   Region
Product1  East      41.714286
          North      4.500000
          South     50.000000
          West      82.000000
Product2  East      28.000000
          North     63.500000
          South     60.333333
          West      53.500000
Product3  East      50.500000
          North     40.600000
          South     71.666667
          West      62.166667
Name: Value, dtype: float64

In [42]:
## Aggregate multiple function
grouped_agg = df.groupby('Region')['Value'].agg(['mean', 'sum', 'count'])
grouped_agg

Unnamed: 0_level_0,mean,sum,count
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
East,42.307692,550.0,13
North,37.666667,339.0,9
South,62.0,496.0,8
West,61.588235,1047.0,17


In [50]:
## Merging and Joining DataFrame
# Creating sample DataFrames
df1 = pd.DataFrame({'Key':['A','B','C'], 'Value1':[1,2,3]})
df2 = pd.DataFrame({'Key':['A','B','D'], 'Value2':[4,5,6]})

In [51]:
df1

Unnamed: 0,Key,Value1
0,A,1
1,B,2
2,C,3


In [52]:
df2

Unnamed: 0,Key,Value2
0,A,4
1,B,5
2,D,6


In [54]:
## Merge DataFrame on 'Key' Columns
pd.merge(df1, df2, on='Key', how='outer')

Unnamed: 0,Key,Value1,Value2
0,A,1.0,4.0
1,B,2.0,5.0
2,C,3.0,
3,D,,6.0


In [56]:
pd.merge(df1, df2, on='Key', how='left')

Unnamed: 0,Key,Value1,Value2
0,A,1,4.0
1,B,2,5.0
2,C,3,


In [57]:
pd.merge(df1, df2, on='Key', how='right')

Unnamed: 0,Key,Value1,Value2
0,A,1.0,4
1,B,2.0,5
2,D,,6


##### Pandas for 'sales1_data.csv' dataset

In [86]:
import pandas as pd
df = pd.read_csv('sales1_data.csv')
df.head(5)

Unnamed: 0,Date,Product,Sales,Region
0,2023-01-01,Product3,738.0,West
1,2023-01-02,Product2,868.0,North
2,2023-01-03,Product2,554.0,West
3,2023-01-04,Product1,618.0,South
4,2023-01-05,Product3,501.0,East


In [87]:
df.tail(5) ## last 5

Unnamed: 0,Date,Product,Sales,Region
20,2023-01-21,Product2,,West
21,2023-01-22,Product3,296.0,West
22,2023-01-23,Product2,737.0,West
23,2023-01-24,Product3,531.0,East
24,2023-01-25,Product2,834.0,North


In [88]:
df.describe() # statistical values/measures

Unnamed: 0,Sales
count,19.0
mean,554.0
std,231.543372
min,124.0
25%,381.5
50%,554.0
75%,737.5
max,868.0


In [89]:
df.dtypes

Date        object
Product     object
Sales      float64
Region      object
dtype: object

In [90]:
df.isnull().any() #returns direct missing values

Date       False
Product    False
Sales       True
Region     False
dtype: bool

In [91]:
df.isnull().sum() # how many missing values in a column

Date       0
Product    0
Sales      6
Region     0
dtype: int64

In [92]:
df_filled = df.fillna(0)

In [93]:
## filling the missing values with mean of the columns
df['Sales_fillNA'] = df['Sales'].fillna(df['Sales'].mean())
df

Unnamed: 0,Date,Product,Sales,Region,Sales_fillNA
0,2023-01-01,Product3,738.0,West,738.0
1,2023-01-02,Product2,868.0,North,868.0
2,2023-01-03,Product2,554.0,West,554.0
3,2023-01-04,Product1,618.0,South,618.0
4,2023-01-05,Product3,501.0,East,501.0
5,2023-01-06,Product1,,West,554.0
6,2023-01-07,Product3,339.0,South,339.0
7,2023-01-08,Product3,280.0,South,280.0
8,2023-01-09,Product2,806.0,North,806.0
9,2023-01-10,Product2,816.0,South,816.0


In [94]:
df.dtypes

Date             object
Product          object
Sales           float64
Region           object
Sales_fillNA    float64
dtype: object

In [95]:
## Rename columns
df = df.rename(columns = {'Date':'Sales Date'})
df.head()

Unnamed: 0,Sales Date,Product,Sales,Region,Sales_fillNA
0,2023-01-01,Product3,738.0,West,738.0
1,2023-01-02,Product2,868.0,North,868.0
2,2023-01-03,Product2,554.0,West,554.0
3,2023-01-04,Product1,618.0,South,618.0
4,2023-01-05,Product3,501.0,East,501.0


In [96]:
## Change datatype
df['Sales_int'] = df['Sales'].fillna(df['Sales'].mean()).astype(int)
df.head()

Unnamed: 0,Sales Date,Product,Sales,Region,Sales_fillNA,Sales_int
0,2023-01-01,Product3,738.0,West,738.0,738
1,2023-01-02,Product2,868.0,North,868.0,868
2,2023-01-03,Product2,554.0,West,554.0,554
3,2023-01-04,Product1,618.0,South,618.0,618
4,2023-01-05,Product3,501.0,East,501.0,501


In [97]:
df['Double Sales'] = df['Sales'].apply(lambda x:x*2)
df.head()

Unnamed: 0,Sales Date,Product,Sales,Region,Sales_fillNA,Sales_int,Double Sales
0,2023-01-01,Product3,738.0,West,738.0,738,1476.0
1,2023-01-02,Product2,868.0,North,868.0,868,1736.0
2,2023-01-03,Product2,554.0,West,554.0,554,1108.0
3,2023-01-04,Product1,618.0,South,618.0,618,1236.0
4,2023-01-05,Product3,501.0,East,501.0,501,1002.0


In [None]:
## Aggregating and Grouping
grouped_mean = df.groupby('Product')['Sales'].mean()
grouped_mean

Product
Product1    408.333333
Product2    677.000000
Product3    506.888889
Name: Sales, dtype: float64

In [None]:
grouped_sum = df.groupby('Product')['Sales'].sum()
grouped_sum

Product
Product1    1225.0
Product2    4739.0
Product3    4562.0
Name: Sales, dtype: float64

In [113]:
df.groupby(['Product', 'Region'])['Sales'].mean()

Product   Region
Product1  South     408.333333
          West             NaN
Product2  North     658.000000
          South     816.000000
          West      645.500000
Product3  East      569.333333
          South     309.500000
          West      558.750000
Name: Sales, dtype: float64

In [112]:
df.groupby(['Product', 'Region']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales Date,Sales,Sales_fillNA,Sales_int,Double Sales
Product,Region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Product1,South,2023-01-042023-01-152023-01-162023-01-172023-0...,1225.0,2333.0,2333,2450.0
Product1,West,2023-01-06,0.0,554.0,554,0.0
Product2,North,2023-01-022023-01-092023-01-192023-01-25,2632.0,2632.0,2632,5264.0
Product2,South,2023-01-10,816.0,816.0,816,1632.0
Product2,West,2023-01-032023-01-212023-01-23,1291.0,1845.0,1845,2582.0
Product3,East,2023-01-052023-01-122023-01-132023-01-142023-0...,1708.0,2816.0,2816,3416.0
Product3,South,2023-01-072023-01-08,619.0,619.0,619,1238.0
Product3,West,2023-01-012023-01-112023-01-202023-01-22,2235.0,2235.0,2235,4470.0


In [104]:
## Aggregate multiple functions
grouped_agg = df.groupby('Region')['Sales'].agg(['mean', 'sum', 'count'])
grouped_agg

Unnamed: 0_level_0,mean,sum,count
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
East,569.333333,1708.0,3
North,658.0,2632.0,4
South,443.333333,2660.0,6
West,587.666667,3526.0,6


In [105]:
## Merging and joing dataframes
## Creting dataframes
df1 = pd.DataFrame({'Key':['A','B','C'], 'Value1':[1,2,3]})
df2 = pd.DataFrame({'Key':['A','B','D'], 'Value2':[4,5,6]})

In [106]:
df1

Unnamed: 0,Key,Value1
0,A,1
1,B,2
2,C,3


In [107]:
df2

Unnamed: 0,Key,Value2
0,A,4
1,B,5
2,D,6


In [109]:
## merging df1 and df2 on 'Key' column (common column)
pd.merge(df1, df2, on='Key', how='outer')

Unnamed: 0,Key,Value1,Value2
0,A,1.0,4.0
1,B,2.0,5.0
2,C,3.0,
3,D,,6.0


In [110]:
pd.merge(df1, df2, on='Key', how='left')

Unnamed: 0,Key,Value1,Value2
0,A,1,4.0
1,B,2,5.0
2,C,3,


In [111]:
pd.merge(df1, df2, on='Key', how='right')

Unnamed: 0,Key,Value1,Value2
0,A,1.0,4
1,B,2.0,5
2,D,,6
