In [1]:
### Data manipulation with pandas
import pandas as pd

# Location of the input CSV, relative to the notebook's working directory.
DATA_PATH = 'data.csv'

# Read the raw table into a DataFrame; later cells refer to it as `df`.
df = pd.read_csv(DATA_PATH)

In [2]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,2023-01-01,A,28.0,Product1,754.0,East
1,2023-01-02,B,39.0,Product3,110.0,North
2,2023-01-03,C,32.0,Product2,398.0,East
3,2023-01-04,B,8.0,Product1,522.0,East
4,2023-01-05,B,26.0,Product3,869.0,North


In [3]:
# Preview the last five rows (tail() defaults to n=5).
df.tail()

Unnamed: 0,Date,Category,Value,Product,Sales,Region
45,2023-02-15,B,99.0,Product2,599.0,West
46,2023-02-16,B,6.0,Product1,938.0,South
47,2023-02-17,B,69.0,Product3,143.0,West
48,2023-02-18,C,65.0,Product3,182.0,North
49,2023-02-19,C,11.0,Product3,708.0,North


In [4]:
# List the dtype pandas assigned to each column of the loaded frame.
df.dtypes

Date         object
Category     object
Value       float64
Product      object
Sales       float64
Region       object
dtype: object

In [5]:
# Element-wise missing-value mask: True marks a missing cell.
# isna() is the same method as isnull() under its primary name.
df.isna()

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False


In [7]:
# Number of missing entries in each column (True counts as 1 when summed).
df.isna().sum()

Date        0
Category    0
Value       3
Product     0
Sales       4
Region      0
dtype: int64

In [8]:
# Build a separate frame in which every missing cell is replaced by 0;
# fillna() returns a new object, so `df` keeps its gaps.
df_filled = df.fillna(value=0)
# Confirm that no column has missing entries left.
df_filled.isna().sum()

Date        0
Category    0
Value       0
Product     0
Sales       0
Region      0
dtype: int64

In [9]:
## Fill missing values with the mean of the column
# Series.mean() skips NaN by default, so the average comes from the observed
# entries only. The result is stored in a new column; 'Value' keeps its gaps.
df['filled_val'] = df['Value'].fillna(df['Value'].mean())
# Show a bounded preview instead of rendering the entire frame.
df.head(10)

Unnamed: 0,Date,Category,Value,Product,Sales,Region,filled_val
0,2023-01-01,A,28.0,Product1,754.0,East,28.0
1,2023-01-02,B,39.0,Product3,110.0,North,39.0
2,2023-01-03,C,32.0,Product2,398.0,East,32.0
3,2023-01-04,B,8.0,Product1,522.0,East,8.0
4,2023-01-05,B,26.0,Product3,869.0,North,26.0
5,2023-01-06,B,54.0,Product3,192.0,West,54.0
6,2023-01-07,A,16.0,Product1,936.0,East,16.0
7,2023-01-08,C,89.0,Product1,488.0,West,89.0
8,2023-01-09,C,37.0,Product3,772.0,West,37.0
9,2023-01-10,A,22.0,Product2,834.0,West,22.0


In [10]:
# Re-check the column dtypes now that 'filled_val' is part of the frame.
df.dtypes

Date           object
Category       object
Value         float64
Product        object
Sales         float64
Region         object
filled_val    float64
dtype: object

In [13]:
## Renaming columns
# rename() returns a new frame, so the result is bound back to `df`.
df = df.rename({'Date': 'Sale_Date'}, axis='columns')
df.head(6)

Unnamed: 0,Sale_Date,Category,Value,Product,Sales,Region,filled_val
0,2023-01-01,A,28.0,Product1,754.0,East,28.0
1,2023-01-02,B,39.0,Product3,110.0,North,39.0
2,2023-01-03,C,32.0,Product2,398.0,East,32.0
3,2023-01-04,B,8.0,Product1,522.0,East,8.0
4,2023-01-05,B,26.0,Product3,869.0,North,26.0
5,2023-01-06,B,54.0,Product3,192.0,West,54.0


In [14]:
# Convert the imputed column to integers. The gaps were filled with the column
# mean, which is generally fractional, so round to the nearest integer first:
# astype(int) on its own would silently truncate the fraction toward zero.
df['filled_val'] = df['filled_val'].round().astype(int)
df.head()

Unnamed: 0,Sale_Date,Category,Value,Product,Sales,Region,filled_val
0,2023-01-01,A,28.0,Product1,754.0,East,28
1,2023-01-02,B,39.0,Product3,110.0,North,39
2,2023-01-03,C,32.0,Product2,398.0,East,32
3,2023-01-04,B,8.0,Product1,522.0,East,8
4,2023-01-05,B,26.0,Product3,869.0,North,26


In [16]:
# Double every entry with a vectorized multiplication rather than calling a
# Python lambda once per element through apply(); the resulting column is the same.
df['New_Value'] = df['filled_val'] * 2
df.head()

Unnamed: 0,Sale_Date,Category,Value,Product,Sales,Region,filled_val,New_Value
0,2023-01-01,A,28.0,Product1,754.0,East,28,56
1,2023-01-02,B,39.0,Product3,110.0,North,39,78
2,2023-01-03,C,32.0,Product2,398.0,East,32,64
3,2023-01-04,B,8.0,Product1,522.0,East,8,16
4,2023-01-05,B,26.0,Product3,869.0,North,26,52


In [17]:
## Data aggregation and grouping
# Show the current columns before grouping on them.
df.head()

Unnamed: 0,Sale_Date,Category,Value,Product,Sales,Region,filled_val,New_Value
0,2023-01-01,A,28.0,Product1,754.0,East,28,56
1,2023-01-02,B,39.0,Product3,110.0,North,39,78
2,2023-01-03,C,32.0,Product2,398.0,East,32,64
3,2023-01-04,B,8.0,Product1,522.0,East,8,16
4,2023-01-05,B,26.0,Product3,869.0,North,26,52


In [18]:
# Average 'Value' within each Product group.
value_by_product = df.groupby('Product')['Value']
grouped_mean = value_by_product.mean()
print(grouped_mean)

Product
Product1    46.214286
Product2    52.800000
Product3    55.166667
Name: Value, dtype: float64


In [20]:
# Total 'Value' for every (Product, Region) pair; the result is indexed by both keys.
grouped_sum = df.groupby(by=['Product', 'Region'])['Value'].sum()
print(grouped_sum)

Product   Region
Product1  East      292.0
          North       9.0
          South     100.0
          West      246.0
Product2  East       56.0
          North     127.0
          South     181.0
          West      428.0
Product3  East      202.0
          North     203.0
          South     215.0
          West      373.0
Name: Value, dtype: float64


In [21]:
# Mean, total and non-null count of 'Value' per Region, one output column per statistic.
value_by_region = df.groupby('Region')['Value']
grouped_agg = value_by_region.aggregate(['mean', 'sum', 'count'])
grouped_agg

Unnamed: 0_level_0,mean,sum,count
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
East,42.307692,550.0,13
North,37.666667,339.0,9
South,62.0,496.0,8
West,61.588235,1047.0,17


In [27]:
## Merging and joining DataFrames
# Two small frames whose 'key' columns overlap only on 'c'. The payload columns
# differ in case ('Value' vs 'value'), so they remain separate columns in a merge.
df1 = pd.DataFrame({'key': list('abc'), 'Value': [1, 2, 3]})
df2 = pd.DataFrame({'key': list('cde'), 'value': [4, 5, 6]})


In [28]:
# Display the left-hand frame used in the merge examples.
df1

Unnamed: 0,key,Value
0,a,1
1,b,2
2,c,3


In [29]:
# Display the right-hand frame used in the merge examples.
df2

Unnamed: 0,key,value
0,c,4
1,d,5
2,e,6


In [30]:
## Inner join on the 'key' column: keep only keys present in both frames
df1.merge(df2, on='key', how='inner')

Unnamed: 0,key,Value,value
0,c,3,4


In [31]:
# Outer join on 'key': keep the keys of both frames; cells without a match are NaN.
df1.merge(df2, on='key', how='outer')

Unnamed: 0,key,Value,value
0,a,1.0,
1,b,2.0,
2,c,3.0,4.0
3,d,,5.0
4,e,,6.0


In [32]:
# Left join on 'key': keep every row of df1 and attach df2's 'value' where the key matches.
df1.merge(df2, on='key', how='left')

Unnamed: 0,key,Value,value
0,a,1,
1,b,2,
2,c,3,4.0


In [33]:
# Right join on 'key': keep every row of df2 and attach df1's 'Value' where the key matches.
df1.merge(df2, on='key', how='right')

Unnamed: 0,key,Value,value
0,c,3.0,4
1,d,,5
2,e,,6
