## Data Manipulation with Pandas and Numpy

In [14]:
import pandas as pd
import numpy as np

In [15]:
# Load the dataset from the CSV file in the current working directory.
df = pd.read_csv('data.csv')
# Preview the first five rows (head() returns five rows by default).
df.head()

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,2023-01-01,A,28.0,Product1,754.0,East
1,2023-01-02,B,39.0,Product3,110.0,North
2,2023-01-03,C,32.0,Product2,398.0,East
3,2023-01-04,B,8.0,Product1,522.0,East
4,2023-01-05,B,26.0,Product3,869.0,North


In [16]:
# Summary statistics for the numerical columns: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,Value,Sales
count,47.0,46.0
mean,51.744681,557.130435
std,29.050532,274.598584
min,2.0,108.0
25%,27.5,339.0
50%,54.0,591.5
75%,70.0,767.5
max,99.0,992.0


In [17]:
# Handling missing values: count the NaN entries in each column.
df.isna().sum()

Date        0
Category    0
Value       3
Product     0
Sales       4
Region      0
dtype: int64

In [26]:
# Impute the missing Sales entries with the mean of the observed Sales values
# (Series.mean() skips NaN by default, so only real observations contribute).
sales_mean = df['Sales'].mean()
df['Sales'] = df['Sales'].fillna(sales_mean)
df.head()

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region
0,2023-01-01,A,28,Product1,754.0,East
1,2023-01-02,B,39,Product3,110.0,North
2,2023-01-03,C,32,Product2,398.0,East
3,2023-01-04,B,8,Product1,522.0,East
4,2023-01-05,B,26,Product3,869.0,North


In [21]:
# Rename the 'Date' column to the more descriptive 'Sales Date'.
df = df.rename(columns={'Date': 'Sales Date'})
df.head()

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sales_fillNA
0,2023-01-01,A,28.0,Product1,754.0,East,754.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0


In [22]:
# Changing datatypes: fill the missing Value entries with the column mean,
# then cast the column to int.
# Note: astype(int) truncates the fractional part, so a non-integer mean is
# stored rounded toward zero rather than to the nearest integer.
value_filled = df['Value'].fillna(df['Value'].mean())
df['Value'] = value_filled.astype(int)
df.head()

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sales_fillNA
0,2023-01-01,A,28,Product1,754.0,East,754.0
1,2023-01-02,B,39,Product3,110.0,North,110.0
2,2023-01-03,C,32,Product2,398.0,East,398.0
3,2023-01-04,B,8,Product1,522.0,East,522.0
4,2023-01-05,B,26,Product3,869.0,North,869.0


In [27]:
# Check whether any missing values remain: per-column NaN counts.
df.isna().sum()


Sales Date    0
Category      0
Value         0
Product       0
Sales         0
Region        0
dtype: int64

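#### The check above confirms that no column contains missing values any more: the NaNs in `Sales` and `Value` were replaced with the mean of their respective columns (truncated to an integer for `Value`).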

## Data Aggregation and Grouping

In [28]:
# Average Value for each Product.
grouped_mean = df.groupby('Product')['Value'].mean()
# One print call: the header, a blank line, then the grouped Series.
print('Grouped Mean of Products: \n', grouped_mean, sep='\n')

Grouped Mean of Products: 

Product
Product1    46.812500
Product2    52.800000
Product3    54.947368
Name: Value, dtype: float64


In [29]:
# Total Value for every (Product, Region) combination.
grouped_sum = (
    df.groupby(['Product', 'Region'])['Value']
    .sum()
)
# One print call: the header, a blank line, then the grouped Series.
print("Grouped Sum of products based on regions: \n", grouped_sum, sep="\n")

Grouped Sum of products based on regions: 

Product   Region
Product1  East      292
          North      60
          South     100
          West      297
Product2  East       56
          North     127
          South     181
          West      428
Product3  East      202
          North     254
          South     215
          West      373
Name: Value, dtype: int64


In [30]:
# Aggregate with multiple functions in one agg() call:
# per-Region mean, sum and non-null count of Value.
grouped_agg = (
    df.groupby('Region')['Value']
    .agg(['mean', 'sum', 'count'])
)
print(grouped_agg)

             mean   sum  count
Region                        
East    42.307692   550     13
North   40.090909   441     11
South   62.000000   496      8
West    61.000000  1098     18


In [33]:
# Merging and joining DataFrames: build two small frames that share a 'Key'
# column. Keys 'A' and 'B' appear in both; 'C' only in df1 and 'D' only in df2.
df1 = pd.DataFrame({
    'Key': ['A', 'B', 'C'],
    'Value1': [1, 2, 3],
})
df2 = pd.DataFrame({
    'Key': ['A', 'B', 'D'],
    'Value2': [4, 5, 6],
})


In [34]:
# Outer join on 'Key': keys from both frames are kept, and cells with no
# match in the other frame are filled with NaN.
df1.merge(df2, on='Key', how='outer')

Unnamed: 0,Key,Value1,Value2
0,A,1.0,4.0
1,B,2.0,5.0
2,C,3.0,
3,D,,6.0
