# Difficult type questions of pandas practice. #

## We use DataFrame from pandas practice kernel that is being extracted into csv format

In [3]:
# Let's import all necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df = pd.read_csv(r'E:\Study\Projects\EDA\pandas_practice_dataframe_output.csv')
df.head()

Unnamed: 0,id,name,region,product,category,total_sales,order_date,price,quantity,date,...,doll_id,light_id,cumulative_sales,sales_category,profit_margin,sales_rank,rolling_profit_average,z-score,currency,average_indicator
0,ID_2,Name_8,East,Arun,Clothing,0.762919,2024-02-15,38.0,1.0,2022-01-02,...,1.0,0.429664,6930.53,Medium,48.884717,2.0,,0.587069,98.0,Above Average
1,ID_5,Name_18,West,Product_8,Food,0.712614,2024-01-24,57.0,0.268817,2022-01-04,...,0.702105,0.456353,13426.93,Medium,73.418817,4.0,,0.434,623.0,Above Average
2,ID_6,Name_16,North,senthalampoo,Food,0.075173,2024-07-10,47.0,0.172043,2022-01-05,...,0.355292,0.576605,14422.29,Low,2.716605,5.0,0.512865,-1.505593,688.0,Below Average
3,ID_7,Name_15,East,Aran,Food,0.545764,2024-09-25,38.0,0.11828,2022-01-07,...,0.706123,0.458541,19478.8,Medium,24.53016,5.0,0.391797,-0.073686,24.0,Below Average
4,ID_8,Name_9,South,rama,Clothing,0.177651,2024-08-27,38.0,0.215054,2022-01-07,...,0.503804,0.220327,21358.54,Low,22.353091,3.0,0.146606,-1.193772,353.0,Below Average


## 1.Write a script to impute missing values in a DataFrame using KNN

In [8]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer

# Seperate the datetime columns
date_time_features = df.select_dtypes(include=['datetime64']).columns
numerical_features = df.select_dtypes(include=['number']).columns
categorical_features = df.select_dtypes(include=['object']).columns

# Encoding categorical variables
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Exclude datetime columns before imputation
df_numeric = df[numerical_features.union(categorical_features)] # keep only numeric and categorical variables

# Apply KNN imputer
imputer = KNNImputer(n_neighbors=3)
df_imputed = pd.DataFrame(imputer.fit_transform(df_numeric), columns=df_numeric.columns)

# Decode categorical variables
for colu in label_encoders:
    df_imputed[colu] = df_imputed[colu].round().astype(int)
    df_imputed[colu] = label_encoders[colu].inverse_transform(df_imputed[colu])

print(df_imputed)

   average_indicator    bag_id  board_id   book_id     category  cloth_id  \
0      Above Average  0.613341  0.436119  0.904917     Clothing  0.923711   
1      Above Average  0.294167  1.000000  0.585766         Food  0.887395   
2      Below Average  0.215403  0.697915  0.863709         Food  0.911999   
3      Below Average  0.359435  0.069997  0.262973         Food  0.313493   
4      Below Average  0.561757  0.893666  0.798554     Clothing  0.362644   
5      Above Average  0.884667  0.481267  0.480461  Accessories  0.835687   
6      Above Average  0.473555  0.715940  0.532736    Furniture  0.599012   
7      Below Average  0.799988  0.198517  0.491860  Accessories  0.618824   
8      Below Average  0.100066  0.866201  0.444671  Electronics  0.760513   
9      Below Average  0.420646  0.219301  0.156668         Food  0.319139   
10     Above Average  1.000000  0.330687  0.450373    Furniture  0.785465   
11     Above Average  0.764678  0.365677  0.409638  Accessories  0.870535   

## 2.Create a DataFrame with hierarchical indexes based on region and category and calculate group statistics

In [11]:
# Since we have object and datetime DataType we need to filter it before we do aggregation
dropped_feature = df.drop(columns=['order_date', 'col_62', 'sales_category', 'rolling_profit_average'], inplace=True)

# We first create hierarchical index based on region and category column.
hierarchial_index = df.set_index(['region', 'category'], inplace=True)
hierarchial_df = pd.DataFrame(data=df, index=hierarchial_index, columns=df.columns)
hierarchial_df.groupby(['region', 'category']).agg(['sum', 'mean', 'median'])

Unnamed: 0_level_0,Unnamed: 1_level_0,id,id,id,name,name,name,product,product,product,total_sales,...,sales_rank,z-score,z-score,z-score,currency,currency,currency,average_indicator,average_indicator,average_indicator
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,median,sum,mean,median,sum,mean,median,sum,...,median,sum,mean,median,sum,mean,median,sum,mean,median
region,category,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
0,1,6,6.0,6.0,16,16.0,16.0,1,1.0,1.0,0.762919,...,2.0,0.587069,0.587069,0.587069,98.0,98.0,98.0,0,0.0,0.0
0,3,35,17.5,17.5,15,7.5,7.5,12,6.0,6.0,1.506824,...,3.0,1.116281,0.558141,0.558141,477.0,238.5,238.5,2,1.0,1.0
0,4,23,7.666667,10.0,31,10.333333,10.0,20,6.666667,8.0,1.277046,...,4.0,-1.31721,-0.43907,0.086008,1390.0,463.333333,462.0,2,0.666667,1.0
1,0,21,21.0,21.0,14,14.0,14.0,5,5.0,5.0,0.643033,...,3.0,0.222282,0.222282,0.222282,241.0,241.0,241.0,0,0.0,0.0
1,2,11,11.0,11.0,13,13.0,13.0,3,3.0,3.0,0.782795,...,2.0,0.647548,0.647548,0.647548,118.0,118.0,118.0,0,0.0,0.0
1,3,21,10.5,10.5,13,6.5,6.5,26,13.0,13.0,0.567046,...,4.5,-1.743256,-0.871628,-0.871628,1284.0,642.0,642.0,2,1.0,1.0
1,4,4,4.0,4.0,6,6.0,6.0,14,14.0,14.0,1.0,...,1.0,1.308454,1.308454,1.308454,220.0,220.0,220.0,0,0.0,0.0
2,1,35,17.5,17.5,18,9.0,9.0,18,9.0,9.0,1.148614,...,2.0,0.026327,0.013163,0.013163,1071.0,535.5,535.5,1,0.5,0.5
2,2,2,2.0,2.0,15,15.0,15.0,13,13.0,13.0,0.0,...,4.0,-1.734326,-1.734326,-1.734326,501.0,501.0,501.0,1,1.0,1.0
2,3,7,7.0,7.0,7,7.0,7.0,4,4.0,4.0,0.787603,...,2.0,0.662177,0.662177,0.662177,239.0,239.0,239.0,1,1.0,1.0
