# Difficult type questions of pandas practice. #

## We use DataFrame from pandas practice kernel that is being extracted into csv format

In [3]:
# Let's import all necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df = pd.read_csv(r'E:\Study\Projects\EDA\pandas_practice_dataframe_output.csv')
df.head()

Unnamed: 0,id,name,region,product,category,total_sales,order_date,price,quantity,date,...,doll_id,light_id,cumulative_sales,sales_category,profit_margin,sales_rank,rolling_profit_average,z-score,currency,average_indicator
0,ID_2,Name_8,East,Arun,Clothing,0.762919,2024-08-10,42.0,1.0,2022-01-02,...,0.244462,0.476602,6930.53,Medium,48.884717,2.0,,0.587069,297.0,Above Average
1,ID_5,Name_18,West,Product_8,Food,0.712614,2024-11-26,28.0,0.268817,2022-01-04,...,0.40893,0.460069,13426.93,Medium,73.418817,4.0,,0.434,524.0,Above Average
2,ID_6,Name_16,North,senthalampoo,Food,0.075173,2024-12-02,42.0,0.172043,2022-01-05,...,0.390509,0.147329,14422.29,Low,2.716605,5.0,0.512865,-1.505593,795.0,Below Average
3,ID_7,Name_15,East,Aran,Food,0.545764,2024-05-06,64.0,0.11828,2022-01-07,...,0.0,0.674689,19478.8,Medium,24.53016,5.0,0.391797,-0.073686,65.0,Below Average
4,ID_8,Name_9,South,rama,Clothing,0.177651,2024-09-04,60.0,0.215054,2022-01-07,...,0.235044,0.500523,21358.54,Low,22.353091,3.0,0.146606,-1.193772,257.0,Below Average


## 1.Write a script to impute missing values in a DataFrame using KNN

In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer

# Seperate the datetime columns
date_time_features = df.select_dtypes(include=['datetime64']).columns
numerical_features = df.select_dtypes(include=['number']).columns
categorical_features = df.select_dtypes(include=['object']).columns

# Encoding categorical variables
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Exclude datetime columns before imputation
df_numeric = df[numerical_features.union(categorical_features)] # keep only numeric and categorical variables

# Apply KNN imputer
imputer = KNNImputer(n_neighbors=3)
df_imputed = pd.DataFrame(imputer.fit_transform(df_numeric), columns=df_numeric.columns)

# Decode categorical variables
for colu in label_encoders:
    df_imputed[colu] = df_imputed[colu].round().astype(int)
    df_imputed[colu] = label_encoders[colu].inverse_transform(df_imputed[colu])

print(df_imputed)

   average_indicator    bag_id  board_id   book_id     category  cloth_id  \
0      Above Average  0.670603  0.530714  0.668087     Clothing  0.631579   
1      Above Average  0.449100  0.590280  0.880026         Food  0.293912   
2      Below Average  0.421383  0.714948  0.699477         Food  0.226997   
3      Below Average  0.163749  0.615874  0.656665         Food  0.973223   
4      Below Average  0.695031  1.000000  0.650894     Clothing  0.000000   
5      Above Average  0.561254  0.030094  0.947255  Accessories  0.024710   
6      Above Average  0.466934  0.186578  0.378485    Furniture  0.562580   
7      Below Average  0.368001  0.625733  0.632913  Accessories  0.449343   
8      Below Average  0.331098  0.199359  0.885357  Electronics  0.330956   
9      Below Average  0.844403  0.924956  0.245461         Food  1.000000   
10     Above Average  0.215057  0.287680  0.178963    Furniture  0.536992   
11     Above Average  0.707816  0.071057  0.699959  Accessories  0.235508   

## 2.Create a DataFrame with hierarchical indexes based on region and category and calculate group statistics

In [8]:
# Since we have object and datetime DataType we need to filter it before we do aggregation
dropped_feature = df.drop(columns=['order_date', 'col_62', 'sales_category', 'rolling_profit_average'], inplace=True)

# We first create hierarchical index based on region and category column.
hierarchial_index = df.set_index(['region', 'category'], inplace=True)
hierarchial_df = pd.DataFrame(data=df, index=hierarchial_index, columns=df.columns)
hierarchial_df.groupby(['region', 'category']).agg(['sum', 'mean', 'median'])

Unnamed: 0_level_0,Unnamed: 1_level_0,id,id,id,name,name,name,product,product,product,total_sales,...,sales_rank,z-score,z-score,z-score,currency,currency,currency,average_indicator,average_indicator,average_indicator
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,median,sum,mean,median,sum,mean,median,sum,...,median,sum,mean,median,sum,mean,median,sum,mean,median
region,category,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
0,1,6,6.0,6.0,16,16.0,16.0,1,1.0,1.0,0.762919,...,2.0,0.587069,0.587069,0.587069,297.0,297.0,297.0,0,0.0,0.0
0,3,35,17.5,17.5,15,7.5,7.5,12,6.0,6.0,1.506824,...,3.0,1.116281,0.558141,0.558141,846.0,423.0,423.0,2,1.0,1.0
0,4,23,7.666667,10.0,31,10.333333,10.0,20,6.666667,8.0,1.277046,...,4.0,-1.31721,-0.43907,0.086008,1583.0,527.666667,515.0,2,0.666667,1.0
1,0,21,21.0,21.0,14,14.0,14.0,5,5.0,5.0,0.643033,...,3.0,0.222282,0.222282,0.222282,486.0,486.0,486.0,0,0.0,0.0
1,2,11,11.0,11.0,13,13.0,13.0,3,3.0,3.0,0.782795,...,2.0,0.647548,0.647548,0.647548,159.0,159.0,159.0,0,0.0,0.0
1,3,21,10.5,10.5,13,6.5,6.5,26,13.0,13.0,0.567046,...,4.5,-1.743256,-0.871628,-0.871628,916.0,458.0,458.0,2,1.0,1.0
1,4,4,4.0,4.0,6,6.0,6.0,14,14.0,14.0,1.0,...,1.0,1.308454,1.308454,1.308454,353.0,353.0,353.0,0,0.0,0.0
2,1,35,17.5,17.5,18,9.0,9.0,18,9.0,9.0,1.148614,...,2.0,0.026327,0.013163,0.013163,443.0,221.5,221.5,1,0.5,0.5
2,2,2,2.0,2.0,15,15.0,15.0,13,13.0,13.0,0.0,...,4.0,-1.734326,-1.734326,-1.734326,172.0,172.0,172.0,1,1.0,1.0
2,3,7,7.0,7.0,7,7.0,7.0,4,4.0,4.0,0.787603,...,2.0,0.662177,0.662177,0.662177,154.0,154.0,154.0,1,1.0,1.0


## 3.Write a function to detect and handle duplicate rows based on a fuzzy match

In [10]:
def detect_duplicates(df):
    duplicates = df.duplicated().sum()
    if duplicates > 0:
        print(f"There were {duplicates} duplicates in the entire DataFrame")
    else:
        print('There are no duplicates in a dataframe')

In [11]:
detect_duplicates(df)

There are no duplicates in a dataframe


In [12]:
# Since we don't have any duplicates in 'df' DataFrame, we create a new with 9 duplicates
# Creating a base DataFrame with 21 unique rows
np.random.seed(42)  # For reproducibility

values = {
    'ID': np.arange(1, 22),  # Unique IDs from 1 to 21
    'Salary': np.random.uniform(50000, 120000, 21).round(2),  # Random float salaries
    'Department': np.random.choice(['HR', 'IT', 'Finance', 'Sales', 'Marketing'], 21),  # Categorical
    'Joining_Date': pd.date_range(start='2015-01-01', periods=21, freq='M'),  # Datetime
    'Active': np.random.choice([True, False], 21),  # Boolean
    'City': np.random.choice(['New York', 'San Francisco', 'Chicago', 'Los Angeles', 'Austin'], 21)  # String
}

temp_data = pd.DataFrame(values)

# Creating 9 duplicate rows from the existing DataFrame
duplicates = temp_data.sample(n=9, random_state=42)  # Select 9 random rows to duplicate

# Append duplicate rows to the original DataFrame
concat_data = pd.concat([temp_data, duplicates], ignore_index=True)

# Display the DataFrame
print(concat_data)


    ID     Salary Department Joining_Date  Active           City
0    1   76217.81         IT   2015-01-31   False  San Francisco
1    2  116550.00      Sales   2015-02-28   False    Los Angeles
2    3  101239.58      Sales   2015-03-31   False         Austin
3    4   91906.09    Finance   2015-04-30    True  San Francisco
4    5   60921.30      Sales   2015-05-31   False  San Francisco
5    6   60919.62      Sales   2015-06-30    True    Los Angeles
6    7   54065.85         HR   2015-07-31   False  San Francisco
7    8  110632.33    Finance   2015-08-31    True  San Francisco
8    9   92078.05  Marketing   2015-09-30   False    Los Angeles
9   10   99565.08    Finance   2015-10-31    True    Los Angeles
10  11   51440.91  Marketing   2015-11-30    True       New York
11  12  117893.69         HR   2015-12-31   False         Austin
12  13  108270.98         IT   2016-01-31    True         Austin
13  14   64863.74      Sales   2016-02-29   False  San Francisco
14  15   62727.75        

  'Joining_Date': pd.date_range(start='2015-01-01', periods=21, freq='M'),  # Datetime


In [25]:
# Now let's start build a function to find the duplicates based on fuzzy match 
# 'thefuzz' is a library used to detect and handle duplicate rows based on a fuzzy string match
# We use 'fuzz' because it contains contains different similarity scoring functions
# We use 'process' because it provides utility functions for comparing a string against a list
from thefuzz import fuzz, process

# We build a function and pass df, column & threshold
# Threshold Setting: Define a similarity threshold (e.g., 90%) to determine duplicates.

def detect_fuzzy_duplicates(concat_data, column_name, threshold=90):
# Let's copy the dataframe to a variable so that the original dataframe is not being disturbed
    data = concat_data.copy()
    
# Let's initialize few helper variables
# Adding a placeholder column
    data['duplicate_column'] = -1 # Placeholder for grouping similar records
    checked_indices = set() # A set to keep track of rows that have already been matched
    group_id = 0 # A counter to assign group numbers to similar entries

# Iterates over each row in the specified column
    for i, text in enumerate(data[column_name]):
# i is the row index, and text is the value from the column
        if i in checked_indices:
            continue
# process.extract() compares text with all other values in the column.
# fuzz.token_sort_ratio calculates similarity, ignoring word order.
# limit=len(data): Ensures that all rows are compared.
        
        matches = process.extract(text, data[column_name], scorer=fuzz.token_sort_ratio, limit=len(data))

# match_text: The matched string.
# score: The similarity percentage (0 to 100).
# match_index: The row index of the matched string.
        for match_text, score, match_index in matches:
            if score >= threshold and match_index not in checked_indices:
# Assigns the current group_id to the Duplicate_Group column.
                data.at[match_index, 'duplicate_column'] = group_id
# Adds the matched index to checked_indices to avoid reprocessing.
                checked_indices.add(match_index)
# After processing a record and its matches, increments the group_id to start a new group for the next distinct set of matches.
        group_id += 1
# Returns the DataFrame with an additional Duplicate_Group column.
    return data

In [14]:
fuzzy_duplicates = detect_fuzzy_duplicates(concat_data, 'City')
print(fuzzy_duplicates)

    ID     Salary Department Joining_Date  Active           City  \
0    1   76217.81         IT   2015-01-31   False  San Francisco   
1    2  116550.00      Sales   2015-02-28   False    Los Angeles   
2    3  101239.58      Sales   2015-03-31   False         Austin   
3    4   91906.09    Finance   2015-04-30    True  San Francisco   
4    5   60921.30      Sales   2015-05-31   False  San Francisco   
5    6   60919.62      Sales   2015-06-30    True    Los Angeles   
6    7   54065.85         HR   2015-07-31   False  San Francisco   
7    8  110632.33    Finance   2015-08-31    True  San Francisco   
8    9   92078.05  Marketing   2015-09-30   False    Los Angeles   
9   10   99565.08    Finance   2015-10-31    True    Los Angeles   
10  11   51440.91  Marketing   2015-11-30    True       New York   
11  12  117893.69         HR   2015-12-31   False         Austin   
12  13  108270.98         IT   2016-01-31    True         Austin   
13  14   64863.74      Sales   2016-02-29   Fals