# Cleaning - missing and duplicate data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# plt.style.use('ggplot')
import pandas as pd
import numpy as np

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Set max rows and columns displayed in jupyter
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

## Generate a t-shirt order
Each order has a name, a t-shirt size, a t-shirt color and a weight (in pounds).<br>
Uses the <a href="https://pypi.org/project/names/">names</a> module to generate random names.

In [None]:
import utils as ut
df = ut.generate_tshirt_order()
df

### Make some duplicates

In [None]:
# make some duplicates just to show how to handle duplicates (delete them)
#take the first few rows of each t_shirt_size group; they get appended to df below as duplicates
def fun1(df,numb=5):
    """Return the leading ``numb`` rows of ``df`` with every column kept.

    Meant to be handed to ``groupby(...).apply`` so that each group
    contributes its first ``numb`` rows.
    """
    leading_rows = df.head(numb)
    return leading_rows


#generates numb rows from each group to be used as duplicates
df_dups=df.groupby('t_shirt_size').apply(fun1)
df_dups

### Append to original dataframe

In [None]:
# Stack the duplicate rows under the original rows; ignore_index=True renumbers
# the result 0..n-1 so labels and positions coincide.
# NOTE(review): this cell rebinds df from itself, so re-running it appends the
# duplicates a second time - run it exactly once per kernel session.
df=pd.concat([df,df_dups],ignore_index=True)
#the old way: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0
# df=df.append(df_dups, ignore_index=True)
len(df)

In [None]:
df.name.duplicated().sum()

### Randomly select 20% of rows for t_shirt_size omission

In [None]:
#first save orig size for later comparison
df['t_shirt_size_orig'] = df['t_shirt_size']

In [None]:
import random

# Draw the row positions from a dedicated, seeded generator so that
# Restart Kernel -> Run All blanks out the same positions every time
# (for a given number of rows), keeping the later comparisons reproducible.
rng = random.Random(42)

# 20% of the rows, sampled without replacement.
res = rng.sample(range(len(df)), int(0.2 * len(df)))
print(f'Number of rows to have "t_shirt_size" set to np.Nan is {len(res)}' )

In [None]:
#first, save orig size
# df['t_shirt_size_orig'] = df.loc[res,'t_shirt_size']

#then lose orig size
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
# and raises AttributeError there.
# res holds row positions; .loc takes labels, which match positions here only
# because df was rebuilt with ignore_index=True above.
df.loc[res,'t_shirt_size']=np.nan

In [None]:
#how many total
#df.isna().sum().sum()

#how many are null?
df.t_shirt_size.isna().sum()

In [None]:
#show the missing data
df[df.t_shirt_size.isna()].head()

In [None]:
df.head()

### Let's take a look at the distribution

#### kind ='hist', hist will bin the number of weights and display them, hue will determine which color group they belong to

In [None]:
sns.displot(data=df, x="weight",  hue="t_shirt_size", kind='hist', fill=True);

#### kind='kde': KDE is a kernel density estimator. It essentially places a Gaussian distribution around each point, adds these distributions together, and then divides by the number of points to get the smooth curves you see, which have an area of 1. I'm showing it because it's easier to see the distributions

In [None]:
sns.displot(data=df, x="weight",  hue="t_shirt_size", kind='kde', fill=True);

df.t_shirt_size.value_counts()

## What to do about duplicates?  Delete them!

### Find them first
start here 1/24/24

In [None]:
df.duplicated(keep=False).sum()

### Visually Verify 

In [None]:
df[df.duplicated(keep=False)].sort_values(by='name')

### Now drop the regular duplicates that are not missing data

In [None]:
# duplicated(keep=False) flags EVERY member of a duplicate group, so dropping on
# that mask would delete the original row together with its copy and the person
# would vanish from the order. drop_duplicates(keep='first') removes only the
# extra copies of fully identical rows and keeps one of each.
# The index is left as-is (not renumbered), matching what an in-place drop does.
df = df.drop_duplicates(keep='first')

## What if one of the duplicated rows is missing the t-shirt size?  Then duplicated() will not find it.  Maybe we should check for duplicates in the 'name' column instead

In [None]:
df.name.duplicated(keep=False).sum()

### Verify that they are duplicates first

In [None]:
df[df.name.duplicated(keep=False)].sort_values(by='name')

### We want to delete the one that has a np.nan for t_shirt_size.

In [None]:
# to see the indexis that will be dropped
# df[df.name.duplicated(keep=False) & (df.t_shirt_size.isna())].index

In [None]:
# A row is redundant when its name appears more than once AND its size is missing:
# another row with the same name remains after this copy is removed.
is_repeated_name = df.name.duplicated(keep=False)
is_missing_size = df.t_shirt_size.isna()
redundant_labels = df.index[is_repeated_name & is_missing_size]
df.drop(redundant_labels, inplace=True)

## Duplicates are gone, now how to impute the missing fields

In [None]:
len(df[(df['t_shirt_size'].isnull())])

In [None]:
# how many match
def printstats(df):
    """Print how many rows have ``t_shirt_size`` equal to ``t_shirt_size_orig``.

    A missing (NaN) size never compares equal, so it counts as not correct.
    """
    total_rows = len(df)
    correct = df['t_shirt_size_orig'].eq(df['t_shirt_size']).sum()
    print(f'{correct} tshirt sizes are correct out of {total_rows} total')

### One way is to use SimpleImputer and assign the most frequent value (the mode) to all the missing values


In [None]:
df_med=df.copy()
df_med

In [None]:
df[df.t_shirt_size.isnull()]

In [None]:
df.t_shirt_size.value_counts(dropna=False)

In [None]:
from sklearn.impute import SimpleImputer

# 'most_frequent' (the mode) is used because it also works with string columns.
size_column = df_med[['t_shirt_size']]
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent', add_indicator=True)
# fit learns the most frequent size, transform fills the NaNs with it.
imputed = imp.fit_transform(size_column)
# add_indicator=True appends a missing-value indicator column to the output,
# so only column 0 (the imputed sizes) is written back.
df_med['t_shirt_size'] = imputed[:, 0]
# imp.transform(df_med[['t_shirt_size']])[:,0]

In [None]:
df_med

In [None]:
printstats(df_med)

### Another way is to find the mean weight for each t-shirt size, and then assign missing value t-shirt size based on weight
For each NaN, assign t-shirt size to closest mean

#### First calculate average weight for each t-shirt size

In [None]:
df_better = df.copy()

In [None]:
avgs = df_better.groupby('t_shirt_size').weight.mean()
avgs

In [None]:
#how many in each group
df_better.groupby('t_shirt_size').count()

#### Impute the value, replace any NaNs, and add a 1 in a column to indicate that this value was imputed 
<mark>The indicator column will inform an ML algorithm that this value was imputed</mark>

In [None]:
#map works on a single column; apply with axis=1 works row by row, which means we have access to the entire row
#apply can also return more than 1 value so that we can have an indicator value

def func(row, size_means=None):
    """Fill in a missing t-shirt size using the row's weight.

    A missing size is replaced by the size whose mean weight is closest to
    ``row.weight``; a size that is present is passed through unchanged.

    Parameters
    ----------
    row : pd.Series
        One row of the order table; must expose ``t_shirt_size`` and ``weight``.
    size_means : pd.Series, optional
        Mean weight per size, indexed by size label. When omitted, the
        notebook-level ``avgs`` is used, so ``df.apply(func, axis=1)`` keeps
        working unchanged.

    Returns
    -------
    pd.Series
        Two entries: ``t_shirt_size`` (original or imputed) and
        ``t_shirt_size_indicator`` (True only when the size was imputed).
    """
    labels = ['t_shirt_size', 't_shirt_size_indicator']

    # pd.isna recognises every missing value (np.nan, float('nan'), None).
    # The previous `is np.NaN` identity test only matched the one np.nan object
    # and fails outright on NumPy 2.0, where the np.NaN alias was removed.
    if not pd.isna(row.t_shirt_size):
        #its not missing, return what's there
        return pd.Series([row.t_shirt_size, False], index=labels)

    means = avgs if size_means is None else size_means

    # Which mean weight is this weight closest to? idxmin returns the label of
    # the smallest absolute difference (first one on ties).
    closest_size = (means - row.weight).abs().idxmin()
    return pd.Series([closest_size, True], index=labels)
# df_better['t_shirt_size_indicator']=False
df_better[['t_shirt_size','t_shirt_size_indicator']]=df_better.apply(func, axis=1)

In [None]:
printstats(df_better)

In [None]:
#see which ones it got wrong, look at the distributions in above plots
#it got them wrong because the weights were outliers
df_better[(df_better['t_shirt_size_indicator'])]

# Generate Slide for the lecture

In [None]:
def func(df, numb=5):
    """Return the first ``numb`` rows of ``df`` (all columns).

    NOTE(review): this reuses the name ``func`` and therefore shadows the
    row-imputation function defined earlier in the notebook.
    """
    first_rows = df.iloc[:numb]
    return first_rows
    
df.groupby('t_shirt_size', dropna=False).apply(func,numb=3)