In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math
import os 
import vaex as vx

# Path to the copy of the dataset that contains missing values (the imputation input).
missingData="KDD/Incomplete Dataset/KDD_AE_1.csv"
# Path to the complete dataset, used as the reference when scoring the imputation.
originalData="KDD/KDD.csv"

In [2]:
# Directory the notebook is executed from; the imputed output file is placed here.
workingDirectory = os.getcwd()

In [3]:
# Build the output path: same file name as the input, placed in the working directory.
# os.path picks the correct separator for the current platform; the previous
# hard-coded '\\' only produced a valid path on Windows.
outputFileName=os.path.basename(missingData)
imputedFile=os.path.join(workingDirectory,outputFileName)

## Calculating the probability density of the sample

In [4]:
def normalProbablityDensity(x,mu,sigma):
    """Evaluate the normal (Gaussian) probability density at ``x``.

    Parameters
    ----------
    x : float
        Point at which the density is evaluated.
    mu : float
        Mean of the distribution.
    sigma : float
        Standard deviation of the distribution.

    Returns
    -------
    float
        ``exp(-((x-mu)/sigma)**2 / 2) / (sigma*sqrt(2*pi))``, or ``0.0`` when
        a scalar ``sigma`` is zero, negative or NaN (no valid density exists).
    """
    # NumPy scalars do not raise ZeroDivisionError on division by zero; they
    # produce inf/nan plus a RuntimeWarning. Guard explicitly so a degenerate
    # standard deviation yields 0 instead of NaN leaking into the sampler.
    # `not sigma>0` is also True for NaN. Array-valued sigma is left untouched.
    if np.ndim(sigma)==0 and not sigma>0:
        return 0.0
    deno=sigma*np.sqrt(2*np.pi)
    num=np.exp(-0.5*((x-mu)/sigma)**2)
    return num/deno

## Accepting and Rejecting the Sample

In [5]:
## Deciding whether to accept the sample or reject it
def isAccept(x,prevX,mu,sigma):
    """Decide whether a proposed sample replaces the previous one.

    The proposal ``x`` is accepted with probability ``min(1, p)``, where ``p``
    is the ratio of the normal density at ``x`` to the density at ``prevX``
    (both evaluated with mean ``mu`` and standard deviation ``sigma``).

    Parameters
    ----------
    x : float
        Proposed sample.
    prevX : float
        Sample currently held by the chain.
    mu, sigma : float
        Parameters of the target normal distribution.

    Returns
    -------
    bool
        True when the proposal is accepted, False otherwise.
    """
    newSample=normalProbablityDensity(x,mu,sigma)
    previousSample=normalProbablityDensity(prevX,mu,sigma)
    try:
        # NumPy scalars return inf/nan here instead of raising; silence the
        # accompanying RuntimeWarning and deal with the value explicitly below.
        with np.errstate(divide='ignore',invalid='ignore'):
            p=newSample/previousSample
    except ZeroDivisionError:
        p=0
    # min(1, nan) evaluates to 1, which would accept the proposal
    # unconditionally; an undefined ratio must be a rejection instead.
    if np.isnan(p):
        p=0
    U=np.random.uniform(0,1)
    acceptance=min(1,p)
    return bool(acceptance>U)

In [6]:
def getMean(column):
    """Return the mean of ``column`` as computed by ``pandas.Series.mean``."""
    values = pd.Series(column)
    return values.mean()

In [7]:
def getCategoricalAndNumericalColumnList(df):
    """Split ``df`` into a categorical frame and a numerical frame.

    Columns whose dtype is ``object`` go to the first returned frame, all
    others to the second. In both results each column is labelled with its
    integer position in ``df``, not with its original name.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(categorical, numerical)``.
    """
    catagoricalDataFrame, numericalDataFrame = pd.DataFrame(), pd.DataFrame()
    for position, name in enumerate(df.columns):
        series = df[name]
        # Route the column to whichever frame matches its dtype.
        target = catagoricalDataFrame if series.dtypes == object else numericalDataFrame
        target[position] = series
    return catagoricalDataFrame, numericalDataFrame

In [8]:
def imputeCategoricalColumns(df):
    """Fill missing values of every ``object`` column with that column's mode.

    ``df`` is modified in place and also returned. Non-object columns are left
    untouched. A column with no observed values has no mode and is skipped
    (previously this raised ``KeyError`` on ``mode[0]``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose categorical columns should be imputed.

    Returns
    -------
    pandas.DataFrame
        The same frame object, after imputation.
    """
    for column in df.columns:
        if df[column].dtype!=object:
            continue
        mode=df[column].mode()
        if mode.empty:
            # Nothing observed in this column, so there is no value to impute with.
            continue
        # When several values tie, mode() lists them all; take the first.
        df[column]=df[column].fillna(mode.iloc[0])
    return df

In [9]:
def calculateAE(catDataFrmOriginalSet,catDataFrmImputedSet):
    """Return the fraction of cells that are equal in the two frames.

    The frames are compared element-wise with ``==`` (so they must carry the
    same labels); the number of matching cells is divided by the total number
    of cells and rounded to four decimals.

    Parameters
    ----------
    catDataFrmOriginalSet : pandas.DataFrame
        Categorical columns of the reference data.
    catDataFrmImputedSet : pandas.DataFrame
        Categorical columns of the imputed data.

    Returns
    -------
    float
        Share of matching cells, in ``[0, 1]``.
    """
    comparedCatDF=catDataFrmOriginalSet==catDataFrmImputedSet
    # Summing a boolean array counts its True entries directly; no need to map
    # True/False to 1/0 first (that replace() call triggers pandas downcasting
    # deprecation warnings).
    sumOfAllValues=comparedCatDF.to_numpy().sum()
    nValues=comparedCatDF.size
    AE=round(sumOfAllValues/nValues,4)
    return AE

In [10]:
def calculateNrms(estimate,original):
    """Return the normalised root-mean-square difference between two arrays.

    Computed as ``sqrt(sum((estimate-original)**2)) / sqrt(sum(original**2))``,
    where the sums run over every element.
    """
    squaredError = (estimate - original) ** 2
    squaredReference = original ** 2
    errorNorm = math.sqrt(squaredError.sum())
    referenceNorm = math.sqrt(squaredReference.sum())
    return errorNorm / referenceNorm

In [11]:
# Calculate initial posterior distribution parameters
def calculatePosterior(missingValueDf):
    """Collect per-column distribution parameters and seed the sample chains.

    Columns with dtype ``object`` are skipped. For every other column the
    mean, standard deviation and ``[min, max]`` range are computed with the
    pandas reductions (which ignore missing values by default).

    Parameters
    ----------
    missingValueDf : pandas.DataFrame
        Data set containing missing values.

    Returns
    -------
    tuple of (dict, dict)
        ``paramtersMap`` maps column -> ``[mean, std, [min, max]]``;
        ``samples`` maps column -> ``[mean]``, the starting point of the chain.
    """
    paramtersMap={}
    samples={}
    for column in missingValueDf.columns:
        values=missingValueDf[column]
        if values.dtypes==object:
            continue
        mean=values.mean()
        std=values.std()
        # min and max were previously swapped, which stored the range as [max, min].
        minmax=[values.min(),values.max()]
        paramtersMap[column]=[mean,std,minmax]
        samples[column]=[mean]
    return paramtersMap,samples

In [12]:
def isDataSetContainingCategoricalValues(df):
    """Return True when at least one column of ``df`` has dtype ``object``.

    The check now inspects the ``df`` argument; it previously ignored it and
    read the notebook-global ``missingValueDf`` instead.
    """
    return bool((df.dtypes==object).any())

In [13]:
def saveDataNoCat(df,file):
    """Write ``df`` to ``file`` as CSV, omitting the index column."""
    df.to_csv(path_or_buf=file, index=False)

In [14]:
def saveDataWithCat(df,catDf,file):
    """Copy every column of ``catDf`` into ``df`` and write ``df`` as CSV.

    ``df`` is modified in place. The CSV is written without the index column.
    """
    # NOTE(review): columns are matched by label, so a catDf label that does
    # not already exist in df is appended as a new column instead of
    # overwriting one -- confirm the caller passes matching labels.
    for label in catDf.columns:
        df[label] = catDf[label]
    df.to_csv(path_or_buf=file, index=False)

### Markov chain Monte Carlo (MCMC) simulation

In [15]:
def MCMCSampling(missingValueDf,numberOfSamples,paramtersMap,samples):
    """Extend the per-column sample chains with accepted random proposals.

    For each of ``numberOfSamples`` rounds and each non-object column, a
    proposal is drawn from a normal distribution centred on the column's
    latest sample (standard deviation taken from ``paramtersMap``). The
    proposal is appended to the column's chain only if ``isAccept`` approves
    it, so a chain grows by at most ``numberOfSamples`` entries.

    Parameters
    ----------
    missingValueDf : pandas.DataFrame
        Data set; only its column names and dtypes are used here.
    numberOfSamples : int
        Number of proposal rounds.
    paramtersMap : dict
        column -> ``[mean, std, ...]`` as produced by ``calculatePosterior``.
    samples : dict
        column -> list of samples; must be non-empty per column. Mutated in place.

    Returns
    -------
    tuple of (dict, dict)
        ``(paramtersMap, samples)``, the same objects that were passed in.
    """
    # Column dtypes do not change while sampling, so select the numeric
    # columns once. The old loop also built an unused to_list() copy of every
    # column on every round, which dominated the run time on large frames.
    numericColumns=[column for column in missingValueDf.columns
                    if missingValueDf[column].dtypes!=object]
    for i in range(numberOfSamples):
        for column in numericColumns:
            pbDistribution=paramtersMap[column]
            mean=pbDistribution[0]
            std=pbDistribution[1]
            currentSample=samples[column][-1]
            drawRandomValue=np.random.normal(currentSample,std)
            if isAccept(drawRandomValue,currentSample,mean,std):
                samples[column].append(drawRandomValue)
    print("Simulation Done")
    return paramtersMap,samples

## Final Imputation

In [16]:
def finalImputation(originalDataWithMissingValues,samples):
    for index,column in enumerate(originalDataWithMissingValues.columns):
        if originalDataWithMissingValues[column].dtypes==object:
            continue
        totalSamples=pd.Series(samples[column][200:])
        if pd.isna(totalSamples.mean()):
            mu=0
        else:
            mu=round(totalSamples.mean(),1)
            if originalDataWithMissingValues[column].dtypes=="int64":
                mu=round(mu)
        colValue=originalDataWithMissingValues[column].to_list()
        for i in range(len(colValue)):
            isNan=pd.isna(colValue[i])
            if isNan:
                colValue[i]=mu
            else:
                continue
        originalDataWithMissingValues[column]=pd.Series(colValue)    
    return originalDataWithMissingValues

## Main script

In [17]:
# --- Load the incomplete data set through vaex and continue in pandas ---
# header=None is passed through, so presumably the first CSV row is treated as data.
missingValVx=vx.from_csv(missingData,convert=True,header=None)     
missingValueDf=missingValVx.to_pandas_df(index_name=None)  
print('done')
# missingValueDf=pd.read_excel(missingData,header=None)
# Deep copies: finalImputation fills its argument in place, so the frame used for
# parameter estimation (missingValueDf) stays untouched.
# NOTE(review): datawithMissingValue is not referenced again in this cell.
datawithMissingValue=missingValueDf.copy(deep=True)
originalDataWithMissingValues=missingValueDf.copy(deep=True)

# --- Estimate per-column parameters and run the sampler ---
isContainingCategorical=isDataSetContainingCategoricalValues(missingValueDf)        
paramtersMap,samples=calculatePosterior(missingValueDf)     

# NOTE(review): 100 proposal rounds is fewer than the 200-sample burn-in that
# finalImputation discards by default, so a chain can never outlast the burn-in;
# consider raising this. No random seed is set either, so runs are not reproducible.
numberOfSamples=100
paramtersMap,samples=MCMCSampling(missingValueDf,
                                  numberOfSamples,paramtersMap,samples)
# --- Impute numeric columns, then load the complete data set for scoring ---
originalDataWithMissingValues=finalImputation(originalDataWithMissingValues,samples)
oriDF=vx.from_csv(originalData,convert=True,header=None)
original=oriDF.to_pandas_df(index_name=None)
# Split both frames into categorical / numerical parts; categorical columns of the
# imputed frame are filled with their mode.
if isContainingCategorical:
    imputedCatDF,imputedNumbericalDF=getCategoricalAndNumericalColumnList(originalDataWithMissingValues)
    catDataFrmOriginalSet,originalNumbericalDF=getCategoricalAndNumericalColumnList(original)
    catDataFrmImputedSet=pd.DataFrame(imputeCategoricalColumns(imputedCatDF))
else:
    imputedNumbericalDF=originalDataWithMissingValues
    originalNumbericalDF=original    
    
# --- Score categorical columns (AE) and write the imputed data set ---
if isContainingCategorical:
    AE=calculateAE(catDataFrmOriginalSet,catDataFrmImputedSet)
    print(f'The AE value for the given data set is {AE}')
    saveDataWithCat(originalDataWithMissingValues,catDataFrmImputedSet,imputedFile)
else:
    print(f'There\'s no catagorical data present in the dataset') 
    saveDataNoCat(originalDataWithMissingValues,imputedFile)

# --- Score numerical columns (NRMS) ---
# `original` is rebound here from the reference DataFrame to a NumPy array of its
# numerical columns only.
original=originalNumbericalDF.values
imputed=imputedNumbericalDF.values
nrms=round(calculateNrms(imputed,original),2)
print(f'The NRMS value for the given data set is {nrms}')
print('The Output file is generate in the following path')
print(imputedFile)

done
Simulation Done


  totalSamples=pd.Series(samples[column][200:])


2 done
3 done
The AE value for the given data set is 0.9968
The NRMS value for the given data set is 0.0
The Output file is generate in the following path
C:\Users\abdul\KDD_AE_1.csv
