## Remove duplicates

In [1]:
from pandas import read_csv


In [6]:
# now load the dataset
data_frame = read_csv("IRIS.csv", header=None)

In [7]:
print(data_frame.shape)

(151, 5)


In [8]:
# calculate the duplicates present
duplicates = data_frame.duplicated()

In [9]:
# output the duplicates if there are any duplicates
print(duplicates.any())
# list all duplicate rows
print(data_frame[duplicates])

True
       0    1    2    3               4
35   4.9  3.1  1.5  0.1     Iris-setosa
38   4.9  3.1  1.5  0.1     Iris-setosa
143  5.8  2.7  5.1  1.9  Iris-virginica


In [10]:
data_frame.drop_duplicates(inplace=True)
print(data_frame.shape)

(148, 5)


## Impute the missing values

In [1]:
import numpy as np
from sklearn.impute import SimpleImputer
impute = SimpleImputer(missing_values=np.nan, strategy='mean')
impute.fit([[2, 5], [np.nan, 8], [4, 6]])
SimpleImputer()
X = [[np.nan, 2], [6, np.nan], [7, 6]]
print(impute.transform(X))

[[3.         2.        ]
 [6.         6.33333333]
 [7.         6.        ]]


In [None]:
In case of sparse metrices too, SimpleImputer works

In [2]:
import scipy.sparse as sp
matrix = sp.csc_matrix([[2, 4], [0, -2], [6, 2]])
impute = SimpleImputer(missing_values=-1, strategy='mean')
impute.fit(matrix)
SimpleImputer(missing_values=-1)
matrix_test = sp.csc_matrix([[-1, 2], [6, -1], [7, 6]])
print(impute.transform(matrix_test).toarray())

[[2.66666667 2.        ]
 [6.         1.33333333]
 [7.         6.        ]]


In [4]:
import pandas as pd
data_frame = pd.DataFrame([["New York", "New Delhi"],[np.nan, "Tokyo"],["New York", np.nan],["New York", "Tokyo"]], dtype="category")

impute = SimpleImputer(strategy="most_frequent")
print(impute.fit_transform(data_frame))


[['New York' 'New Delhi']
 ['New York' 'Tokyo']
 ['New York' 'Tokyo']
 ['New York' 'Tokyo']]


## Impute missing values using machine learning

In [57]:
import numpy as np
import pandas as pd

In [62]:
missing_dictionary = {'Variable_A': [200, 190, 90, 149, np.nan],
                     'Variable_B': [400, np.nan, 149, 200, 205],
                     'Variable_C': [200,149, np.nan, 155, 165],
                     'Variable_D': [200, np.nan, 90, 149,100],
                     'Variable_E': [200, 190, 90, 149, np.nan],}

In [63]:
missing_df = pd.DataFrame(missing_dictionary)

In [64]:
missing_df

Unnamed: 0,Variable_A,Variable_B,Variable_C,Variable_D,Variable_E
0,200.0,400.0,200.0,200.0,200.0
1,190.0,,149.0,,190.0
2,90.0,149.0,,90.0,90.0
3,149.0,200.0,155.0,149.0,149.0
4,,205.0,165.0,100.0,


In [67]:
from sklearn.impute import KNNImputer

In [69]:
missing_imputer = KNNImputer(n_neighbors=2)

In [70]:
imputed_df = missing_imputer.fit_transform(missing_df)

In [71]:
imputed_df

array([[200. , 400. , 200. , 200. , 200. ],
       [190. , 302.5, 149. , 150. , 190. ],
       [ 90. , 149. , 160. ,  90. ,  90. ],
       [149. , 200. , 155. , 149. , 149. ],
       [169.5, 205. , 165. , 100. , 169.5]])

## Data Imbalance using SMOTE

In [83]:
import pandas as pd
from imblearn.over_sampling import SMOTE

from imblearn.combine import  SMOTETomek

In [111]:
# Import data and create X, y
credit_card_data_set = pd.read_csv('creditcard.csv')


In [127]:
X = credit_card_data_set.iloc[:,:-1]
y = credit_card_data_set.iloc[:,-1].map({1:'Fraud', 0:'No Fraud'})

# Resample data
X_resampled, y_resampled = SMOTE(sampling_strategy={"Fraud":500}).fit_resample(X, y)
X_resampled = pd.DataFrame(X_resampled, columns=X.columns)

In [120]:
class_0_original = len(credit_card_data_set[credit_card_data_set.Class==0])

In [121]:
class_1_original = len(credit_card_data_set[credit_card_data_set.Class==1])

In [123]:
print(class_1_original/(class_0_original+class_1_original))

0.001727485630620034


In [128]:
sampled_0 = len(y_sampled[y_sampled==0])


In [129]:
sampled_1 = len(y_sampled[y_sampled==1])

In [130]:
print(sampled_1/(sampled_0+sampled_1))

0.5
