In [1]:
import time
import pandas as pd
import numpy as np
import multiprocessing as mp

import datetime as dt
from sklearn import preprocessing

In [2]:
from fcleaning import (save_to_csv,
                       reduce_mem_usage,
                       EmptyElim,
                       OutlDetect,                       
)
from fencoding import (FEncoding,
                       FImputation
                       )

In [3]:
X = pd.DataFrame(np.random.random((1000,1000)))
X = reduce_mem_usage(X)


 Memory usage of dataframe is 7.63 MB

 Memory usage after optimization is: 1.91 MB

 Memory usage decreased by 75.0%


In [4]:
X = pd.DataFrame([[1,2, np.nan, np.nan, None,   None], 
                  [3,4, np.nan, np.nan, None,   None], 
                  [5,6, 7,      np.nan, np.nan, None]])
X

Unnamed: 0,0,1,2,3,4,5
0,1,2,,,,
1,3,4,,,,
2,5,6,7.0,,,


In [5]:
emptyelim = EmptyElim(n_jobs=-1, chunks = None)

# emptyelim.fit(X)
# XX = emptyelim.transform(X)
# XX

XX = emptyelim.fit_transform(X)
XX


 col_names: {3: [nan], 4: [nan], 5: [None]}


Unnamed: 0,0,1,2
0,1,2,
1,3,4,
2,5,6,7.0


In [6]:
save_to_csv(XX, rest_columns=None, path=None)


 Successfully saved to trial_10272020-09:46.csv


In [7]:
pd.read_csv('trial_10272020-07:25.csv')

Unnamed: 0,0,1,2,3
0,1.0,2.0,,14.75
1,3.0,4.0,,10.0
2,,,2.0,4.75
3,4.75,2.0,,8.0
4,0.1,3.0,2.0,
5,1.0,1.0,2.0,11.0
6,1.0,1.0,2.0,11.0


In [8]:
X=pd.DataFrame([[1,2, np.nan, 1000], [3,4, np.nan, 10], [np.nan, np.nan, 2, 1], [10000, 2, np.nan, 8], [0.1, 3,2], [1,1,2, 11], [1,1,2, 11]])
X

Unnamed: 0,0,1,2,3
0,1.0,2.0,,1000.0
1,3.0,4.0,,10.0
2,,,2.0,1.0
3,10000.0,2.0,,8.0
4,0.1,3.0,2.0,
5,1.0,1.0,2.0,11.0
6,1.0,1.0,2.0,11.0


In [9]:
outldetect = OutlDetect(outliers_detection_technique='iqr_proximity_rule', n_jobs = -1, 
                 chunks = None)
# outldetect.fit(X)
# XX = outldetect.transform(X)
# XX

XX = outldetect.fit_transform(X)
save_to_csv(XX, rest_columns=None, path=None)

XX


 col_outl_info (upper, lower) bounds: {0: (-1.25, 4.75), 1: (-1.0, 5.0), 2: (2.0, 2.0), 3: (4.75, 14.75)}

 Successfully saved to trial_10272020-09:46.csv


Unnamed: 0,0,1,2,3
0,1.0,2.0,,14.75
1,3.0,4.0,,10.0
2,,,2.0,4.75
3,4.75,2.0,,8.0
4,0.1,3.0,2.0,
5,1.0,1.0,2.0,11.0
6,1.0,1.0,2.0,11.0


In [10]:
X = pd.DataFrame([
              ['a', 1, 1.5], 
              [None, None, None], 
              ['c', 2, 2.5], 
              [np.nan, np.nan, np.nan]
              ])
X

Unnamed: 0,0,1,2
0,a,1.0,1.5
1,,,
2,c,2.0,2.5
3,,,


In [11]:
fencoding = FEncoding(n_jobs=-1)
fencoding.initialize_types(X)


 1 has type float64 and number of unique values: 3, will be considered as a categorical 



{'categor_columns': [0, 1], 'numer_columns': [2], 'time_columns': []}

In [12]:
XX = fencoding.bucket_numerical(X, n_bins=5, columns_to_buck = 'all_numerical', 
                         drop_current = False)
XX


 1 has type float64 and number of unique values: 3, will be considered as a categorical 


 2 bucketing ...

 2 has keypoints: [nan nan nan nan nan] , and can not be bucketed.


Unnamed: 0,0,1,2
0,a,1.0,1.5
1,,,
2,c,2.0,2.5
3,,,


In [13]:
XX = fencoding.encode_categor(X, method = 'OrdinalEncoder')
XX


 1 has type float64 and number of unique values: 3, will be considered as a categorical 



Unnamed: 0,2,0,1
0,1.5,0.0,0.0
1,,,
2,2.5,1.0,1.0
3,,,


In [14]:
XX = fencoding.encode_categor(X, method = 'OneHotEncoder')
XX


 1 has type float64 and number of unique values: 3, will be considered as a categorical 



Unnamed: 0,2,0_a,0_c,1_1.0,1_2.0
0,1.5,1,0,1,0
1,,0,0,0,0
2,2.5,0,1,0,1
3,,0,0,0,0


In [15]:
X = pd.DataFrame([
'Jan 19, 1990',
'January 19, 1990',
'Jan 19,1990',
'01/19/1990',
'01/19/90',
'1990',
'Jan 1990',
'01.02.2000',
'2000.02.01',
'01-02-2000',
'2111-01-01 12:48:20',
'123',
'abs 123', 
1339521878.04,
'1339521878.04'
]).T
X = X.append(X)
X
fencoding = FEncoding(n_jobs=1)

In [16]:
fencoding.initialize_types(X)

{'categor_columns': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
 'numer_columns': [],
 'time_columns': []}

In [17]:
XX = fencoding.date_replace(X)
XX


 time_columns: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 14]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1990-01-19,1990-01-19,1990-01-19,1990-01-19,1990-01-19,1990-01-01,1990-01-01,2000-02-01,2000-02-01,2000-02-01,2111-01-01,123,abs 123,2012-06-12,2012-06-12
0,1990-01-19,1990-01-19,1990-01-19,1990-01-19,1990-01-19,1990-01-01,1990-01-01,2000-02-01,2000-02-01,2000-02-01,2111-01-01,123,abs 123,2012-06-12,2012-06-12


In [18]:
fencoding.initialize_types(XX)

{'categor_columns': [11, 12],
 'numer_columns': [],
 'time_columns': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 14]}

In [19]:
fencoding.encode_time(XX, drop_current = True)


 0 was encoded from date

 1 was encoded from date

 2 was encoded from date

 3 was encoded from date

 4 was encoded from date

 5 was encoded from date

 6 was encoded from date

 7 was encoded from date

 8 was encoded from date

 9 was encoded from date

 10 was encoded from date

 13 was encoded from date

 14 was encoded from date


Unnamed: 0,11,12,0_year,0_month,0_day,1_year,1_month,1_day,2_year,2_month,2_day,3_year,3_month,3_day,4_year,4_month,4_day,5_year,5_month,5_day,6_year,6_month,6_day,7_year,7_month,7_day,8_year,8_month,8_day,9_year,9_month,9_day,10_year,10_month,10_day,13_year,13_month,13_day,14_year,14_month,14_day
0,123,abs 123,1990,1,19,1990,1,19,1990,1,19,1990,1,19,1990,1,19,1990,1,1,1990,1,1,2000,2,1,2000,2,1,2000,2,1,2111,1,1,2012,6,12,2012,6,12
0,123,abs 123,1990,1,19,1990,1,19,1990,1,19,1990,1,19,1990,1,19,1990,1,1,1990,1,1,2000,2,1,2000,2,1,2000,2,1,2111,1,1,2012,6,12,2012,6,12


In [20]:
X = pd.DataFrame([
              [1, 1.5], 
              [np.nan, np.nan], 
              [2, 2.5], 
              [ np.nan, 1]
              ])
X

Unnamed: 0,0,1
0,1.0,1.5
1,,
2,2.0,2.5
3,,1.0


In [21]:
fimputation = FImputation('regression-based', 
                          fill_with_value = None, 
                          n_jobs = -1,
                          )
XX = fimputation.impute(X)


 0 has type float64 and number of unique values: 3, will be considered as a categorical 



In [22]:
XX

Unnamed: 0,1,0
0,1.5,0.0
1,1.666667,0.0
2,2.5,1.0
3,1.0,0.0


In [23]:
fimputation = FImputation('tree-based', 
                          fill_with_value = 'extreme_values', 
                          n_jobs = -1,
                          )
XX = fimputation.impute(X)


 0 has type float64 and number of unique values: 3, will be considered as a categorical 



In [24]:
XX

Unnamed: 0,1,0
0,1.5,0.0
1,2.5,1.0
2,2.5,1.0
3,1.0,1.0


In [25]:
fimputation = FImputation('tree-based', 
                          fill_with_value = 'zeros', 
                          n_jobs = -1,
                          )
XX = fimputation.impute(X)


 0 has type float64 and number of unique values: 3, will be considered as a categorical 



In [26]:
XX

Unnamed: 0,1,0
0,1.5,0.0
1,0.0,0.0
2,2.5,1.0
3,1.0,0.0
