# Main fundamental data pre-processing pipeline

In [2]:
import numpy as np
import pandas as pd
from datetime import datetime,timedelta
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.pipeline import  Pipeline
from sklearn.pipeline import FeatureUnion

In [3]:
# Raw input of the tuning pipeline; DateEncoder reads its 'Date' column and
# DatasetTuner later drops 'ID', 'Date' and 'District'.
daily_df = pd.read_csv('../../../data/main_data/DateDistric.csv')
# Merged onto the data by WeatherTuner on the "month" and "index" columns.
info_df = pd.read_csv('../../../data/main_data/Info.csv')
# NOTE(review): not referenced by any other code in the visible part of this notebook.
district_dt_df = pd.read_csv('../../../data/geo_mode_data/test.csv')
# Merged onto the data by PoliceTuner on the "index" and "formal_date" columns.
police_df = pd.read_csv('../../../data/main_data/police_report.csv')

# Print the column/dtype summary of the police report table.
police_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5800 entries, 0 to 5799
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         5800 non-null   int64  
 1   formal_date   5800 non-null   object 
 2   locked_percn  5800 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 136.1+ KB


In [4]:
class DateEncoder(BaseEstimator,TransformerMixin):
    """Turn a ``month-day`` text column into a timestamp plus calendar features.

    Parameters
    ----------
    col_name : str, default 'Date'
        Column holding the ``<month>-<day>`` values to parse.
    year : str or int, default '2020'
        Year prepended to every value before parsing.

    ``transform`` adds the columns ``formal_date``, ``quarter``, ``date``
    (day of month), ``day_of_week`` and ``month``.
    """

    def __init__(self,col_name='Date',year='2020'):
        self.col_name = col_name
        self.year = year

    def fit(self,X,y=None):
        """Nothing is learned; return the estimator unchanged."""
        return self

    def transform(self,X):
        """Return a copy of ``X`` with the parsed date and its calendar features."""
        x = X.copy()
        # Use the configured column instead of a hard-coded 'Date'.
        x['formal_date'] = x.loc[:, self.col_name].apply(self.date_converter)
        parsed = x['formal_date'].dt
        x['quarter'] = parsed.quarter
        x['date'] = parsed.day
        x['day_of_week'] = parsed.dayofweek
        x['month'] = parsed.month
        return x

    def date_converter(self,x,year=None):
        """Parse one ``<month>-<day>`` value into a ``datetime``.

        ``year`` overrides the year configured on the instance; when it is
        ``None`` the instance's ``year`` is used. Only the first two
        '-'-separated parts of ``x`` are read.

        Raises ``IndexError`` when ``x`` contains no '-', and ``ValueError``
        when the assembled text is not a valid ``%Y/%m/%d`` date.
        """
        if year is None:
            year = self.year
        parts = str(x).split('-')
        text = str(year) + '/' + parts[0] + '/' + parts[1]
        return datetime.strptime(text, '%Y/%m/%d')

In [5]:
class DatasetTuner(BaseEstimator,TransformerMixin):
    """Rename the ``'Unnamed: 0'`` column to ``'index'`` and drop the raw
    ``'ID'``, ``'Date'`` and ``'District'`` columns."""

    def __init__(self):
        # Column order kept on the instance; transform does not apply it.
        self.col_order = ['index','formal_date','quarter','date','day_of_week','month','Suspected_Local','Suspected_Foreign','TotalInfected']

    def fit(self,X,y=None):
        """Nothing is learned; return the estimator unchanged."""
        return self

    def transform(self,X):
        """Return a new frame; ``X`` itself is left untouched."""
        renamed = X.rename(columns={'Unnamed: 0': 'index'})
        return renamed.drop(columns=['ID', 'Date', 'District'])

In [6]:
class WeatherTuner(BaseEstimator,TransformerMixin):
    """Join the module-level ``info_df`` table onto the data by month and index."""

    def __init__(self):
        # Column order kept on the instance; transform does not apply it.
        self.col_order = [
            'index', 'formal_date', 'quarter', 'date', 'day_of_week',
            'month', 'Suspected_Local', 'Suspected_Foreign',
            'temp', 'humidity', 'sun_hours', 'TotalInfected',
        ]

    def fit(self,X,y=None):
        """Nothing is learned; return the estimator unchanged."""
        return self

    def transform(self,X):
        """Return ``X`` merged with ``info_df`` (default inner join)."""
        join_keys = ["month", "index"]
        return X.merge(info_df, on=join_keys)

In [7]:
# Stage 1: parse the dates, drop the raw identifier columns, then merge info_df.
tuning_pipe = Pipeline(steps=[
    ('date_adder', DateEncoder()),
    ('pre-tuner', DatasetTuner()),
    ('weather-tuner', WeatherTuner()),
])

# Each step's fit is a no-op, so this simply applies the three transforms in order.
inter_res_1 = tuning_pipe.fit_transform(daily_df)

In [8]:
class ProvinceTuner(BaseEstimator,TransformerMixin):
    """Append a ``province`` column looked up from the ``index`` column."""

    def __init__(self):
        # Position k-1 holds the province value for 'index' value k (25 entries).
        self.province_dist_map = [1,1,1,2,2,2,3,3,4,4,4,5,5,6,6,6,7,7,8,8,9,9,9,9,9]

    def fit(self,X,y=None):
        """Nothing is learned; return the estimator unchanged."""
        return self

    def transform(self,X):
        """Return a copy of ``X`` with the ``province`` column added."""
        frame = X.copy()
        frame['province'] = frame['index'].apply(self.pr_ds_mapper)
        return frame

    def pr_ds_mapper(self,x):
        """Look up the province value for a single 1-based ``index`` value."""
        position = int(x) - 1
        return self.province_dist_map[position]

In [9]:
class NeighbourTuner(BaseEstimator,TransformerMixin):
    """Add ``neir_infected_cases``: the summed ``TotalInfected`` of each
    district's neighbours on the same ``formal_date``.

    The computation is positional. For every distinct date, the
    ``TotalInfected`` values are read in row order and the k-th value is
    treated as district k. The per-date results are then written back in the
    order the dates first appear. The output is therefore only aligned with
    the rows when each date contributes exactly one row per district, ordered
    by district id, and rows are grouped by date.
    NOTE(review): confirm the upstream frame guarantees that ordering.
    """

    def __init__(self):
        # District id -> ids of the districts counted as its neighbours.
        # NOTE(review): the map is not symmetric (2 lists 3 but 3 does not
        # list 2; 20 lists 5 but 5 does not list 20; 11 lists 20 but 20 does
        # not list 11) - confirm whether that is intended.
        self.srr = {1:[2,3,7,8],2:[1,3,8,17,18],3:[1,4,7],4:[3,5,7],5:[4,6,7],
           6:[5,7,13,15],7:[1,3,4,5,6,8,10,12,13],8:[1,2,7,9,10,17],
           9:[8,10,11,12,17],10:[7,8,9,12],11:[9,12,15,17,19,20],
           12:[7,9,10,11,15,13],13:[6,7,12,15],14:[15,16,20],15:[6,11,12,13,14,20],
           16:[14,19,20,22,25],17:[2,8,9,11,18,19],18:[2,17,19,24],
           19:[11,16,17,18,20,22,24],20:[5,14,15,16,19],21:[23],
           22:[16,19,24,25],23:[21,24,25],24:[18,19,22,23,25],25:[16,22,23,24]}

    def fit(self,X,y=None):
        """Nothing is learned; return the estimator unchanged."""
        return self

    def transform(self,X):
        """Return a copy of ``X`` with the ``neir_infected_cases`` column added.

        The working copy is also kept on ``self.x`` because the helper
        methods read the data from there.
        """
        self.x = X.copy()
        dates = self.x['formal_date'].unique()
        counts = self.repeater(dates).values
        shortfall = self.x.shape[0] - len(counts)
        if shortfall > 0:
            # Fewer computed values than rows: pad with zeros at the front.
            counts = [0] * shortfall + list(counts)

        self.x['neir_infected_cases'] = counts
        return self.x

    def get_date_infected(self,date):
        """Return ``[0, v1, ..., vN]`` with the ``TotalInfected`` values of ``date``.

        The leading 0 makes the list addressable by 1-based district id. If
        the date does not have exactly one value per district in ``srr``,
        an all-zero list of the same length is returned instead.
        """
        expected_len = len(self.srr) + 1
        same_day = self.x.loc[self.x['formal_date'] == date, 'TotalInfected']
        date_inf = [0] + list(same_day.values)
        if len(date_inf) != expected_len:
            date_inf = [0] * expected_len
        return date_inf

    def coverted_infected(self,date):
        """Return a Series (indexed by district id) of neighbour sums for ``date``."""
        infected = self.get_date_infected(date)
        districts = range(1, len(self.srr) + 1)
        sums = [sum(infected[j] for j in self.srr[i]) for i in districts]
        return pd.Series(data=sums, index=districts)

    def repeater(self,dates):
        """Concatenate the per-date neighbour sums in the order of ``dates``.

        Returns an empty Series when ``dates`` is empty.
        """
        per_date = [self.coverted_infected(d) for d in dates]
        if not per_date:
            return pd.Series(dtype='int64')
        # One concat over the whole list instead of growing a Series pairwise.
        return pd.concat(per_date)

In [10]:
class PoliceTuner(BaseEstimator,TransformerMixin):
    """Merge the module-level ``police_df`` onto the data and fix the column order."""

    def __init__(self):
        # Columns (and their order) of the frame returned by transform.
        self.col_order = ['index','formal_date','quarter','date','day_of_week',
                          'month','Suspected_Local','Suspected_Foreign',
                          'temp','humidity','sun_hours','province',
                          'neir_infected_cases','locked_percn','TotalInfected']

    def fit(self,X,y=None):
        """Nothing is learned; return the estimator unchanged."""
        return self

    def transform(self,X):
        """Return ``X`` joined with ``police_df`` on ``index`` and ``formal_date``.

        Missing ``locked_percn`` values are replaced by 0 and only the columns
        in ``col_order`` are returned, in that order.
        """
        x = X.copy()
        # Both key columns must share a dtype for the merge to match.
        # NOTE(review): this rewrites the module-level police_df in place.
        police_df['formal_date'] = police_df['formal_date'].astype(x['formal_date'].dtype)
        x = x.merge(police_df, on=["index","formal_date"])
        # fillna returns a new Series; assign it back so the fill takes effect.
        x['locked_percn'] = x['locked_percn'].fillna(0)
        return x.loc[:, self.col_order]

In [11]:
# Stage 2: add the province column, the neighbour infection sums and the police data.
dist_pipe = Pipeline(steps=[
    ('province_tuner', ProvinceTuner()),
    ('neighbour_tuner', NeighbourTuner()),
    ('police_tuner', PoliceTuner()),
])

inter_res_2 = dist_pipe.fit_transform(inter_res_1)

In [12]:
# Order the rows by date and renumber them from 0. The frame already has an
# 'index' column, so the old row labels are discarded rather than kept.
inter_res_2 = (
    inter_res_2
    .sort_values(by='formal_date', ascending=True)
    .reset_index(drop=True)
)
inter_res_2.head()

# Replace every remaining missing value, in any column, with 0.5.
inter_res_2 = inter_res_2.fillna(value=0.5)
inter_res_2.isna().sum()

Unnamed: 0,index,formal_date,quarter,date,day_of_week,month,Suspected_Local,Suspected_Foreign,temp,humidity,sun_hours,province,neir_infected_cases,locked_percn,TotalInfected
0,1,2020-03-31,1,31,1,3,41,1,28.5,81.0,7.15,1,34,1.0,11
1,25,2020-03-31,1,31,1,3,0,0,30.0,78.0,6.15,9,1,1.0,0
2,24,2020-03-31,1,31,1,3,0,0,29.5,82.0,5.74,9,1,1.0,0
3,23,2020-03-31,1,31,1,3,0,0,30.0,81.0,4.58,9,0,1.0,0
4,22,2020-03-31,1,31,1,3,0,0,31.0,81.0,4.87,9,1,1.0,0


index                  0
formal_date            0
quarter                0
date                   0
day_of_week            0
month                  0
Suspected_Local        0
Suspected_Foreign      0
temp                   0
humidity               0
sun_hours              0
province               0
neir_infected_cases    0
locked_percn           0
TotalInfected          0
dtype: int64

In [13]:
# Write the processed table to CSV without the row labels.
inter_res_2.to_csv('../../../data/main_data/final.csv',index=False)

Test-set generation code

In [14]:
# One row per 'index' value 1..25, all sharing a single timestamp taken
# ten days before the moment this cell runs.
test_df_1 = pd.DataFrame(data={
    'index': range(1, 26),
    'formal_date': datetime.now() - timedelta(days=10),
})
# Calendar features derived from that timestamp.
test_df_1 = test_df_1.assign(
    quarter=test_df_1['formal_date'].dt.quarter,
    date=test_df_1['formal_date'].dt.day,
    day_of_week=test_df_1['formal_date'].dt.dayofweek,
    month=test_df_1['formal_date'].dt.month,
)
test_df_1.head()

Unnamed: 0,index,formal_date,quarter,date,day_of_week,month
0,1,2020-10-23 10:32:56.822982,4,23,4,10
1,2,2020-10-23 10:32:56.822982,4,23,4,10
2,3,2020-10-23 10:32:56.822982,4,23,4,10
3,4,2020-10-23 10:32:56.822982,4,23,4,10
4,5,2020-10-23 10:32:56.822982,4,23,4,10


In [28]:
class TestTuner(BaseEstimator,TransformerMixin):
    """Generate the skeleton frame used for prediction.

    Parameters
    ----------
    days_back : int, default 10
        How many days before "now" the generated ``formal_date`` lies.
    n_districts : int, default 25
        Number of rows to generate; ``index`` runs from 1 to this value.
    """

    def __init__(self,days_back=10,n_districts=25):
        self.days_back = days_back
        self.n_districts = n_districts

    def fit(self,X,y=None):
        """Nothing is learned; return the estimator unchanged."""
        return self

    def transform(self,X):
        """Build and return the skeleton frame; ``X`` is ignored.

        The frame has the columns ``index``, ``formal_date``, ``quarter``,
        ``date`` (day of month), ``day_of_week`` and ``month``.
        """
        # NOTE(review): formal_date keeps the time of day, whereas DateEncoder
        # produces midnight timestamps; an equality match on formal_date
        # against such data would find nothing - confirm this is intended.
        stamp = datetime.now() - timedelta(days=self.days_back)
        frame = pd.DataFrame(data={
            'index': range(1, self.n_districts + 1),
            'formal_date': stamp,
        })
        parsed = frame['formal_date'].dt
        frame['quarter'] = parsed.quarter
        frame['date'] = parsed.day
        frame['day_of_week'] = parsed.dayofweek
        frame['month'] = parsed.month
        return frame

In [29]:
class YesterdayTuner(BaseEstimator,TransformerMixin):
    """Look up the rows of ``source`` that share the first row's ``formal_date``.

    Parameters
    ----------
    source : DataFrame
        Table with a ``formal_date`` column that is filtered during ``fit``.
    """

    def __init__(self,source):
        self.source = source

    def fit(self,X,y=None):
        """Record the reference date and the matching rows of ``source``.

        Sets ``day_before_`` to the ``formal_date`` of the row labelled 0 in
        ``X`` and ``filtered_df_`` to the rows of ``source`` with that date.
        Returns ``self`` so the estimator works with ``fit_transform`` and
        inside a Pipeline.
        """
        day_before = X.at[0,'formal_date']
        self.day_before_ = day_before
        # NOTE(review): transform does not use this yet.
        self.filtered_df_ = self.source[self.source['formal_date'] == day_before]
        return self

    def transform(self,X):
        """Return an unmodified copy of ``X``."""
        return X.copy()

    # Keep the original, misspelled method name working for existing callers.
    tranform = transform

In [30]:
# Prediction-time pipeline: build the skeleton frame, merge info_df,
# then add the province column.
test_pipe = Pipeline(steps=[
    ('test_tuner', TestTuner()),
    ('weather-tuner', WeatherTuner()),
    ('province_tuner', ProvinceTuner()),
])

In [31]:
# TestTuner ignores its input and builds its own frame, so an empty DataFrame suffices.
test_pipe.fit_transform(pd.DataFrame())

Unnamed: 0,index,formal_date,quarter,date,day_of_week,month,temp,humidity,sun_hours,province
0,1,2020-10-23 10:37:04.366906,4,23,4,10,27.0,,4.35,1
1,2,2020-10-23 10:37:04.366906,4,23,4,10,28.0,,5.03,1
2,3,2020-10-23 10:37:04.366906,4,23,4,10,27.0,,4.35,1
3,4,2020-10-23 10:37:04.366906,4,23,4,10,27.0,,4.3,2
4,5,2020-10-23 10:37:04.366906,4,23,4,10,27.0,,4.35,2
5,6,2020-10-23 10:37:04.366906,4,23,4,10,27.0,,5.95,2
6,7,2020-10-23 10:37:04.366906,4,23,4,10,27.0,,6.04,3
7,8,2020-10-23 10:37:04.366906,4,23,4,10,26.0,,6.08,3
8,9,2020-10-23 10:37:04.366906,4,23,4,10,25.0,,5.32,4
9,10,2020-10-23 10:37:04.366906,4,23,4,10,16.0,,5.77,4
