In [None]:
import pickle
import os
import numpy as np
import pandas as pd
# pd.set_option('display.max_rows', None)
import matplotlib.pyplot as plt
import country_converter as coco
cc = coco.CountryConverter()

In [None]:
# Reference tables read once for the whole notebook. The names match the
# `weights=` / `grouping=` arguments of the regional aggregation call further down.
df_weights, df_region = [
    pd.read_csv(csv_path)
    for csv_path in ('../data/ref/parsed_intnt_pop.csv',
                     '../data/ref/parsed_country_region.csv')
]

In [None]:
class Transform:
    """Aggregate monthly per-country keyword pickles into one time-series table.

    Typical use: construct with a keyword, call ``aggFromPickles`` to populate
    ``self.df`` (one row per input file, ``date`` as the first column), then
    call ``lineplot``, ``toCSV`` or ``toPickle``.

    Requirements (module level):
        import os
        import pickle
        import numpy as np
        import pandas as pd
        import matplotlib.pyplot as plt
        cc = country_converter.CountryConverter()   # used by aggFromPickles
    """

    # Sentinel value of ``grouping`` meaning "aggregate all countries together".
    _GLOBAL_GROUPING = pd.DataFrame({'Global': [0, 0]})

    def __init__(self, keyword):
        """Remember the keyword; it is used in plot titles and output file names."""
        self.keyword = keyword

    def agg(self, df, kind, **kwargs):
        """Aggregate ``df`` with the method named by ``kind``.

        Parameters
        ----------
        df : pd.DataFrame
            Frame to aggregate.
        kind : str
            ``'arithmetic mean'`` or ``'weighted mean'``.
        **kwargs
            For ``'weighted mean'`` only: ``weights`` (frame with an
            ``intnt_pop`` column) and ``key`` (column name to merge on).
            ``df`` must then also contain a ``region`` column.

        Returns
        -------
        pd.Series
            Column means (``'arithmetic mean'``).
        pd.DataFrame
            One row per region (``'weighted mean'``).

        Raises
        ------
        ValueError
            If ``kind`` is not one of the supported names.
        """
        if kind == 'arithmetic mean':
            # numeric_only: string columns (country code, region) make mean()
            # raise on pandas >= 2; older versions dropped them silently.
            return df.mean(axis=0, numeric_only=True)

        if kind == 'weighted mean':
            temp = df.merge(kwargs['weights'], how='left', on=kwargs['key'])
            # Vectorised; the sum is computed once instead of once per row.
            temp['intnt_pop_norm'] = temp['intnt_pop'] / temp['intnt_pop'].sum()
            for col in df.convert_dtypes().select_dtypes(include=['int', 'float']).columns:
                temp[col] = temp[col] * temp['intnt_pop_norm']
            # NOTE(review): weights are normalised over *all* rows and the
            # weighted values are then averaged per region, so the result is
            # sum(x*w) / (n_region * sum_all(w)), not sum(x*w) / sum_region(w).
            # Left numerically unchanged here -- confirm the intended formula.
            grouped = (temp.drop(['intnt_pop_norm', 'intnt_pop'], axis=1)
                           .groupby('region')
                           .mean(numeric_only=True))
            return grouped

        raise ValueError("unsupported aggregation kind: '{}'".format(kind))

    def cleanColumns(self, df, grouping):
        """Return a tidied copy of ``df``: parsed and sorted dates, renamed columns.

        Column names have double quotes removed and spaces turned into
        underscores, are lower-cased, and their parts are reordered:
        ``brand_denom`` -> ``denom_brand`` when ``grouping == 'global'``,
        otherwise ``region_brand_denom`` -> ``denom_brand_region``.
        Rows are sorted by ``date`` and ``date`` becomes the first column.
        The input frame is not modified.

        Raises
        ------
        ValueError
            If a column name does not split into the expected number of
            ``_``-separated parts.
        """
        cleaned = df.copy()
        cleaned.columns = (cleaned.columns.str.replace('"', '', regex=False)
                                          .str.replace(' ', '_', regex=False))
        cleaned['date'] = pd.to_datetime(cleaned['date'])
        # Sort whole rows. Sorting only the date Series and assigning it back
        # can detach dates from the values they belong to.
        cleaned = cleaned.sort_values('date', kind='mergesort').reset_index(drop=True)

        # 'date' first, whatever position it had before.
        cleaned = cleaned[['date'] + [col for col in cleaned.columns if col != 'date']]

        new_cols = ['date']
        for col in cleaned.columns[1:]:
            if grouping == 'global':
                brand, denom = col.lower().split('_')
                new_cols.append(denom + '_' + brand)
            else:
                region, brand, denom = col.lower().split('_')
                new_cols.append(denom + '_' + brand + '_' + region)
        cleaned.columns = new_cols
        return cleaned

    def _loadMonth(self, path):
        """Load one pickle and convert its country column to ISO3 codes."""
        # NOTE(review): pickle.load can execute arbitrary code; only point this
        # at pickles produced by this project.
        with open(path, 'rb') as f:
            monthly = pickle.load(f)

        monthly = monthly.rename(columns={'geoName': 'country_code'})
        # .copy() so the assignment below does not write into a slice.
        monthly = monthly.loc[monthly['country_code'] != 'U.S. Outlying Islands', :].copy()
        monthly['country_code'] = cc.convert(names=monthly['country_code'].tolist(),
                                             to='ISO3',
                                             not_found='not there')
        # Drop names the converter could not resolve.
        return monthly.loc[monthly['country_code'] != 'not there', :]

    def _aggregateMonth(self, monthly, aggFunc, grouping, **kwargs):
        """Collapse one file's per-country rows into a single wide row.

        ``grouping`` is ``None`` for the global aggregate, otherwise a frame
        with ``country_code`` and ``region`` columns.
        """
        if grouping is None:
            # Skips the first column -- assumes it is country_code; TODO confirm
            # against the layout of the raw pickles.
            return pd.DataFrame(self.agg(monthly.iloc[:, 1:], aggFunc)).T

        merged = monthly.merge(grouping, on='country_code')
        if aggFunc == 'weighted mean':
            per_region = self.agg(merged, aggFunc,
                                  weights=kwargs['weights'], key=kwargs['key'])
        else:
            per_region = merged.groupby(['region']).apply(self.agg, aggFunc)

        long = per_region.reset_index().melt(id_vars='region')
        long['Group'] = long['region'] + '_' + long['variable']
        long = long.rename(columns={'value': aggFunc})
        return long[['Group', aggFunc]].set_index('Group').T

    def aggFromPickles(self, inputPath, aggFunc, grouping=pd.DataFrame({'Global': [0,0]}), **kwargs):
        """Walk ``inputPath``, aggregate every ``.pkl`` file and store the result.

        Parameters
        ----------
        inputPath : str
            Directory searched recursively. File names must start with
            ``<year>-<month>``; those two parts become the row's date.
        aggFunc : str
            Aggregation method passed to ``agg`` (e.g. ``'arithmetic mean'``).
        grouping : pd.DataFrame or None
            Frame with ``country_code`` and ``region`` columns to group by.
            The default sentinel frame (or ``None``) means one global aggregate.
        **kwargs
            ``weights`` and ``key``; required when ``aggFunc == 'weighted mean'``
            with a regional grouping.

        Side effects
        ------------
        Sets ``self.grouping`` (``'global'`` or ``'regional'``) and ``self.df``.

        Raises
        ------
        FileNotFoundError
            If no ``.pkl`` file is found under ``inputPath``.
        """
        is_global = grouping is None or grouping.equals(self._GLOBAL_GROUPING)
        # Set for both modes so cleanColumns never sees a missing or stale value.
        self.grouping = 'global' if is_global else 'regional'

        rows = []
        for root, _, files in os.walk(inputPath):
            for file in sorted(files):
                # Skip everything that is not a pickle; processing such a file
                # would reuse the previous file's data or fail on the first one.
                if not file.endswith(".pkl"):
                    continue
                # Strip the extension first so 'YYYY-MM.pkl' parses as well.
                parts = os.path.splitext(file)[0].split('-')
                year, month = parts[0], parts[1]

                monthly = self._loadMonth(os.path.join(root, file))
                row = self._aggregateMonth(monthly, aggFunc,
                                           None if is_global else grouping, **kwargs)
                row['date'] = year + '-' + month
                rows.append(row)

        if not rows:
            raise FileNotFoundError("no .pkl files found under '{}'".format(inputPath))

        # Single concat: DataFrame.append was removed in pandas 2.0 and was
        # quadratic when called in a loop.
        self.df = self.cleanColumns(pd.concat(rows), self.grouping)

    def lineplot(self):
        """Plot every column of ``self.df`` against its first column (the date)."""
        fig, ax = plt.subplots(figsize=(10,7))
        for i in range(1, len(self.df.columns)):
            ax.plot(self.df.iloc[:,0], self.df.iloc[:, i], label=self.df.columns[i])
        ax.set_ylim(bottom=0)
        ax.set_xlabel('Year')
        ax.set_ylabel('Mean')
        ax.set_title('Keyword: \'{}\''.format(self.keyword.capitalize()))
        ax.legend()

    def toPickle(self, outputPath, aggFunc='arimean', grouping='global'):
        """Pickle ``self.df`` to ``<outputPath>/<grouping>-<aggFunc>-<keyword>.pkl``.

        With the defaults the file name is ``global-arimean-<keyword>.pkl``.
        """
        fileName = '{}-{}-{}'.format(grouping, aggFunc, self.keyword)
        pathName = os.path.join(outputPath, '{}.pkl'.format(fileName))
        with open(pathName, 'wb') as f:
            pickle.dump(self.df, f)

    def toCSV(self, outputPath, aggFunc, grouping):
        """Write ``self.df`` to ``<outputPath>/<keyword>-<aggFunc>-<grouping>.csv`` without the index."""
        fileName = '{}-{}-{}'.format(self.keyword, aggFunc, grouping)
        pathName = os.path.join(outputPath, '{}.csv'.format(fileName))
        self.df.to_csv(pathName, index=False)

class Transform_keyword(Transform):
    """Transform whose input directory is derived from the keyword.

    Pickles are read from ``../data/raw/<keyword>``.
    """

    def __init__(self, keyword):
        super().__init__(keyword)

    def path(self):
        """Return the raw-data directory for this keyword."""
        return '../data/raw/{}'.format(self.keyword)

    def aggFromPickles(self, aggFunc='arithmetic mean', **kwargs):
        """Aggregate the pickles under ``self.path()``.

        Parameters
        ----------
        aggFunc : str
            Aggregation method forwarded to ``Transform.aggFromPickles``. The
            base method requires it; omitting it raised ``TypeError``.
            Defaults to ``'arithmetic mean'``.
        **kwargs
            Forwarded unchanged to the base method (e.g. ``grouping``,
            ``weights``, ``key``).
        """
        super().aggFromPickles(self.path(), aggFunc, **kwargs)

In [None]:
# Interactive walkthrough: a Transform for the 'mirrorless' keyword.
t = Transform('mirrorless')

In [None]:
# Global (all-country) arithmetic mean over the 'mirrorless' pickles; the
# result is stored on t.df.
t.aggFromPickles(inputPath='../data/raw/mirrorless', aggFunc='arithmetic mean')
# Regional variant, currently disabled -- extra arguments it would pass:
#                 grouping=df_region, weights=df_weights, key='country_code'

In [None]:
# Inspect the aggregated frame built by aggFromPickles.
t.df

In [None]:
# One line per aggregated column, plotted against the date column.
t.lineplot()

In [None]:
# Writes ../data/agged/mirrorless-arimean-global.csv (file name is keyword-aggFunc-grouping).
t.toCSV('../data/agged', 'arimean', 'global')

In [None]:
def main(keywords=('camera', 'dslr', 'mirrorless'), outputPath='../data/agged/'):
    """Aggregate, plot and export each keyword in turn.

    Parameters
    ----------
    keywords : iterable of str
        Keywords to process; each gets its own ``Transform_keyword``.
        Defaults to the three keywords this notebook has always processed.
    outputPath : str
        Directory the CSV files are written to.
    """
    for keyword in keywords:
        transform = Transform_keyword(keyword)
        transform.aggFromPickles()
        transform.lineplot()
        transform.toCSV(outputPath, 'arimean', 'global')

# In a notebook kernel __name__ is '__main__', so this runs when the cell is executed.
if __name__ == '__main__':
    main()