In [1]:
import csv

In [2]:
import pandas as pd 
import numpy as np 
import scipy as sp 

In [3]:
# Note: date arithmetic in the company class relies on pandas (pd.to_datetime / pd.Timedelta) rather than these stdlib modules.
import datetime 
import time 

In [4]:
class company():
    """Reader that resamples one ticker's CSV data onto a regular time grid.

    Files are looked up as ``<data_dir>/<category>/<ticker>_<category>.csv``
    for the categories 'price', 'balance', 'cashflow', 'financials',
    'valuation' and 'esg'.  On construction the requested [start, end] window
    is clamped to the dates present in the price file and ``time_series`` (a
    numpy array of timestamps spaced ``frequency`` apart) is generated.

    A feature is addressed by a ``(category, feature_name)`` tuple, e.g.
    ``('price', 'Close')`` or ``('balance', 'CurrentAssets')``.
    """

    # Default analysis window and grid spacing; instances override these via
    # constrain_time_range() / update_time_range().
    start = pd.to_datetime('1970-01-01',format='%Y-%m-%d')
    end = pd.to_datetime('2023-11-21',format='%Y-%m-%d')
    frequency = pd.Timedelta(days=1)

    # Root folder holding one sub-folder per data category.  Override on the
    # class or on an instance to read from a different location.
    data_dir = '../data'

    def __init__(self,ticker,autofill=True):
        """Bind the instance to ``ticker`` and build the default time grid.

        ``autofill`` is accepted for interface compatibility; it is currently
        not used by any method of this class.
        """
        self.ticker = ticker
        self.constrain_time_range()
        self.generate_time_series()

    def __repr__(self):
        return self.ticker

    def __str__(self):
        return "Company Class :{}".format(self.ticker)

    # ------------------------------------------------------------------
    # Time grid
    # ------------------------------------------------------------------
    def update_time_range(self,start,end,frequency) :
        """Set a new window and spacing, then rebuild ``time_series``.

        start, end : date strings in '%Y-%m-%d' format.
        frequency  : grid spacing in days.
        """
        self.start = pd.to_datetime(start,format='%Y-%m-%d')
        self.end = pd.to_datetime(end,format='%Y-%m-%d')
        self.frequency = pd.Timedelta(days=frequency)
        self.constrain_time_range()
        self.generate_time_series()

    def constrain_time_range(self):
        """Clamp ``start``/``end`` to the date span found in the price file."""
        prices = pd.read_csv(self._csv_path('price'))
        referdate = pd.to_datetime(prices['Date'], format='%Y-%m-%d')

        # Series.min()/max() return pd.Timestamp, so start/end keep one type
        # whether or not they get clamped.
        price_start = referdate.min()
        price_end = referdate.max()

        if self.start < price_start :
            self.start = price_start

        if self.end > price_end :
            self.end = price_end

    def generate_time_series(self):
        """Build ``time_series``: start, start+frequency, ... up to ``end``."""
        data_points_number = int((self.end - self.start)/(self.frequency)) +1
        self.time_series = np.array([ x*self.frequency + self.start for x in range(data_points_number) ])

    # ------------------------------------------------------------------
    # Placeholders (not implemented yet)
    # ------------------------------------------------------------------
    # To verify the integrity of the data ....
    def feature_list(self):
        pass

    def print_feature_list(self):
        pass

    def check_data(self):
        pass

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _csv_path(self, kind):
        """Return the CSV path of data category ``kind`` for this ticker."""
        return '{}/{}/{}_{}.csv'.format(self.data_dir, kind, self.ticker, kind)

    def _window_mask(self, referdate):
        """Boolean numpy mask of the dates lying inside [start, end]."""
        return np.array( (referdate <= self.end) & (referdate >= self.start) )

    def _resample_uniform(self, dates, values):
        """Spread ``values`` evenly over ``time_series`` (uniform choice).

        The samples are first put in chronological order (stable sort on
        ``dates``) so that grid point 0 receives the earliest sample even when
        the file stores the newest sample first.  Grid point ``k`` then takes
        sample ``int(k * len(values) / len(time_series))``.

        Raises IndexError when there is a non-empty grid but no sample.
        """
        dates = np.array(dates)
        values = np.array(values)
        n_points = len(self.time_series)
        if n_points > 0 and len(values) == 0:
            raise IndexError(
                "No data for {} between {} and {}".format(self.ticker, self.start, self.end))

        chronological = np.argsort(dates, kind='stable')
        values = values[chronological]

        picks = np.array( np.arange(n_points) * len(values) / n_points ).astype(int)
        return np.array(values[picks])

    @staticmethod
    def _to_float(value):
        """Convert a cell such as '122,659,000,000' (or a number) to float."""
        # str() first: pandas may already have parsed the cell as a number or
        # NaN, which has no .replace().
        return float(str(value).replace(',',''))

    def _read_single_statement_data(self, kind, target, first_date_col):
        """Shared reader for the row-per-feature statement files.

        kind           : category / folder name ('balance', 'cashflow', ...).
        target         : (category, feature_name); the feature is matched
                         against the stripped 'name' column.
        first_date_col : index of the first column whose header is a
                         '%m/%d/%Y' date.

        Returns a list of floats, one per entry of ``time_series``.
        Raises KeyError when the feature name is not present in the file.
        """
        temp = pd.read_csv(self._csv_path(kind))

        # Locate the first row whose (stripped) name matches the feature.
        row_names = temp['name'].astype(str).str.strip()
        matches = np.flatnonzero(np.array(row_names == target[1]))
        if len(matches) == 0:
            raise KeyError(
                "Feature '{}' not found in {}".format(target[1], self._csv_path(kind)))
        target_row = int(matches[0])

        # Column headers are the report dates; keep those inside the window.
        referdate = pd.to_datetime( np.array(temp.columns[first_date_col:]) ,format='%m/%d/%Y' )
        in_window = self._window_mask(referdate)
        temp_date = referdate[in_window]
        temp_data = np.array(temp.iloc[target_row, first_date_col:])[in_window]

        return_data = self._resample_uniform(temp_date, temp_data)
        return [ self._to_float(v) for v in return_data ]

    # ------------------------------------------------------------------
    # Public readers
    # ------------------------------------------------------------------
    def read_data(self,target_list):
        """Read several features and return ``(time_series, query_result)``.

        With exactly one target, ``query_result`` is the 1-D result of
        read_single_data(); otherwise it is a float array of shape
        ``(len(target_list), len(time_series))`` with one row per target.
        """
        if len(target_list) == 1 :
            # Kept 1-D on purpose: callers rely on the single-feature result
            # not being wrapped in a (1, n) array.
            query_result = self.read_single_data(target_list[0])
        else :
            query_result = np.zeros((len(target_list), len(self.time_series) ))
            for i in range(len(target_list)):
                query_result[i]= self.read_single_data(target_list[i])

        return (self.time_series,query_result)

    def read_single_data(self,target):
        """Dispatch ``target = (category, feature_name)`` to its reader.

        For an unknown category a warning is printed and None is returned.
        """
        readers = {
            'price': self.read_single_price_data,
            'balance': self.read_single_balance_data,
            'cashflow': self.read_single_cashflow_data,
            'esg': self.read_single_esg_data,
            'financials': self.read_single_financials_data,
            'valuation': self.read_single_valuation_data,
        }
        reader = readers.get(target[0])
        if reader is None:
            print("Warning, make sure that you type the right feature name")
            return None
        return reader(target)

    def read_single_price_data(self,target):
        """Return column ``target[1]`` of the price file, resampled onto
        ``time_series`` (numpy array)."""
        temp = pd.read_csv(self._csv_path('price'))

        # Keep only the rows whose date lies inside the window.
        referdate = pd.to_datetime( temp['Date'], format='%Y-%m-%d')
        in_window = self._window_mask(referdate)
        temp_date = referdate[in_window]
        temp_data = np.array(temp[target[1]][in_window])

        return self._resample_uniform(temp_date, temp_data)

    def read_single_balance_data(self,target):
        """Balance-sheet feature as a list of floats (dates start at column 1)."""
        return self._read_single_statement_data('balance', target, 1)

    def read_single_cashflow_data(self,target):
        """Cash-flow feature as a list of floats (dates start at column 2)."""
        return self._read_single_statement_data('cashflow', target, 2)

    def read_single_financials_data(self,target):
        """Financials feature as a list of floats (dates start at column 2)."""
        return self._read_single_statement_data('financials', target, 2)

    def read_single_valuation_data(self,target):
        """Valuation feature as a list of floats (dates start at column 2)."""
        return self._read_single_statement_data('valuation', target, 2)

    def read_single_esg_data(self,target):
        """Return the 'scorevalue' entries whose 'aspectname' equals
        ``target[1]``, resampled onto ``time_series`` (numpy array)."""
        temp = pd.read_csv(self._csv_path('esg'))
        target_row = np.where(np.array(temp['aspectname'])== target[1])[0]
        selected = temp.iloc[target_row]

        # Keep only the scores dated inside the window.
        referdate = pd.to_datetime( np.array(selected['scoredate']) ,format='%Y-%m-%d' )
        in_window = self._window_mask(referdate)
        temp_date = referdate[in_window]
        temp_data = np.array(selected['scorevalue'])[in_window]

        return self._resample_uniform(temp_date, temp_data)

    def plot_single_data(self,target):
        pass

In [13]:
# End-to-end check of the company class: build the reader for AAPL, set the
# window, and query one price, one balance-sheet and one ESG feature.
# The wall-clock timestamps before/after are recorded but not printed here.
start_run_time = datetime.datetime.now() 
a= company('AAPL')
a.update_time_range('1970-01-01','2023-11-21',1)
# read_data returns (time_series, query_result); x[0] is the time grid.
x= a.read_data([('price','Close'),('balance','CurrentAssets'),('esg','S&P Global ESG Score')])
end_run_time = datetime.datetime.now() 
# Number of points on the time grid.
print(len(x[0]))

14938


In [113]:
target_row

array([  1, 124, 247, 378, 509, 638, 767, 889])

In [118]:
        # The temp is to save the readout data 
        temp = pd.read_csv('../data/price/{}_price.csv'.format(self.ticker))
        
        # Filter the data within the interesting time  area. 
        referdate = pd.to_datetime( temp['Date'], format='%Y-%m-%d') 

NameError: name 'self' is not defined

In [120]:
temp = pd.read_csv('../data/price/{}_price.csv'.format("AAPL"))

In [121]:
referdate = pd.to_datetime( temp['Date'], format='%Y-%m-%d') 

In [124]:
np.min(np.array(referdate)) < referdate[1]

True

In [114]:
temp.iloc[target_row]

1       0.0
124     0.0
247     0.0
378     0.0
509    90.0
638    90.0
767    90.0
889    90.0
Name: scorevalue, dtype: float64

In [92]:
int(x.replace(',',''))

AttributeError: 'numpy.ndarray' object has no attribute 'replace'

In [93]:
np.char.replace(x,',','')

TypeError: string operation on non-string array

In [88]:
a.read_single_data(('balance','CurrentAssets')).astype(int)

ValueError: invalid literal for int() with base 10: '122,659,000,000'

In [86]:
len([('price','Close'),('balance','CurrentAssets')])

2

In [105]:
temp = pd.read_csv("../data/esg/A_esg.csv")
temp.iloc[x]['scoredate']

1      2013-09-12
124    2013-09-12
247    2014-09-11
378    2014-09-11
509    2015-09-10
638    2015-09-10
767    2016-09-08
889    2016-09-08
Name: scoredate, dtype: object

In [81]:
temp.iloc[0]

institutionid                       4075849
scoredate                        2013-09-12
scoretype              S&P Global ESG Score
aspectname             S&P Global ESG Score
scorevalue                             61.0
ticker                                    A
companyname      Agilent Technologies, Inc.
Name: 0, dtype: object

In [106]:
target_row = np.where(np.array(temp['aspectname'])== 'Climate Change Governance')[0]

In [110]:
pd.to_datetime( np.array(temp.iloc[target_row]['scoredate']) ,format='%Y-%m-%d' )[0]

Timestamp('2013-09-12 00:00:00')

In [108]:
np.array(temp.iloc[target_row]['scoredate'])

array(['2013-09-12', '2013-09-12', '2014-09-11', '2014-09-11',
       '2015-09-10', '2015-09-10', '2016-09-08', '2016-09-08'],
      dtype=object)

In [65]:
 pd.to_datetime( np.array(temp.columns[2:]) ,format='%m/%d/%Y' )

DatetimeIndex(['2023-07-31', '2023-04-30', '2023-01-31', '2022-10-31',
               '2022-07-31', '2022-04-30', '2022-01-31', '2021-10-31',
               '2021-07-31', '2021-04-30', '2021-01-31', '2020-10-31',
               '2020-07-31', '2020-04-30', '2020-01-31', '2019-10-31',
               '2019-07-31', '2019-04-30', '2019-01-31', '2018-10-31',
               '2018-07-31', '2018-04-30', '2018-01-31', '2017-10-31',
               '2017-07-31', '2017-04-30', '2017-01-31', '2016-10-31',
               '2016-07-31', '2016-04-30', '2016-01-31', '2015-10-31',
               '2015-07-31', '2015-04-30', '2015-01-31', '2014-10-31',
               '2014-07-31', '2014-04-30', '2014-01-31', '2013-10-31',
               '2013-07-31', '2013-04-30', '2013-01-31', '2012-10-31',
               '2012-07-31', '2012-04-30', '2012-01-31', '2011-10-31',
               '2011-07-31', '2011-04-30', '2011-01-31', '2010-10-31',
               '2010-07-31', '2010-04-30', '2010-01-31', '2009-10-31',
      

In [20]:
pd.to_datetime( temp.columns[1:] , '%m/%d/%Y' ) 

AssertionError: 

In [29]:
pd.to_datetime( np.array(temp.columns[1:]) , '%m/%d/%Y' ) 

AssertionError: 

In [33]:
np.array(temp.columns[1:]).astype(datetime64)

NameError: name 'datetime64' is not defined

In [32]:
np.array(temp.columns[1:]).astype(str)

array(['07/31/2023', '04/30/2023', '01/31/2023', '10/31/2022',
       '07/31/2022', '04/30/2022', '01/31/2022', '10/31/2021',
       '07/31/2021', '04/30/2021', '01/31/2021', '10/31/2020',
       '07/31/2020', '04/30/2020', '01/31/2020', '10/31/2019',
       '07/31/2019', '04/30/2019', '01/31/2019', '10/31/2018',
       '07/31/2018', '04/30/2018', '01/31/2018', '10/31/2017',
       '07/31/2017', '04/30/2017', '01/31/2017', '10/31/2016',
       '07/31/2016', '04/30/2016', '01/31/2016', '10/31/2015',
       '07/31/2015', '04/30/2015', '01/31/2015', '10/31/2014',
       '07/31/2014', '04/30/2014', '01/31/2014', '10/31/2013',
       '07/31/2013', '04/30/2013', '01/31/2013', '10/31/2012',
       '07/31/2012', '04/30/2012', '01/31/2012', '10/31/2011',
       '07/31/2011', '04/30/2011', '01/31/2011', '10/31/2010',
       '07/31/2010', '04/30/2010', '01/31/2010', '10/31/2009',
       '07/31/2009', '04/30/2009', '01/31/2009', '10/31/2008',
       '07/31/2008', '04/30/2008', '01/31/2008', '10/31

In [27]:
pd.DataFrame(temp.columns[1:])[0]

0     07/31/2023
1     04/30/2023
2     01/31/2023
3     10/31/2022
4     07/31/2022
         ...    
91    10/31/2000
92    07/31/2000
93    04/30/2000
94    01/31/2000
95    10/31/1999
Name: 0, Length: 96, dtype: object

In [42]:
pd.to_datetime( np.array(temp.columns[1:]) ,format='%m/%d/%Y' )

Timestamp('2023-07-31 00:00:00')

In [16]:
temp['name'][0].strip()

'TotalAssets'

In [181]:
target_list=[('price','Close')]

In [183]:
query_result = np.zeros((len(target_list), len(a.time_series) )).reshape()

In [186]:
np.shape(query_result)

(1, 19683)

In [2]:
import numpy as np

In [3]:
# Demo of the "uniform choice" index mapping used by the company readers:
# 14 target slots are spread over 8 source samples via int(k * 8 / 14).
np.arange(8)[(np.arange(14) * 8 / 14).astype(int)]

array([0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 5, 6, 6, 7])

In [72]:
np.array(a_temp['Date'])

array(['1982-11-30', '1982-12-01', '1982-12-02', ..., '2023-10-19',
       '2023-10-20', '2023-10-23'], dtype=object)

In [152]:
a.start + range( int((a.end - a.start ) / pd.Timedelta(days=1)) ) 

range(0, 19682)

In [38]:
a = pd.read_csv("../data/price/AAPL_price.csv")

In [39]:
a['Date']= pd.to_datetime(a['Date'],format='%Y-%m-%d')

In [99]:
b =pd.to_datetime(a_temp['Date'],format='%Y-%m-%d')

In [102]:
c= b + pd.Timedelta(days=1)

0        True
1        True
2        True
3        True
4        True
         ... 
10305    True
10306    True
10307    True
10308    True
10309    True
Name: Date, Length: 10310, dtype: bool

In [103]:
c -b 

0       1 days
1       1 days
2       1 days
3       1 days
4       1 days
         ...  
10305   1 days
10306   1 days
10307   1 days
10308   1 days
10309   1 days
Name: Date, Length: 10310, dtype: timedelta64[ns]

In [105]:
pd.to_datetime('1970-01-01',format='%Y-%m-%d') + pd.Timedelta(days=1)

Timestamp('1970-01-02 00:00:00')

In [98]:
pd.Timestamp(b[0])

Timestamp('1982-11-30 00:00:00')

In [90]:
datetime.datetime(np.array(b)[0])

TypeError: 'numpy.datetime64' object cannot be interpreted as an integer

In [88]:
datetime.datetime.fromtimestamp(b[0])

TypeError: 'Timestamp' object cannot be interpreted as an integer

In [75]:
np.array(pd.to_datetime(a_temp['Date']) , )

AttributeError: 'numpy.ndarray' object has no attribute 'date'

In [20]:


# datetime.strptime() : From string to datetime 

In [21]:
# Parse the first entry of a['Date'] into a datetime.date with the stdlib.
# NOTE(review): strptime needs a str, so this assumes a['Date'] still holds raw
# '%Y-%m-%d' strings (i.e. it has not been converted with pd.to_datetime) — confirm.
converted_date = datetime.datetime.strptime(np.array(a['Date'])[0],'%Y-%m-%d').date()
# Format codes: %Y = four-digit year, %m = two-digit month, %d = two-digit day.

added_date = converted_date + datetime.timedelta(days=1)
# Shift the parsed date forward by one day.


# Calculate the time difference 

datetime.date(1982, 12, 1)