# Data preparation

### Install and importing libraries

In [1]:
!pip install yfinance

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting yfinance
  Downloading yfinance-0.1.70-py2.py3-none-any.whl (26 kB)
Collecting lxml>=4.5.1
  Downloading lxml-4.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 6.4 MB/s 
[?25hCollecting requests>=2.26
  Downloading requests-2.27.1-py2.py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 1.1 MB/s 
Installing collected packages: requests, lxml, yfinance
  Attempting uninstall: requests
    Found existing installation: requests 2.23.0
    Uninstalling requests-2.23.0:
      Successfully uninstalled requests-2.23.0
  Attempting uninstall: lxml
    Found existing installation: lxml 4.2.6
    Uninstalling lxml-4.2.6:
      Successfully uninstalled lxml-4.2.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 

In [2]:
import pandas as pd
import numpy as np
import yfinance as yf
import datetime

import base64
from IPython.display import HTML

### Uploading data

In [3]:
url = "https://raw.githubusercontent.com/Agablue-red/Machine-Learning/master/data/CONVICTIONLISTTOPN_BSLD-408.csv"
df = pd.read_csv(url, index_col=False, names=['info', 'date', 'symbol', 'symbol2', 'sector', 'number', 'score'])
df

Unnamed: 0,info,date,symbol,symbol2,sector,number,score
0,10:01:54.481 77425 [77425-thread-2] INFO a.s....,2004-02-11,SU,SU,Energy Minerals,GN63J3-R,0.953727
1,10:01:54.481 77425 [77425-thread-2] INFO a.s....,2004-02-11,GGG,GGG,Producer Manufacturing,H5490W-R,0.952753
2,10:01:54.481 77425 [77425-thread-2] INFO a.s....,2004-02-11,WGR,WGR,Energy Minerals,V0622Q-R,0.947634
3,10:01:54.481 77425 [77425-thread-2] INFO a.s....,2004-02-11,CWT,CWT,Utilities,GSWXLY-R,0.934181
4,10:01:54.481 77425 [77425-thread-2] INFO a.s....,2004-02-11,BLL,BLL,Process Industries,VFT0VQ-R,0.922862
...,...,...,...,...,...,...,...
37355,10:27:03.049 77425 [77425-thread-2] INFO a.s....,2022-02-09,PEP,PEP,Consumer Non-Durables,PPCTFP-R,0.701507
37356,10:27:03.049 77425 [77425-thread-2] INFO a.s....,2022-02-09,SSNC,SSNC,Technology Services,G92RX2-R,0.701123
37357,10:27:03.049 77425 [77425-thread-2] INFO a.s....,2022-02-09,GEF,GEF,Process Industries,MPX0N4-R,0.697954
37358,10:27:03.049 77425 [77425-thread-2] INFO a.s....,2022-02-09,DPZ,DPZ,Consumer Services,F05QG0-R,0.697741


In [4]:
# Removing column data
df.drop(['info','symbol2','number'], axis=1, inplace=True)

In [5]:
# Convert argument to datetime
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

In [6]:
df.head()

Unnamed: 0_level_0,symbol,sector,score
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2004-02-11,SU,Energy Minerals,0.953727
2004-02-11,GGG,Producer Manufacturing,0.952753
2004-02-11,WGR,Energy Minerals,0.947634
2004-02-11,CWT,Utilities,0.934181
2004-02-11,BLL,Process Industries,0.922862


### Information about dataset

In [7]:
print('Shape of raw dataset: {}'.format(df.shape))

Shape of raw dataset: (37360, 3)


In [8]:
# Return the data type of each column
df.dtypes

symbol     object
sector     object
score     float64
dtype: object

In [9]:
print('Number of unique dates: {}'.format(df.index.nunique()))

Number of unique dates: 467


In [10]:
# Return the number of missing values
df.isnull().sum()

symbol    0
sector    0
score     0
dtype: int64

In [11]:
print('Number of duplicate rows: {}'.format(df.duplicated().sum()))

Number of duplicate rows: 0


In [12]:
df.symbol.unique()

array(['SU', 'GGG', 'WGR', ..., 'DELL', 'BOOT', 'AGCO'], dtype=object)

In [13]:
print('Number of unique symbols: {}'.format(df.symbol.nunique()))

Number of unique symbols: 1834


In [14]:
df.sector.unique()

array(['Energy Minerals', 'Producer Manufacturing', 'Utilities',
       'Process Industries', 'Consumer Services', 'Transportation',
       'Retail Trade', 'Finance', 'Health Technology', 'Miscellaneous',
       'Non-Energy Minerals', 'Distribution Services',
       'Consumer Non-Durables', 'Commercial Services',
       'Technology Services', 'Consumer Durables', 'Health Services',
       'Electronic Technology', 'Industrial Services', 'Communications'],
      dtype=object)

In [15]:
print('Number of unique sectors: {}'.format(df.sector.nunique()))

Number of unique sectors: 20


In [16]:
# basic statistics
df.score.describe()

count    37360.000000
mean         0.731634
std          0.118071
min          0.413554
25%          0.655228
50%          0.743032
75%          0.813181
max          0.987225
Name: score, dtype: float64

### Download Financial Data from Yahoo

In [17]:
# delete an unnecessary part in the 'symbol' column
df['symbol'] = df['symbol'].str.replace(".", " ")
df['symbol'] = df['symbol'].str.split(' ')

xyz = []
for x in df["symbol"].to_numpy():
  xyz.append(x[0])
df["symbol"] = xyz

  


In [18]:
tickers = df.symbol.unique()
list_tickers = tickers.tolist()
Symbol = yf.Tickers(list_tickers)

In [19]:
df_yahoo = yf.download(list_tickers, start='2004-02-10', end='2022-02-10', interval="1d")['Close']

[*********************100%***********************]  1804 of 1804 completed

413 Failed downloads:
- NTLS: No data found for this date range, symbol may be delisted
- CHSI: No data found for this date range, symbol may be delisted
- EE: Data doesn't exist for startDate = 1076371200, endDate = 1644451200
- CATM: No data found, symbol may be delisted
- BBX: No data found, symbol may be delisted
- WYE: No data found for this date range, symbol may be delisted
- DFODQ: No data found, symbol may be delisted
- LAACZ: No data found, symbol may be delisted
- LEARQ: No data found, symbol may be delisted
- QLGC: No data found for this date range, symbol may be delisted
- ABI: No data found for this date range, symbol may be delisted
- PIKE: No data found for this date range, symbol may be delisted
- BPL: No data found, symbol may be delisted
- TLRDQ: No data found, symbol may be delisted
- NGLS: No data found for this date range, symbol may be delisted
- RAVN: No data found, symbol may be deliste

In [20]:
df_yahoo.head(5)

Unnamed: 0_level_0,A,AACB,AAIC,AAP,AAPL,AAT,AAWW,ABBV,ABC,ABCD,...,XTO,XYL,YELL,YLWDF,YUM,ZBRA,ZD,ZLC,ZQKSQ,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-02-09,,,,,,,,,,,...,,,,,,,,,,
2004-02-10,26.731045,,500.600006,27.280001,0.410357,,,,14.1675,,...,,,239025.0,,11.926671,43.740002,9.769565,,,
2004-02-11,26.752504,,514.799988,27.753332,0.425,,,,14.25,,...,,,244200.0,,12.117182,47.119999,10.5,,,
2004-02-12,26.409157,,515.400024,28.046667,0.42375,,,,14.145,,...,,,247500.0,,12.67793,46.866669,10.326087,,,
2004-02-13,26.523605,,512.0,27.933332,0.410714,,,,14.1525,,...,,,245775.0,,12.692308,46.453335,10.186957,,,


In [21]:
print('Shape of dataset from Yahoo: {}'.format(df_yahoo.shape))

Shape of dataset from Yahoo: (4552, 1804)


In [22]:
columns_nan = df_yahoo.columns[df_yahoo.isna().all()].tolist()
print('Number of missing index: {}'.format(len(columns_nan)))

Number of missing index: 413


### Preparation financial data

In [23]:
# create copy DataFrame
data = df_yahoo.copy(deep=True)

In [24]:
# remove missing values from columns
data.dropna(how='any', axis=1, thresh=1, inplace=True)
# remove missing values from rows
data.dropna(how='any', axis=0, thresh=3, inplace=True)

In [25]:
data_ = data.reset_index()

In [26]:
# unpivot a DataFrame
data2 = pd.melt(data_, id_vars='Date', value_vars=data.columns.to_list())
data2

Unnamed: 0,Date,variable,value
0,2004-02-10,A,26.731045
1,2004-02-11,A,26.752504
2,2004-02-12,A,26.409157
3,2004-02-13,A,26.523605
4,2004-02-17,A,26.816881
...,...,...,...
6305398,2022-02-03,ZTS,200.919998
6305399,2022-02-04,ZTS,199.539993
6305400,2022-02-07,ZTS,200.320007
6305401,2022-02-08,ZTS,201.300003


In [27]:
# Return the number of missing values
data2.isnull().sum()

Date             0
variable         0
value       849112
dtype: int64

In [28]:
print('Number of data without missing: {}'.format(len(data2) - data2.value.isnull().sum()))

Number of data without missing: 5456291


In [29]:
# removing missing values 
data2.dropna(inplace=True)

In [30]:
data2.isnull().sum()

Date        0
variable    0
value       0
dtype: int64

In [31]:
print('Number of weekly: {}'.format(data2.Date.nunique()))

Number of weekly: 4533


In [32]:
print('Shape of dataset from Yahoo without empty index: {}'.format(data2.shape))

Shape of dataset from Yahoo without empty index: (5456291, 3)


### Calculation of the rate of return

In [33]:
#create empty columns
data2["return_rate"] = np.nan

#create new DataFrame
df_rr = pd.DataFrame(columns=['Date', 'symbol', 'value', 'return_rate'])

#create symbol list
symbols = data2["variable"].unique().tolist()

for sym in symbols:

    data_symbol = data2.loc[data2["variable"] == sym]

    for i in range(0, len(data_symbol)):
        if i+1<len(data_symbol):
            data_symbol["return_rate"].iloc[i+1] = (data_symbol["value"].iloc[i+1]/data_symbol["value"].iloc[i])-1 
    
    df_rr = pd.concat([df_rr, data_symbol])

df_rr

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  app.launch_new_instance()


Unnamed: 0,Date,symbol,value,return_rate,variable
0,2004-02-10,,26.731045,,A
1,2004-02-11,,26.752504,0.000803,A
2,2004-02-12,,26.409157,-0.012834,A
3,2004-02-13,,26.523605,0.004334,A
4,2004-02-17,,26.816881,0.011057,A
...,...,...,...,...,...
6305398,2022-02-03,,200.919998,-0.006183,ZTS
6305399,2022-02-04,,199.539993,-0.006868,ZTS
6305400,2022-02-07,,200.320007,0.003909,ZTS
6305401,2022-02-08,,201.300003,0.004892,ZTS


In [34]:
# test
df_rr.loc[df_rr['variable'] == "SU"]

Unnamed: 0,Date,symbol,value,return_rate,variable
5321742,2004-02-10,,13.175000,,SU
5321743,2004-02-11,,13.285000,0.008349,SU
5321744,2004-02-12,,12.960000,-0.024464,SU
5321745,2004-02-13,,12.830000,-0.010031,SU
5321746,2004-02-17,,12.985000,0.012081,SU
...,...,...,...,...,...
5326270,2022-02-03,,29.219999,-0.038816,SU
5326271,2022-02-04,,28.719999,-0.017112,SU
5326272,2022-02-07,,28.990000,0.009401,SU
5326273,2022-02-08,,28.469999,-0.017937,SU


In [35]:
#Pivot table
df_width = df_rr.pivot(index='Date', columns='variable', values='return_rate')

In [36]:
#Converting Date column
df_width = df_width.reset_index()
df_width['Date'] =  pd.to_datetime(df_width['Date'])

In [37]:
df_width

variable,Date,A,AAIC,AAP,AAPL,AAT,AAWW,ABBV,ABC,ABCD,...,XPO,XRAY,XRX,XYL,YELL,YLWDF,YUM,ZBRA,ZD,ZTS
0,2004-02-10,,,,,,,,,,...,,,,,,,,,,
1,2004-02-11,0.000803,0.028366,0.017351,0.035684,,,,0.005823,,...,0.000000,0.014908,0.002000,,0.021650,,0.015974,0.077275,0.074766,
2,2004-02-12,-0.012834,0.001166,0.010569,-0.002941,,,,-0.007368,,...,-0.071970,-0.007458,0.015303,,0.013514,,0.046277,-0.005376,-0.016563,
3,2004-02-13,0.004334,-0.006597,-0.004041,-0.030763,,,,0.000530,,...,-0.004082,-0.007741,-0.013761,,-0.006970,,0.001134,-0.008819,-0.013474,
4,2004-02-17,0.011057,0.027734,0.014320,0.006956,,,,0.006889,,...,-0.004098,0.011703,0.016611,,-0.005188,,0.006514,0.000143,0.007256,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4528,2022-02-03,-0.016986,-0.002915,-0.017703,-0.016720,-0.021164,-0.011508,0.015221,0.013228,,...,-0.021798,-0.018840,-0.006790,-0.120273,-0.079717,0.0,0.001278,-0.016847,-0.048428,-0.006183
4529,2022-02-04,-0.004725,0.000000,-0.023657,-0.002950,-0.000285,-0.038933,-0.000568,-0.009211,,...,-0.028693,-0.005703,0.004558,-0.012439,-0.054861,0.0,-0.000638,-0.010250,0.004990,-0.006868
4530,2022-02-07,-0.005315,0.014620,-0.004626,-0.004235,0.001138,0.020321,0.013367,0.020496,,...,-0.000943,-0.003442,-0.015427,-0.011601,0.019348,0.0,-0.001517,-0.009442,0.006951,0.003909
4531,2022-02-08,0.003135,-0.008646,0.018855,0.018467,-0.002274,0.002809,0.006876,0.020300,,...,0.011953,0.016884,-0.014286,0.021350,0.064935,0.0,-0.000879,-0.013324,0.012129,0.004892


In [38]:
#Creating missing date
row_date = [pd.to_datetime('2018-12-05')]
row_nan = np.repeat(np.nan, len(df_width.columns)-1).tolist()
row_new = row_date + row_nan

#Adding missing row
df_width.loc[-1] = row_new
df_width = df_width.fillna(0)

#sorting by date
df_width = df_width.sort_values(by="Date")

In [39]:
#Setting index
df_width.index = range(len(df_width))

In [40]:
#Creating dataframe with dates
df_rates = pd.DataFrame()

var2 = 1
data_list = []
for i in range(len(df_width)):

    if df_width["Date"].iloc[i].dayofweek != 2:
        continue
    elif df_width["Date"].iloc[i].dayofweek == 2:
        var2 = var2+1
        if var2==2:
            var2=0
            data_list.append(df_width["Date"].iloc[i])

df_rates["Date"]=pd.Series(data_list)

In [41]:
#Sum of rates for 2 weeks
for c in range(1, len(df_width.columns)):

    var = 0
    var_list = []
    table_var = df_width[df_width.columns[c]]
    var2 = 1
    
    for i in range(len(table_var)):
        if df_width["Date"].iloc[i].dayofweek != 2:
            var = var + table_var.iloc[i]
        elif df_width["Date"].iloc[i].dayofweek == 2:
            var2 = var2+1
            if var2==2:
                var2=0
                var = var + table_var.iloc[i]
                var_list.append(var)
                var = 0
        df_rates[df_width.columns[c]]=pd.Series(var_list)



In [43]:
#unpivoting tables
df_long = pd.melt(df_rates, id_vars='Date', value_vars=df_rates.columns.to_list())

#Merging tables
df_marged = pd.merge(df, df_long,  how='left', left_on=['date','symbol'], right_on = ['Date','variable'])



In [44]:
df_marged

Unnamed: 0,symbol,sector,score,Date,variable,value
0,SU,Energy Minerals,0.953727,2004-02-11,SU,0.008349
1,GGG,Producer Manufacturing,0.952753,2004-02-11,GGG,0.011734
2,WGR,Energy Minerals,0.947634,NaT,,
3,CWT,Utilities,0.934181,2004-02-11,CWT,0.004778
4,BLL,Process Industries,0.922862,2004-02-11,BLL,-0.004917
...,...,...,...,...,...,...
37355,PEP,Consumer Non-Durables,0.701507,2022-02-09,PEP,-0.003515
37356,SSNC,Technology Services,0.701123,2022-02-09,SSNC,0.058040
37357,GEF,Process Industries,0.697954,2022-02-09,GEF,-0.023572
37358,DPZ,Consumer Services,0.697741,2022-02-09,DPZ,0.063856


In [45]:
df_yahoo_2w = pd.DataFrame()
df_yahoo_agr = df_yahoo
df_yahoo_agr=df_yahoo_agr.reset_index()
df_yahoo_agr["Date"] = pd.to_datetime(df_yahoo_agr["Date"])

#Creating missing date
row_date = [pd.to_datetime('2018-12-05')]
row_nan = np.repeat(np.nan, len(df_yahoo_agr.columns.values.tolist())-1).tolist()
row_new = row_date + row_nan

#Adding missing row
#df_yahoo_agr.loc[-1] = row_new
df_yahoo_agr = df_yahoo_agr.fillna(0)

#sorting by date
df_yahoo_agr = df_yahoo_agr.sort_values(by="Date")

#Setting index
df_yahoo_agr.index = range(len(df_yahoo_agr))

#Creating dataframe with dates
df_yahoo_2w = pd.DataFrame()

var2 = 1
data_list = []
for i in range(len(df_yahoo_agr)):

    if df_yahoo_agr["Date"].iloc[i].dayofweek != 2:
        continue
    elif df_yahoo_agr["Date"].iloc[i].dayofweek == 2:
        var2 = var2+1
        if var2==2:
            var2=0
            data_list.append(df_yahoo_agr["Date"].iloc[i])

df_yahoo_2w["Date"]=pd.Series(data_list)

#Sum of rates for 2 weeks
for c in range(1, len(df_yahoo_agr.columns)):

    var = []
    var_list = []
    table_var = df_yahoo_agr[df_yahoo_agr.columns[c]]
    var2 = 1
    
    for i in range(len(table_var)):
        if df_yahoo_agr["Date"].iloc[i].dayofweek != 2:
            var.append(table_var.iloc[i])
        elif df_yahoo_agr["Date"].iloc[i].dayofweek == 2:
            var2 = var2+1
            if var2==2:
                var2=0
                var = var + table_var.iloc[i]
                var_list.append(var.mean())
                var = []
        df_yahoo_2w[df_yahoo_agr.columns[c]]=pd.Series(var_list)



In [46]:
df_yahoo_2w

Unnamed: 0,Date,A,AACB,AAIC,AAP,AAPL,AAT,AAWW,ABBV,ABC,...,XTO,XYL,YELL,YLWDF,YUM,ZBRA,ZD,ZLC,ZQKSQ,ZTS
0,2004-02-11,40.118027,0.0,765.099991,41.393332,0.630179,0.000000,0.000000,0.000000,21.333750,...,0.0,0.000000,363712.500000,0.000000,18.080517,68.990000,15.384782,0.0,0.0,0.000000
1,2004-02-25,49.841611,0.0,1065.771441,53.142858,0.813724,0.000000,0.000000,0.000000,28.454643,...,0.0,0.000000,474503.571429,0.000000,25.711719,89.077144,19.506832,0.0,0.0,0.000000
2,2004-03-10,46.909871,0.0,1085.525032,54.147501,0.940246,0.000000,0.000000,0.000000,28.263438,...,0.0,0.000000,458034.375000,0.000000,26.861072,94.012500,19.051087,0.0,0.0,0.000000
3,2004-03-24,42.621603,0.0,1032.650002,52.200000,0.923348,0.000000,0.000000,0.000000,26.795000,...,0.0,0.000000,487659.375000,0.000000,26.611252,90.915000,16.925544,0.0,0.0,0.000000
4,2004-04-07,45.432761,0.0,1016.624992,54.086667,0.979866,0.000000,0.000000,0.000000,27.212188,...,0.0,0.000000,515259.375000,0.000000,27.715224,95.436668,19.399457,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462,2021-12-15,307.388748,0.0,7.141250,472.234997,350.072502,71.139999,174.806252,252.979997,243.972497,...,0.0,240.997499,25.945000,22.000000,262.751247,1195.444992,217.409999,0.0,0.0,459.233753
463,2021-12-29,315.267138,0.0,7.007143,474.844288,353.957147,72.711427,183.924287,267.361428,261.421425,...,0.0,236.472858,25.092857,22.000000,273.201429,1189.350002,220.274287,0.0,0.0,485.928567
464,2022-01-12,301.293743,0.0,7.150000,474.668749,351.644999,76.095001,177.021247,272.962492,268.872505,...,0.0,233.215000,22.602500,22.056875,265.078745,1103.969994,219.658754,0.0,0.0,438.091248
465,2022-01-26,273.509992,0.0,6.871429,459.019996,325.887146,72.191426,157.648568,268.005709,263.680004,...,0.0,211.255714,20.824286,22.070000,245.639998,980.082864,210.169996,0.0,0.0,391.767140


In [47]:
#unpivoting tables
df_yahoo_long = pd.melt(df_yahoo_2w, id_vars='Date', value_vars=df_yahoo_2w.columns.to_list())
df_yahoo_long

Unnamed: 0,Date,variable,value
0,2004-02-11,A,40.118027
1,2004-02-25,A,49.841611
2,2004-03-10,A,46.909871
3,2004-03-24,A,42.621603
4,2004-04-07,A,45.432761
...,...,...,...
842463,2021-12-15,ZTS,459.233753
842464,2021-12-29,ZTS,485.928567
842465,2022-01-12,ZTS,438.091248
842466,2022-01-26,ZTS,391.767140


In [48]:
#Merging tables
df_marged2 = pd.merge(df_marged, df_yahoo_long,  how='left', left_on=['symbol','Date'], right_on = ['variable','Date'])

df_marged2

Unnamed: 0,symbol,sector,score,Date,variable_x,value_x,variable_y,value_y
0,SU,Energy Minerals,0.953727,2004-02-11,SU,0.008349,SU,19.872500
1,GGG,Producer Manufacturing,0.952753,2004-02-11,GGG,0.011734,GGG,14.028889
2,WGR,Energy Minerals,0.947634,NaT,,,,
3,CWT,Utilities,0.934181,2004-02-11,CWT,0.004778,CWT,22.045000
4,BLL,Process Industries,0.922862,2004-02-11,BLL,-0.004917,BLL,12.162500
...,...,...,...,...,...,...,...,...
37355,PEP,Consumer Non-Durables,0.701507,2022-02-09,PEP,-0.003515,PEP,344.388752
37356,SSNC,Technology Services,0.701123,2022-02-09,SSNC,0.058040,SSNC,161.817498
37357,GEF,Process Industries,0.697954,2022-02-09,GEF,-0.023572,GEF,115.113750
37358,DPZ,Consumer Services,0.697741,2022-02-09,DPZ,0.063856,DPZ,887.071259


### Preparation of the target dataset

In [49]:
#Dropping columnd
df_marged2.drop(["variable_y", "variable_x"], inplace=True, axis=1)

#Replacing 0 with NaN values
df_marged2["value_y"].replace(0,np.nan, inplace=True)

#Dropping missing values
df_marged2=df_marged2.dropna()

#Sorting by date and symbol
df_marged2 = df_marged2.sort_values(by=["Date","symbol"], ascending=True)

#Changing column names
df_marged2.rename(columns = {'value_y':'close','value_x':'return_rate'}, inplace = True)

In [50]:
df_marged2

Unnamed: 0,symbol,sector,score,Date,return_rate,close
65,AEE,Utilities,0.670127,2004-02-11,0.002350,70.309999
40,AOS,Producer Manufacturing,0.753176,2004-02-11,0.007533,8.005000
5,APA,Energy Minerals,0.912117,2004-02-11,0.005808,59.630001
66,ARLP,Energy Minerals,0.669621,2004-02-11,-0.011510,13.578750
63,ATO,Utilities,0.672410,2004-02-11,0.000765,39.230000
...,...,...,...,...,...,...
37315,WGO,Consumer Durables,0.778997,2022-02-09,0.078665,133.997500
37309,WM,Industrial Services,0.802717,2022-02-09,0.002500,293.646246
37280,WSO,Producer Manufacturing,0.948063,2022-02-09,-0.002957,549.996265
37352,WSO,Producer Manufacturing,0.710300,2022-02-09,-0.002957,549.996265


### Download CSV

In [51]:
def create_download_link( df, title = "Download CSV file", filename = "data_rates.csv"):
    csv = df.to_csv()
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

create_download_link(df_marged2)