# More Data Cleaning

## Interest Rate - US Data

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw US interest-rate series from CSV. The rendered output shows one
# row per month with a string DATE column and a numeric INTDSRUSM193N column.
interest_rate_us = pd.read_csv('Resources/Datasets/INTDSRUSM193N.csv')
interest_rate_us

Unnamed: 0,DATE,INTDSRUSM193N
0,1/1/2001,5.52
1,2/1/2001,5.00
2,3/1/2001,4.81
3,4/1/2001,4.28
4,5/1/2001,3.73
...,...,...
243,4/1/2021,0.25
244,5/1/2021,0.25
245,6/1/2021,0.25
246,7/1/2021,0.25


In [3]:
# Inspect the column labels of the freshly loaded frame.
interest_rate_us.columns

Index(['DATE', 'INTDSRUSM193N'], dtype='object')

In [4]:
# Check the column dtypes: DATE is still a plain object (string) column here,
# which is why it is parsed into datetimes in the next cell.
interest_rate_us.dtypes

DATE              object
INTDSRUSM193N    float64
dtype: object

In [5]:
# Parse the string DATE column into real datetimes, stored in a new "Date"
# column alongside the original.
interest_rate_us["Date"] = interest_rate_us["DATE"].pipe(pd.to_datetime)
interest_rate_us.head()

Unnamed: 0,DATE,INTDSRUSM193N,Date
0,1/1/2001,5.52,2001-01-01
1,2/1/2001,5.0,2001-02-01
2,3/1/2001,4.81,2001-03-01
3,4/1/2001,4.28,2001-04-01
4,5/1/2001,3.73,2001-05-01


In [6]:
# The raw string DATE column is superseded by the parsed "Date" column.
interest_rate_us = interest_rate_us.drop(columns=["DATE"])
interest_rate_us

Unnamed: 0,INTDSRUSM193N,Date
0,5.52,2001-01-01
1,5.00,2001-02-01
2,4.81,2001-03-01
3,4.28,2001-04-01
4,3.73,2001-05-01
...,...,...
243,0.25,2021-04-01
244,0.25,2021-05-01
245,0.25,2021-06-01
246,0.25,2021-07-01


In [7]:
# Reorder the columns so that Date comes first.
# The selection must be an ordered list: a set literal has no defined order
# (the resulting layout could differ between kernel sessions) and pandas 2.x
# rejects it outright with "ValueError: columns cannot be a set".
interest_rate_us = pd.DataFrame(data=interest_rate_us, columns=['Date', "INTDSRUSM193N"])
interest_rate_us

Unnamed: 0,Date,INTDSRUSM193N
0,2001-01-01,5.52
1,2001-02-01,5.00
2,2001-03-01,4.81
3,2001-04-01,4.28
4,2001-05-01,3.73
...,...,...
243,2021-04-01,0.25
244,2021-05-01,0.25
245,2021-06-01,0.25
246,2021-07-01,0.25


In [8]:
# Derive integer Year and Month columns from the parsed Date column; Year is
# the grouping key for the yearly average computed further down.
interest_rate_us['Year'] = interest_rate_us['Date'].dt.year
interest_rate_us['Month'] = interest_rate_us['Date'].dt.month
interest_rate_us

Unnamed: 0,Date,INTDSRUSM193N,Year,Month
0,2001-01-01,5.52,2001,1
1,2001-02-01,5.00,2001,2
2,2001-03-01,4.81,2001,3
3,2001-04-01,4.28,2001,4
4,2001-05-01,3.73,2001,5
...,...,...,...,...
243,2021-04-01,0.25,2021,4
244,2021-05-01,0.25,2021,5
245,2021-06-01,0.25,2021,6
246,2021-07-01,0.25,2021,7


In [9]:
# Year and Month now carry the date information, so Date itself can go.
interest_rate_us = interest_rate_us.drop(columns=["Date"])
interest_rate_us

Unnamed: 0,INTDSRUSM193N,Year,Month
0,5.52,2001,1
1,5.00,2001,2
2,4.81,2001,3
3,4.28,2001,4
4,3.73,2001,5
...,...,...,...
243,0.25,2021,4
244,0.25,2021,5
245,0.25,2021,6
246,0.25,2021,7


In [10]:
# Put the date parts first and the measured value last.
interest_rate_us = pd.DataFrame(
    data=interest_rate_us,
    columns=['Year', 'Month', "INTDSRUSM193N"],
)
interest_rate_us

Unnamed: 0,Year,Month,INTDSRUSM193N
0,2001,1,5.52
1,2001,2,5.00
2,2001,3,4.81
3,2001,4,4.28
4,2001,5,3.73
...,...,...,...
243,2021,4,0.25
244,2021,5,0.25
245,2021,6,0.25
246,2021,7,0.25


In [11]:
# Give the series-code column a readable name. Year and Month keep their
# names, so only the column that actually changes is listed in the mapping.
interest_rate_us = interest_rate_us.rename(columns={"INTDSRUSM193N": "Interest_Rate(%)"})
interest_rate_us

Unnamed: 0,Year,Month,Interest_Rate(%)
0,2001,1,5.52
1,2001,2,5.00
2,2001,3,4.81
3,2001,4,4.28
4,2001,5,3.73
...,...,...,...
243,2021,4,0.25
244,2021,5,0.25
245,2021,6,0.25
246,2021,7,0.25


In [12]:
# Collapse the monthly rows to one row per year by averaging every remaining
# column (Month is averaged too and dropped in a later cell).
# NOTE(review): the monthly data shown above ends at 7/1/2021, so the 2021
# mean covers only part of the year -- confirm that is intended.
interest_rate_yearly = interest_rate_us.groupby('Year').mean()
interest_rate_yearly

Unnamed: 0_level_0,Month,Interest_Rate(%)
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,6.5,3.41
2002,6.5,1.173333
2003,6.5,2.104167
2004,6.5,2.395833
2005,6.5,4.25
2006,6.5,6.020833
2007,6.5,5.791667
2008,6.5,2.166667
2009,6.5,0.5
2010,6.5,0.729167


In [13]:
# Move Year out of the index and back into a regular column so it can serve
# as the merge key later on.
# NOTE: this cell is not idempotent -- running it a second time would add an
# extra "index" column.
interest_rate_yearly = interest_rate_yearly.reset_index()
interest_rate_yearly

Unnamed: 0,Year,Month,Interest_Rate(%)
0,2001,6.5,3.41
1,2002,6.5,1.173333
2,2003,6.5,2.104167
3,2004,6.5,2.395833
4,2005,6.5,4.25
5,2006,6.5,6.020833
6,2007,6.5,5.791667
7,2008,6.5,2.166667
8,2009,6.5,0.5
9,2010,6.5,0.729167


In [14]:
# The averaged Month value carries no meaning at yearly granularity.
interest_rate_yearly = interest_rate_yearly.drop(columns=['Month'])
interest_rate_yearly

Unnamed: 0,Year,Interest_Rate(%)
0,2001,3.41
1,2002,1.173333
2,2003,2.104167
3,2004,2.395833
4,2005,4.25
5,2006,6.020833
6,2007,5.791667
7,2008,2.166667
8,2009,0.5
9,2010,0.729167


## Working population data

In [15]:
# Load the raw working-population series from CSV. The rendered output shows
# one row per month with a string DATE column and a LFWA64TTUSM647S value.
working_pop = pd.read_csv('Resources/Datasets/LFWA64TTUSM647S.csv')
working_pop

Unnamed: 0,DATE,LFWA64TTUSM647S
0,1/1/2001,180406010.5
1,2/1/2001,180564078.8
2,3/1/2001,180627181.6
3,4/1/2001,180903832.3
4,5/1/2001,181015684.9
...,...,...
251,12/1/2021,205161455.9
252,1/1/2022,207145172.9
253,2/1/2022,207124343.6
254,3/1/2022,206950351.2


In [16]:
# Parse the string DATE column into real datetimes, stored in a new "Date"
# column alongside the original.
working_pop["Date"] = working_pop["DATE"].pipe(pd.to_datetime)
working_pop.head()

Unnamed: 0,DATE,LFWA64TTUSM647S,Date
0,1/1/2001,180406010.5,2001-01-01
1,2/1/2001,180564078.8,2001-02-01
2,3/1/2001,180627181.6,2001-03-01
3,4/1/2001,180903832.3,2001-04-01
4,5/1/2001,181015684.9,2001-05-01


In [17]:
# The raw string DATE column is superseded by the parsed "Date" column.
working_pop = working_pop.drop(columns=["DATE"])
working_pop

Unnamed: 0,LFWA64TTUSM647S,Date
0,180406010.5,2001-01-01
1,180564078.8,2001-02-01
2,180627181.6,2001-03-01
3,180903832.3,2001-04-01
4,181015684.9,2001-05-01
...,...,...
251,205161455.9,2021-12-01
252,207145172.9,2022-01-01
253,207124343.6,2022-02-01
254,206950351.2,2022-03-01


In [18]:
# Reorder the columns so that Date comes first.
# The selection must be an ordered list: a set literal has no defined order
# (the resulting layout could differ between kernel sessions) and pandas 2.x
# rejects it outright with "ValueError: columns cannot be a set".
working_pop = pd.DataFrame(data=working_pop, columns=['Date', "LFWA64TTUSM647S"])
working_pop

Unnamed: 0,Date,LFWA64TTUSM647S
0,2001-01-01,180406010.5
1,2001-02-01,180564078.8
2,2001-03-01,180627181.6
3,2001-04-01,180903832.3
4,2001-05-01,181015684.9
...,...,...
251,2021-12-01,205161455.9
252,2022-01-01,207145172.9
253,2022-02-01,207124343.6
254,2022-03-01,206950351.2


In [19]:
# Derive integer Year and Month columns from the parsed Date column; Year is
# the grouping key for the yearly average computed further down.
working_pop['Year'] = working_pop['Date'].dt.year
working_pop['Month'] = working_pop['Date'].dt.month
working_pop

Unnamed: 0,Date,LFWA64TTUSM647S,Year,Month
0,2001-01-01,180406010.5,2001,1
1,2001-02-01,180564078.8,2001,2
2,2001-03-01,180627181.6,2001,3
3,2001-04-01,180903832.3,2001,4
4,2001-05-01,181015684.9,2001,5
...,...,...,...,...
251,2021-12-01,205161455.9,2021,12
252,2022-01-01,207145172.9,2022,1
253,2022-02-01,207124343.6,2022,2
254,2022-03-01,206950351.2,2022,3


In [20]:
# Year and Month now carry the date information, so Date itself can go.
working_pop = working_pop.drop(columns=["Date"])
working_pop

Unnamed: 0,LFWA64TTUSM647S,Year,Month
0,180406010.5,2001,1
1,180564078.8,2001,2
2,180627181.6,2001,3
3,180903832.3,2001,4
4,181015684.9,2001,5
...,...,...,...
251,205161455.9,2021,12
252,207145172.9,2022,1
253,207124343.6,2022,2
254,206950351.2,2022,3


In [21]:
# Confirm which columns remain before reordering them.
working_pop.columns

Index(['LFWA64TTUSM647S', 'Year', 'Month'], dtype='object')

In [22]:
# Put the date parts first and the measured value last.
working_pop = pd.DataFrame(
    data=working_pop,
    columns=['Year', 'Month', "LFWA64TTUSM647S"],
)
working_pop

Unnamed: 0,Year,Month,LFWA64TTUSM647S
0,2001,1,180406010.5
1,2001,2,180564078.8
2,2001,3,180627181.6
3,2001,4,180903832.3
4,2001,5,181015684.9
...,...,...,...
251,2021,12,205161455.9
252,2022,1,207145172.9
253,2022,2,207124343.6
254,2022,3,206950351.2


In [23]:
# Give the series-code column a readable name. Year and Month keep their
# names, so only the column that actually changes is listed in the mapping.
working_pop = working_pop.rename(columns={"LFWA64TTUSM647S": "working_population"})
working_pop

Unnamed: 0,Year,Month,working_population
0,2001,1,180406010.5
1,2001,2,180564078.8
2,2001,3,180627181.6
3,2001,4,180903832.3
4,2001,5,181015684.9
...,...,...,...
251,2021,12,205161455.9
252,2022,1,207145172.9
253,2022,2,207124343.6
254,2022,3,206950351.2


In [24]:
# Collapse the monthly rows to one row per year by averaging every remaining
# column (Month is averaged too and dropped in a later cell).
# NOTE(review): the monthly data shown above ends at 3/1/2022, so the 2022
# mean covers only part of the year -- confirm that is intended.
working_pop_yearly = working_pop.groupby('Year').mean()
working_pop_yearly

Unnamed: 0_level_0,Month,working_population
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,6.5,181476600.0
2002,6.5,183792700.0
2003,6.5,186939800.0
2004,6.5,188763100.0
2005,6.5,191025000.0
2006,6.5,193219400.0
2007,6.5,195663600.0
2008,6.5,196691500.0
2009,6.5,197897500.0
2010,6.5,199183800.0


In [25]:
# Move Year out of the index and back into a regular column so it can serve
# as the merge key later on.
# NOTE: this cell is not idempotent -- running it a second time would add an
# extra "index" column.
working_pop_yearly= working_pop_yearly.reset_index()
working_pop_yearly

Unnamed: 0,Year,Month,working_population
0,2001,6.5,181476600.0
1,2002,6.5,183792700.0
2,2003,6.5,186939800.0
3,2004,6.5,188763100.0
4,2005,6.5,191025000.0
5,2006,6.5,193219400.0
6,2007,6.5,195663600.0
7,2008,6.5,196691500.0
8,2009,6.5,197897500.0
9,2010,6.5,199183800.0


In [26]:
# The averaged Month value carries no meaning at yearly granularity.
working_pop_yearly = working_pop_yearly.drop(columns=['Month'])
working_pop_yearly

Unnamed: 0,Year,working_population
0,2001,181476600.0
1,2002,183792700.0
2,2003,186939800.0
3,2004,188763100.0
4,2005,191025000.0
5,2006,193219400.0
6,2007,195663600.0
7,2008,196691500.0
8,2009,197897500.0
9,2010,199183800.0


In [27]:
# Check dtypes: the yearly mean left working_population as float64.
working_pop_yearly.dtypes

Year                    int64
working_population    float64
dtype: object

In [28]:
# Cast every column to int64 so working_population is stored as a whole
# number of people.
# NOTE(review): a float -> int64 cast discards the fractional part instead of
# rounding; confirm truncation is acceptable for these yearly averages.
working_pop_yearly=working_pop_yearly.astype('int64')
working_pop_yearly

Unnamed: 0,Year,working_population
0,2001,181476647
1,2002,183792729
2,2003,186939817
3,2004,188763071
4,2005,191024953
5,2006,193219398
6,2007,195663562
7,2008,196691536
8,2009,197897475
9,2010,199183839


## Stock market to GDP (%) data

In [29]:
# Load the raw stock-market-to-GDP series from CSV. The rendered output shows
# one row per year (dated January 1st) with a DDDM01USA156NWDB value column.
stock_to_GDP_percent = pd.read_csv('Resources/Datasets/DDDM01USA156NWDB.csv')
stock_to_GDP_percent

Unnamed: 0,DATE,DDDM01USA156NWDB
0,1/1/2001,132.148
1,1/1/2002,101.0791
2,1/1/2003,124.5066
3,1/1/2004,133.6506
4,1/1/2005,130.4083
5,1/1/2006,141.6542
6,1/1/2007,137.8527
7,1/1/2008,78.7766
8,1/1/2009,104.3488
9,1/1/2010,115.2841


In [30]:
# Append a hard-coded observation for 2020 (194.490) as a new last row.
# NOTE(review): the origin of this value is not recorded in the notebook --
# confirm and document its provenance.
# The guard keeps the cell idempotent: without it, every re-run would append
# another 2020 row, duplicating that year in the later merge on 'Year'.
if not stock_to_GDP_percent["DATE"].eq('1/1/2020').any():
    stock_to_GDP_percent.loc[len(stock_to_GDP_percent.index)] = ['1/1/2020', 194.490]
stock_to_GDP_percent

Unnamed: 0,DATE,DDDM01USA156NWDB
0,1/1/2001,132.148
1,1/1/2002,101.0791
2,1/1/2003,124.5066
3,1/1/2004,133.6506
4,1/1/2005,130.4083
5,1/1/2006,141.6542
6,1/1/2007,137.8527
7,1/1/2008,78.7766
8,1/1/2009,104.3488
9,1/1/2010,115.2841


In [31]:
# Parse the string DATE column into real datetimes, stored in a new "Date"
# column alongside the original.
stock_to_GDP_percent["Date"] = stock_to_GDP_percent["DATE"].pipe(pd.to_datetime)
stock_to_GDP_percent.head()

Unnamed: 0,DATE,DDDM01USA156NWDB,Date
0,1/1/2001,132.148,2001-01-01
1,1/1/2002,101.0791,2002-01-01
2,1/1/2003,124.5066,2003-01-01
3,1/1/2004,133.6506,2004-01-01
4,1/1/2005,130.4083,2005-01-01


In [32]:
# The raw string DATE column is superseded by the parsed "Date" column.
stock_to_GDP_percent = stock_to_GDP_percent.drop(columns=["DATE"])
stock_to_GDP_percent

Unnamed: 0,DDDM01USA156NWDB,Date
0,132.148,2001-01-01
1,101.0791,2002-01-01
2,124.5066,2003-01-01
3,133.6506,2004-01-01
4,130.4083,2005-01-01
5,141.6542,2006-01-01
6,137.8527,2007-01-01
7,78.7766,2008-01-01
8,104.3488,2009-01-01
9,115.2841,2010-01-01


In [33]:
# Reorder the columns so that Date comes first.
# The selection must be an ordered list: a set literal has no defined order
# (the resulting layout could differ between kernel sessions) and pandas 2.x
# rejects it outright with "ValueError: columns cannot be a set".
stock_to_GDP_percent = pd.DataFrame(data=stock_to_GDP_percent, columns=['Date', "DDDM01USA156NWDB"])
stock_to_GDP_percent

Unnamed: 0,Date,DDDM01USA156NWDB
0,2001-01-01,132.148
1,2002-01-01,101.0791
2,2003-01-01,124.5066
3,2004-01-01,133.6506
4,2005-01-01,130.4083
5,2006-01-01,141.6542
6,2007-01-01,137.8527
7,2008-01-01,78.7766
8,2009-01-01,104.3488
9,2010-01-01,115.2841


In [34]:
# Derive integer Year and Month columns from the parsed Date column; Year is
# the key used when the yearly tables are merged at the end of the notebook.
stock_to_GDP_percent['Year'] = stock_to_GDP_percent['Date'].dt.year
stock_to_GDP_percent['Month'] = stock_to_GDP_percent['Date'].dt.month
stock_to_GDP_percent

Unnamed: 0,Date,DDDM01USA156NWDB,Year,Month
0,2001-01-01,132.148,2001,1
1,2002-01-01,101.0791,2002,1
2,2003-01-01,124.5066,2003,1
3,2004-01-01,133.6506,2004,1
4,2005-01-01,130.4083,2005,1
5,2006-01-01,141.6542,2006,1
6,2007-01-01,137.8527,2007,1
7,2008-01-01,78.7766,2008,1
8,2009-01-01,104.3488,2009,1
9,2010-01-01,115.2841,2010,1


In [35]:
# Year and Month now carry the date information, so Date itself can go.
stock_to_GDP_percent = stock_to_GDP_percent.drop(columns=["Date"])
stock_to_GDP_percent

Unnamed: 0,DDDM01USA156NWDB,Year,Month
0,132.148,2001,1
1,101.0791,2002,1
2,124.5066,2003,1
3,133.6506,2004,1
4,130.4083,2005,1
5,141.6542,2006,1
6,137.8527,2007,1
7,78.7766,2008,1
8,104.3488,2009,1
9,115.2841,2010,1


In [36]:
# Put the date parts first and the measured value last.
stock_to_GDP_percent = pd.DataFrame(
    data=stock_to_GDP_percent,
    columns=['Year', 'Month', "DDDM01USA156NWDB"],
)
stock_to_GDP_percent

Unnamed: 0,Year,Month,DDDM01USA156NWDB
0,2001,1,132.148
1,2002,1,101.0791
2,2003,1,124.5066
3,2004,1,133.6506
4,2005,1,130.4083
5,2006,1,141.6542
6,2007,1,137.8527
7,2008,1,78.7766
8,2009,1,104.3488
9,2010,1,115.2841


In [37]:
# Give the series-code column a readable name. Year and Month keep their
# names, so only the column that actually changes is listed in the mapping.
stock_to_GDP_percent = stock_to_GDP_percent.rename(columns={"DDDM01USA156NWDB": "stock_to_GDP(%)"})
stock_to_GDP_percent

Unnamed: 0,Year,Month,stock_to_GDP(%)
0,2001,1,132.148
1,2002,1,101.0791
2,2003,1,124.5066
3,2004,1,133.6506
4,2005,1,130.4083
5,2006,1,141.6542
6,2007,1,137.8527
7,2008,1,78.7766
8,2009,1,104.3488
9,2010,1,115.2841


In [38]:
# The series already has one row per year, so no aggregation is done here:
# the yearly table is simply the frame without its Month column.
stock_to_GDP_percent_yearly = stock_to_GDP_percent.drop(columns=['Month'])
stock_to_GDP_percent_yearly

Unnamed: 0,Year,stock_to_GDP(%)
0,2001,132.148
1,2002,101.0791
2,2003,124.5066
3,2004,133.6506
4,2005,130.4083
5,2006,141.6542
6,2007,137.8527
7,2008,78.7766
8,2009,104.3488
9,2010,115.2841


## Inflation data

In [39]:
# Load the raw inflation series from CSV. The rendered output shows one row
# per year (dated January 1st) with a FPCPITOTLZGUSA value column.
inflation = pd.read_csv('Resources/Datasets/FPCPITOTLZGUSA.csv')
inflation

Unnamed: 0,DATE,FPCPITOTLZGUSA
0,1/1/2001,2.826171
1,1/1/2002,1.586032
2,1/1/2003,2.270095
3,1/1/2004,2.677237
4,1/1/2005,3.392747
5,1/1/2006,3.225944
6,1/1/2007,2.852672
7,1/1/2008,3.8391
8,1/1/2009,-0.355546
9,1/1/2010,1.640043


In [40]:
# Parse the string DATE column into real datetimes, stored in a new "Date"
# column alongside the original.
inflation["Date"] = inflation["DATE"].pipe(pd.to_datetime)
inflation.head()

Unnamed: 0,DATE,FPCPITOTLZGUSA,Date
0,1/1/2001,2.826171,2001-01-01
1,1/1/2002,1.586032,2002-01-01
2,1/1/2003,2.270095,2003-01-01
3,1/1/2004,2.677237,2004-01-01
4,1/1/2005,3.392747,2005-01-01


In [41]:
# The raw string DATE column is superseded by the parsed "Date" column.
inflation = inflation.drop(columns=["DATE"])
inflation

Unnamed: 0,FPCPITOTLZGUSA,Date
0,2.826171,2001-01-01
1,1.586032,2002-01-01
2,2.270095,2003-01-01
3,2.677237,2004-01-01
4,3.392747,2005-01-01
5,3.225944,2006-01-01
6,2.852672,2007-01-01
7,3.8391,2008-01-01
8,-0.355546,2009-01-01
9,1.640043,2010-01-01


In [42]:
# Reorder the columns so that Date comes first.
# The selection must be an ordered list: a set literal has no defined order
# (the resulting layout could differ between kernel sessions) and pandas 2.x
# rejects it outright with "ValueError: columns cannot be a set".
inflation = pd.DataFrame(data=inflation, columns=['Date', "FPCPITOTLZGUSA"])
inflation

Unnamed: 0,Date,FPCPITOTLZGUSA
0,2001-01-01,2.826171
1,2002-01-01,1.586032
2,2003-01-01,2.270095
3,2004-01-01,2.677237
4,2005-01-01,3.392747
5,2006-01-01,3.225944
6,2007-01-01,2.852672
7,2008-01-01,3.8391
8,2009-01-01,-0.355546
9,2010-01-01,1.640043


In [43]:
# Derive integer Year and Month columns from the parsed Date column; Year is
# the key used when the yearly tables are merged at the end of the notebook.
inflation['Year'] = inflation['Date'].dt.year
inflation['Month'] = inflation['Date'].dt.month
inflation

Unnamed: 0,Date,FPCPITOTLZGUSA,Year,Month
0,2001-01-01,2.826171,2001,1
1,2002-01-01,1.586032,2002,1
2,2003-01-01,2.270095,2003,1
3,2004-01-01,2.677237,2004,1
4,2005-01-01,3.392747,2005,1
5,2006-01-01,3.225944,2006,1
6,2007-01-01,2.852672,2007,1
7,2008-01-01,3.8391,2008,1
8,2009-01-01,-0.355546,2009,1
9,2010-01-01,1.640043,2010,1


In [44]:
# Year and Month now carry the date information, so Date itself can go.
inflation = inflation.drop(columns=["Date"])
inflation

Unnamed: 0,FPCPITOTLZGUSA,Year,Month
0,2.826171,2001,1
1,1.586032,2002,1
2,2.270095,2003,1
3,2.677237,2004,1
4,3.392747,2005,1
5,3.225944,2006,1
6,2.852672,2007,1
7,3.8391,2008,1
8,-0.355546,2009,1
9,1.640043,2010,1


In [45]:
# Put the date parts first and the measured value last.
inflation = pd.DataFrame(
    data=inflation,
    columns=['Year', 'Month', "FPCPITOTLZGUSA"],
)
inflation

Unnamed: 0,Year,Month,FPCPITOTLZGUSA
0,2001,1,2.826171
1,2002,1,1.586032
2,2003,1,2.270095
3,2004,1,2.677237
4,2005,1,3.392747
5,2006,1,3.225944
6,2007,1,2.852672
7,2008,1,3.8391
8,2009,1,-0.355546
9,2010,1,1.640043


In [46]:
# Give the series-code column a readable name. Year and Month keep their
# names, so only the column that actually changes is listed in the mapping.
inflation = inflation.rename(columns={"FPCPITOTLZGUSA": "inflation(%)"})
inflation

Unnamed: 0,Year,Month,inflation(%)
0,2001,1,2.826171
1,2002,1,1.586032
2,2003,1,2.270095
3,2004,1,2.677237
4,2005,1,3.392747
5,2006,1,3.225944
6,2007,1,2.852672
7,2008,1,3.8391
8,2009,1,-0.355546
9,2010,1,1.640043


In [47]:
# The series already has one row per year, so no aggregation is done here:
# the yearly table is simply the frame without its Month column.
inflation_yearly = inflation.drop(columns=['Month'])
inflation_yearly

Unnamed: 0,Year,inflation(%)
0,2001,2.826171
1,2002,1.586032
2,2003,2.270095
3,2004,2.677237
4,2005,3.392747
5,2006,3.225944
6,2007,2.852672
7,2008,3.8391
8,2009,-0.355546
9,2010,1.640043


In [48]:
# Join inflation with stock-market-to-GDP on the shared Year key. The inner
# join keeps only the years present in both yearly tables.
df_merged_1 = pd.merge(
    inflation_yearly,
    stock_to_GDP_percent_yearly,
    on='Year',
    how='inner',
)
df_merged_1

Unnamed: 0,Year,inflation(%),stock_to_GDP(%)
0,2001,2.826171,132.148
1,2002,1.586032,101.0791
2,2003,2.270095,124.5066
3,2004,2.677237,133.6506
4,2005,3.392747,130.4083
5,2006,3.225944,141.6542
6,2007,2.852672,137.8527
7,2008,3.8391,78.7766
8,2009,-0.355546,104.3488
9,2010,1.640043,115.2841


In [49]:
# Add the yearly working-population figures, again inner-joined on Year.
df_merged_2 = pd.merge(
    df_merged_1,
    working_pop_yearly,
    on='Year',
    how='inner',
)
df_merged_2

Unnamed: 0,Year,inflation(%),stock_to_GDP(%),working_population
0,2001,2.826171,132.148,181476647
1,2002,1.586032,101.0791,183792729
2,2003,2.270095,124.5066,186939817
3,2004,2.677237,133.6506,188763071
4,2005,3.392747,130.4083,191024953
5,2006,3.225944,141.6542,193219398
6,2007,2.852672,137.8527,195663562
7,2008,3.8391,78.7766,196691536
8,2009,-0.355546,104.3488,197897475
9,2010,1.640043,115.2841,199183839


In [50]:
# Final join: add the yearly average interest rate, inner-joined on Year, so
# the result has one row per year that appears in all four tables.
df_merged = pd.merge(
    df_merged_2,
    interest_rate_yearly,
    on='Year',
    how='inner',
)
df_merged

Unnamed: 0,Year,inflation(%),stock_to_GDP(%),working_population,Interest_Rate(%)
0,2001,2.826171,132.148,181476647,3.41
1,2002,1.586032,101.0791,183792729,1.173333
2,2003,2.270095,124.5066,186939817,2.104167
3,2004,2.677237,133.6506,188763071,2.395833
4,2005,3.392747,130.4083,191024953,4.25
5,2006,3.225944,141.6542,193219398,6.020833
6,2007,2.852672,137.8527,195663562,5.791667
7,2008,3.8391,78.7766,196691536,2.166667
8,2009,-0.355546,104.3488,197897475,0.5
9,2010,1.640043,115.2841,199183839,0.729167


In [51]:
# Persist the merged yearly table; index=False keeps the positional row index
# out of the CSV so Year is the first column of the file.
df_merged.to_csv('Resources/cleaned_data/4_more_factors.csv', index=False)
