In [4]:
import pandas as pd

## Timeline Data

In [5]:
timeline = pd.read_excel('RawData/timeline_data.xlsx')
timeline

Unnamed: 0,Year,Date,Description
0,2010,January 12,"A 7.0 magnitude earthquake in Haiti kills 230,..."
1,2010,February 18,2010 Nigerien coup d'état.
2,2010,March 29,2010 Moscow Metro bombings.
3,2010,April 10,"The President of Poland, Lech Kaczyński, is am..."
4,2010,April 20,The largest oil spill in US history occurs in ...
...,...,...,...
173,2021,October 6,The World Health Organization endorses the fir...
174,2021,November 16,Russia draws international condemnation follow...
175,2021,November 30,Barbados becomes a republic on its 55th annive...
176,2021,December 9,"A truck crash in Chiapas, Mexico, kills 55 mig..."


## Financial Data
#### As the unit of currency is different for all countries, to ensure all currencies is on a similar/same scale we use SDR values.

In [6]:
data = pd.read_csv('RawData/cleaned_exchange_rate.csv', index_col=0)
data

Unnamed: 0_level_0,Algeria,Australia,Botswana,Brazil,Brunei,Canada,Chile,China,Colombia,Czech Republic,...,Singapore,South Africa,Sweden,Switzerland,Thailand,Trinidadian,U.A.E.,U.K.,U.S.,Uruguay
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1/4/16,0.006710,0.520544,0.064140,0.184561,0.506341,0.515910,0.001015,0.110659,0.000229,0.029066,...,0.506341,0.046369,0.085704,0.721685,0.019929,0.112887,0.196236,1.064440,0.720675,0.024011
1/5/16,0.006723,0.522630,0.064223,0.179481,0.508786,0.518022,0.001011,0.111183,0.000226,0.028827,...,0.508786,0.045834,0.085161,0.717691,0.020065,0.112887,0.197377,1.063310,0.724868,0.024011
1/6/16,0.006723,0.516299,0.063713,0.180693,0.506380,0.514650,0.001014,0.110623,0.000226,0.028768,...,0.506380,0.045834,0.084140,0.718581,0.020014,0.112829,0.197368,1.062820,0.724833,0.023704
1/7/16,0.006725,0.508623,0.063135,0.179236,0.503045,0.513742,0.001001,0.109555,0.000222,0.029042,...,0.503045,0.044869,0.084140,0.722156,0.019900,0.112381,0.196698,1.052210,0.722373,0.023704
1/8/16,0.006717,0.510064,0.063352,0.178473,0.503849,0.510471,0.001001,0.109636,0.000220,0.029042,...,0.503849,0.045201,0.085012,0.722946,0.019936,0.112734,0.196697,1.054800,0.722368,0.023704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12/17/21,0.005148,0.512476,0.060994,0.125551,0.523769,0.556632,0.000842,0.112212,0.000179,0.032066,...,0.523769,0.045137,0.079100,0.777566,0.021421,0.105541,0.195082,0.950087,0.715050,0.016118
12/20/21,0.005150,0.509316,0.060897,0.125583,0.523903,0.553576,0.000846,0.112324,0.000179,0.032014,...,0.523903,0.045197,0.078342,0.775660,0.021391,0.105821,0.195082,0.945806,0.716438,0.016159
12/21/21,0.005151,0.509225,0.061210,0.124794,0.524090,0.553808,0.000829,0.112359,0.000179,0.032035,...,0.524090,0.045170,0.078409,0.776261,0.021255,0.105660,0.194937,0.948326,0.715907,0.016121
12/22/21,0.005145,0.510355,0.060888,0.125096,0.524203,0.556148,0.000824,0.112259,0.000179,0.032027,...,0.524203,0.045058,0.078187,0.775299,0.021188,0.105660,0.194822,0.952668,0.715485,0.016121


In [7]:
data.isnull().sum()

Algeria           1
Australia         0
Botswana          0
Brazil            0
Brunei            0
Canada            1
Chile             0
China             0
Colombia          0
Czech Republic    0
Denmark           0
EUR               0
India             0
Israel            0
Japan             0
Korean            0
Kuwait            0
Malaysia          0
Mauritius         0
Mexico            0
New Zealand       0
Norway            0
Oman              0
Peru              1
Philippines       0
Poland            0
Qatar             0
Russia            0
Saudi Arabia      1
Singapore         0
South Africa      0
Sweden            0
Switzerland       0
Thailand          0
Trinidadian       1
U.A.E.            0
U.K.              0
U.S.              0
Uruguay           0
dtype: int64

In [8]:
# Filter Methods: These methods assess the relevance of features based on statistical measures and are computationally efficient.
# Correlation Analysis: Compute pairwise correlations between features and remove highly correlated features to reduce redundancy.
# Variance Thresholding: Remove features with low variance, as they might not provide much information.

In [9]:
columns = data.columns
correlation_matrix = data[columns].corr()
correlation_matrix

Unnamed: 0,Algeria,Australia,Botswana,Brazil,Brunei,Canada,Chile,China,Colombia,Czech Republic,...,Singapore,South Africa,Sweden,Switzerland,Thailand,Trinidadian,U.A.E.,U.K.,U.S.,Uruguay
Algeria,1.0,0.342581,0.61834,0.895926,0.027727,-0.148533,0.704479,-0.006304,0.837978,-0.575713,...,0.025774,0.492556,0.245088,-0.687243,-0.470424,0.626312,0.521776,-0.023084,0.521876,0.900862
Australia,0.342581,1.0,0.563456,0.564856,0.044198,0.526395,0.657443,0.476546,0.570394,0.00962,...,0.041908,0.650095,0.797956,-0.206835,-0.56883,-0.216156,-0.250178,0.172197,-0.251127,0.582719
Botswana,0.61834,0.563456,1.0,0.837592,0.380042,0.243956,0.877022,0.112075,0.855699,0.127805,...,0.379327,0.966714,0.250604,-0.678771,-0.169115,-0.038106,0.058977,-0.152148,0.058861,0.81543
Brazil,0.895926,0.564856,0.837592,1.0,0.202294,0.124549,0.879778,0.109094,0.939439,-0.285928,...,0.200823,0.756859,0.374553,-0.735998,-0.460422,0.314392,0.300429,-0.07953,0.299955,0.97297
Brunei,0.027727,0.044198,0.380042,0.202294,1.0,0.330376,0.205675,-0.128344,0.225932,0.248062,...,0.998333,0.36006,-0.256553,-0.236432,0.415599,-0.122369,0.046814,-0.204081,0.046566,0.085392
Canada,-0.148533,0.526395,0.243956,0.124549,0.330376,1.0,0.156038,0.251185,0.079935,0.330991,...,0.331729,0.363367,0.333482,0.090858,-0.100838,-0.331478,-0.205942,0.064723,-0.20638,0.03002
Chile,0.704479,0.657443,0.877022,0.879778,0.205675,0.156038,1.0,0.210706,0.920745,-0.028313,...,0.204306,0.825011,0.430401,-0.778895,-0.38521,0.034095,0.019252,-0.016113,0.018515,0.892543
China,-0.006304,0.476546,0.112075,0.109094,-0.128344,0.251185,0.210706,1.0,0.132616,0.051518,...,-0.130001,0.277242,0.570023,-0.119112,-0.620049,-0.220628,-0.422932,0.616343,-0.424023,0.168668
Colombia,0.837978,0.570394,0.855699,0.939439,0.225932,0.079935,0.920745,0.132616,1.0,-0.20334,...,0.224164,0.773614,0.359871,-0.782551,-0.422979,0.22181,0.202951,-0.031212,0.202619,0.950748
Czech Republic,-0.575713,0.00962,0.127805,-0.285928,0.248062,0.330991,-0.028313,0.051518,-0.20334,1.0,...,0.248803,0.206961,0.023438,0.090706,0.404886,-0.818626,-0.701665,0.046476,-0.701842,-0.302852


In [10]:
data

Unnamed: 0_level_0,Algeria,Australia,Botswana,Brazil,Brunei,Canada,Chile,China,Colombia,Czech Republic,...,Singapore,South Africa,Sweden,Switzerland,Thailand,Trinidadian,U.A.E.,U.K.,U.S.,Uruguay
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1/4/16,0.006710,0.520544,0.064140,0.184561,0.506341,0.515910,0.001015,0.110659,0.000229,0.029066,...,0.506341,0.046369,0.085704,0.721685,0.019929,0.112887,0.196236,1.064440,0.720675,0.024011
1/5/16,0.006723,0.522630,0.064223,0.179481,0.508786,0.518022,0.001011,0.111183,0.000226,0.028827,...,0.508786,0.045834,0.085161,0.717691,0.020065,0.112887,0.197377,1.063310,0.724868,0.024011
1/6/16,0.006723,0.516299,0.063713,0.180693,0.506380,0.514650,0.001014,0.110623,0.000226,0.028768,...,0.506380,0.045834,0.084140,0.718581,0.020014,0.112829,0.197368,1.062820,0.724833,0.023704
1/7/16,0.006725,0.508623,0.063135,0.179236,0.503045,0.513742,0.001001,0.109555,0.000222,0.029042,...,0.503045,0.044869,0.084140,0.722156,0.019900,0.112381,0.196698,1.052210,0.722373,0.023704
1/8/16,0.006717,0.510064,0.063352,0.178473,0.503849,0.510471,0.001001,0.109636,0.000220,0.029042,...,0.503849,0.045201,0.085012,0.722946,0.019936,0.112734,0.196697,1.054800,0.722368,0.023704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12/17/21,0.005148,0.512476,0.060994,0.125551,0.523769,0.556632,0.000842,0.112212,0.000179,0.032066,...,0.523769,0.045137,0.079100,0.777566,0.021421,0.105541,0.195082,0.950087,0.715050,0.016118
12/20/21,0.005150,0.509316,0.060897,0.125583,0.523903,0.553576,0.000846,0.112324,0.000179,0.032014,...,0.523903,0.045197,0.078342,0.775660,0.021391,0.105821,0.195082,0.945806,0.716438,0.016159
12/21/21,0.005151,0.509225,0.061210,0.124794,0.524090,0.553808,0.000829,0.112359,0.000179,0.032035,...,0.524090,0.045170,0.078409,0.776261,0.021255,0.105660,0.194937,0.948326,0.715907,0.016121
12/22/21,0.005145,0.510355,0.060888,0.125096,0.524203,0.556148,0.000824,0.112259,0.000179,0.032027,...,0.524203,0.045058,0.078187,0.775299,0.021188,0.105660,0.194822,0.952668,0.715485,0.016121


In [11]:
from sklearn.feature_selection import VarianceThreshold

threshold_value = 0.0000001
selector = VarianceThreshold(threshold=threshold_value)

selector.fit(data)
selected_features = selector.get_support(indices=True)
selected_columns = data.columns[selected_features]
data_selected = selector.transform(data)
data_selected_df = pd.DataFrame(data_selected, columns=selected_columns)

print("Selected Features:")
# print(data_selected_df.head())
data_selected_df.head()

Selected Features:


Unnamed: 0,Algeria,Australia,Botswana,Brazil,Brunei,Canada,China,Czech Republic,Denmark,EUR,...,Singapore,South Africa,Sweden,Switzerland,Thailand,Trinidadian,U.A.E.,U.K.,U.S.,Uruguay
0,0.00671,0.520544,0.06414,0.184561,0.506341,0.51591,0.110659,0.029066,0.105253,0.785392,...,0.506341,0.046369,0.085704,0.721685,0.019929,0.112887,0.196236,1.06444,0.720675,0.024011
1,0.006723,0.52263,0.064223,0.179481,0.508786,0.518022,0.111183,0.028827,0.104409,0.778943,...,0.508786,0.045834,0.085161,0.717691,0.020065,0.112887,0.197377,1.06331,0.724868,0.024011
2,0.006723,0.516299,0.063713,0.180693,0.50638,0.51465,0.110623,0.028768,0.104368,0.778616,...,0.50638,0.045834,0.08414,0.718581,0.020014,0.112829,0.197368,1.06282,0.724833,0.023704
3,0.006725,0.508623,0.063135,0.179236,0.503045,0.513742,0.109555,0.029042,0.105244,0.785075,...,0.503045,0.044869,0.08414,0.722156,0.0199,0.112381,0.196698,1.05221,0.722373,0.023704
4,0.006717,0.510064,0.063352,0.178473,0.503849,0.510471,0.109636,0.029042,0.105173,0.784564,...,0.503849,0.045201,0.085012,0.722946,0.019936,0.112734,0.196697,1.0548,0.722368,0.023704


In [12]:
data_selected_df.columns

Index(['Algeria', 'Australia', 'Botswana', 'Brazil', 'Brunei', 'Canada',
       'China', 'Czech Republic', 'Denmark', 'EUR', 'India', 'Israel',
       'Kuwait', 'Malaysia', 'Mauritius', 'Mexico', 'New Zealand', 'Norway',
       'Oman', 'Peru', 'Philippines', 'Poland', 'Qatar', 'Russia',
       'Saudi Arabia', 'Singapore', 'South Africa', 'Sweden', 'Switzerland',
       'Thailand', 'Trinidadian', 'U.A.E.', 'U.K.', 'U.S.', 'Uruguay'],
      dtype='object')

## ------------------------------------Time Series Data filtering------------------------------------

In [13]:
oldData = pd.read_excel('RawData/timeline_data.xlsx')
newData = pd.read_excel('RawData/newTimeLine.xlsx')
newData

Unnamed: 0,Events
0,7.0 magnitude earthquake in Haiti - Date: Janu...
1,"2010 Nigerien coup d'état - Date: February 18,..."
2,"2010 Moscow Metro bombings - Date: March 29, 2..."
3,"Plane crash in Smolensk, Russia - Date: April ..."
4,"Deepwater Horizon oil spill - Date: April 20, ..."
...,...
173,The World Health Organization endorses the fir...
174,Russia draws international condemnation follow...
175,Barbados becomes a republic on its 55th annive...
176,"A truck crash in Chiapas, Mexico, kills 55 mig..."


In [14]:
# months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
ls = []
for i in newData['Events']:
    print(i.split("Date")[1].split("Countr")[0])

: January 12, 2010. 
: February 18, 2010. 
: March 29, 2010. 
: April 10, 2010. 
: April 20, 2010. 
: May 31, 2010. 
: July 22, 2010. 
: November 23, 2010. 
: February 22, 2011. 
: March 11, 2011. 
: May 2, 2011. 
: July 9, 2011. 
: July 22, 2011. 
s: August 6-10, 2011. 
: October 20, 2011. 
s: December 2010 - ongoing. 
: May 7, 2012. 
: July 20, 2012. 
: September 11–12, 2012. 
: October 14, 2012. 
: October 29–30, 2012. 
: November 6, 2012. 
: November 15, 2012. 
: December 8, 2012. 
: December 3, 2012. 
: December 14, 2012. 
: February 15, 2013. 
: February 28, 2013. 
: March 5, 2013. 
: April 24, 2013. 
: May 22, 2013. 
: July 1, 2013. 
: July 3, 2013. 
: July 22, 2013. 
: August 21, 2013. 
: September 9–28, 2013. 
: September 17, 2013. 
: October 15, 2013. 
: November 2–11, 2013. 
: November 21, 2013. 
: December 5, 2013. 
s: November 2013 - ongoing. 
: March 8, 2014. 
: April 16, 2014. 
: May 22, 2014. 
: June 19, 2014. 
: August 9, 2014. 
: November 12, 2014. 
: December 16, 201

In [15]:
import re

pattern = r'\b(?:Country|Countries|Region|Regions|Planet)\b'
date = r'\b(?:Date|Dates)\b'
date_pattern = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s(?:\d{1,2},?\s)?(?:\d{4}|\d{4}\s?-\s?ongoing|\d{4}\s?-\s?present|\d{4}\s?-\s?\d{4}|January|February|March|April|May|June|July|August|September|October|November|December\s\d{4})'

ls = []
for i in newData['Events']:
    ls.append(re.split(f'{date}: ', i)[1])

country = []
date_Country = []
for j in ls:
    date_Country.append(re.findall(date_pattern, j))
    country.append(re.split(f'{pattern}: ', j)[1][:-1])

for i in range(len(date_Country)):
    if(len(date_Country[i]) > 0):
        date_Country[i] = date_Country[i][0]
    else:
        date_Country[i] = ""

In [16]:
pd.DataFrame({"Country": country, "Date": date_Country})

Unnamed: 0,Country,Date
0,Haiti,"January 12, 2010"
1,Niger,"February 18, 2010"
2,Russia,"March 29, 2010"
3,Russia,"April 10, 2010"
4,United States (Gulf of Mexico),"April 20, 2010"
...,...,...
173,International,"October 6, 2021"
174,Russia,"November 15, 2021"
175,Barbados,"November 30, 2021"
176,Mexico,"December 8, 2021"


In [17]:
oldData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Year         178 non-null    int64 
 1   Date         178 non-null    object
 2   Description  178 non-null    object
dtypes: int64(1), object(2)
memory usage: 4.3+ KB


In [18]:
for i in range(177):
    if(i < len(country)-1):
        continue
    else:
        country.append("")

In [19]:
len(country)

178

In [20]:
country

['Haiti',
 'Niger',
 'Russia',
 'Russia',
 'United States (Gulf of Mexico)',
 'International waters (Gaza Strip)',
 'Colombia and Venezuela',
 'South Korea (Yeonpyeong Island)',
 'New Zealand (Christchurch)',
 'Japan',
 'Pakistan',
 'South Sudan',
 'Norway',
 'United Kingdom',
 'Libya',
 'Tunisia, Egypt, Libya, Yemen, Bahrain, and others',
 'Russia',
 'United States (Aurora, Colorado)',
 'Libya',
 'United States (jump took place over New Mexico)',
 'United States, Caribbean',
 'United States',
 'China',
 'Qatar (conference held in Doha)',
 'Philippines',
 'United States (Newtown, Connecticut)',
 'Russia',
 'Vatican City',
 'Venezuela',
 'Bangladesh',
 'United Kingdom (Woolwich)',
 'Croatia',
 'Egypt',
 'United Kingdom',
 'Syria',
 'Philippines',
 'International release',
 'Philippines',
 'Philippines, Vietnam',
 'Ukraine',
 'South Africa',
 'Ukraine, Russia',
 'International waters (disappeared en route from Kuala Lumpur to Beijing)',
 'South Korea',
 'Thailand',
 'Spain',
 'United Sta

In [21]:
oldData['FullDate'] = [pd.to_datetime(f"{date}, {year}") if "–" not in f"{date}, {year}" else "" for date,year in zip(oldData['Date'], oldData['Year'])]

In [22]:
oldData['Country'] = country

In [23]:
oldData

Unnamed: 0,Year,Date,Description,FullDate,Country
0,2010,January 12,"A 7.0 magnitude earthquake in Haiti kills 230,...",2010-01-12 00:00:00,Haiti
1,2010,February 18,2010 Nigerien coup d'état.,2010-02-18 00:00:00,Niger
2,2010,March 29,2010 Moscow Metro bombings.,2010-03-29 00:00:00,Russia
3,2010,April 10,"The President of Poland, Lech Kaczyński, is am...",2010-04-10 00:00:00,Russia
4,2010,April 20,The largest oil spill in US history occurs in ...,2010-04-20 00:00:00,United States (Gulf of Mexico)
...,...,...,...,...,...
173,2021,October 6,The World Health Organization endorses the fir...,2021-10-06 00:00:00,International
174,2021,November 16,Russia draws international condemnation follow...,2021-11-16 00:00:00,Russia
175,2021,November 30,Barbados becomes a republic on its 55th annive...,2021-11-30 00:00:00,Barbados
176,2021,December 9,"A truck crash in Chiapas, Mexico, kills 55 mig...",2021-12-09 00:00:00,Mexico


In [21]:
oldData.to_csv('RawData/updatedTimeline.csv', index=False)

In [22]:
Unique_Countries = []
for i in oldData['Country'].unique():
    if i.find('(') != -1:
        Unique_Countries.append(i[:i.find('(')].strip())

Unique_Countries = list(set(Unique_Countries))
Unique_Countries

['North Korea',
 'Turkey',
 'United States',
 'International',
 'China',
 'South Korea',
 'Lebanon',
 'Austria',
 'Philippines',
 'Qatar',
 'International waters',
 'North America',
 'France',
 'Iraq',
 'United Kingdom',
 'New Zealand',
 'Falkland Islands']

In [24]:
countries = {"Austria": "EUR", "France": "EUR", "North America": "U.S.", "South Korea": "Korean", "United Kingdom": "U.K.", "United States": "U.S.", "Vatican City": "EUR", "Ukraine": "EUR", "Spain": "EUR", "El Salvador": "U.S.", "Portugal": "EUR", "Germany": "EUR", "Europe": "EUR"}
    
for index, row in oldData.iterrows():
    i = row['Country']
    if i.find('(') != -1:
        i = i[:i.find('(')].strip()
    if i in countries:
        oldData.at[index, 'Country'] = countries[i]
    else:
        if "," in i:
            sub_countries = i.split(",")
            for idx, val in enumerate(sub_countries):
                strip_val = val.strip()
                if strip_val in countries:
                    sub_countries[idx] = countries[strip_val]
                else:
                    sub_countries[idx] = strip_val
            oldData.at[index, 'Country'] = ",".join(sub_countries)
        else:
            oldData.at[index, 'Country'] = i

oldData

Unnamed: 0,Year,Date,Description,FullDate,Country
0,2010,January 12,"A 7.0 magnitude earthquake in Haiti kills 230,...",2010-01-12 00:00:00,Haiti
1,2010,February 18,2010 Nigerien coup d'état.,2010-02-18 00:00:00,Niger
2,2010,March 29,2010 Moscow Metro bombings.,2010-03-29 00:00:00,Russia
3,2010,April 10,"The President of Poland, Lech Kaczyński, is am...",2010-04-10 00:00:00,Russia
4,2010,April 20,The largest oil spill in US history occurs in ...,2010-04-20 00:00:00,U.S.
...,...,...,...,...,...
173,2021,October 6,The World Health Organization endorses the fir...,2021-10-06 00:00:00,International
174,2021,November 16,Russia draws international condemnation follow...,2021-11-16 00:00:00,Russia
175,2021,November 30,Barbados becomes a republic on its 55th annive...,2021-11-30 00:00:00,Barbados
176,2021,December 9,"A truck crash in Chiapas, Mexico, kills 55 mig...",2021-12-09 00:00:00,Mexico


In [25]:
oldData['Country'].unique()

array(['Haiti', 'Niger', 'Russia', 'U.S.', 'International waters',
       'Colombia and Venezuela', 'Korean', 'New Zealand', 'Japan',
       'Pakistan', 'South Sudan', 'Norway', 'U.K.', 'Libya',
       'Tunisia,Egypt,Libya,Yemen,Bahrain,and others', 'U.S.,Caribbean',
       'China', 'Qatar', 'Philippines', 'EUR', 'Venezuela', 'Bangladesh',
       'Croatia', 'Egypt', 'Syria', 'International release',
       'Philippines,Vietnam', 'South Africa', 'EUR,Russia', 'Thailand',
       'International', 'Russia,Kazakhstan,Belarus,Armenia,Kyrgyzstan',
       'Nigeria', 'Kenya', 'Cuba', 'Switzerland,Italy', 'Malaysia',
       'Mexico', 'Somalia', 'Singapore,North Korea', 'Saudi Arabia',
       'Turkey', 'India', 'International Space Station', 'Australia',
       'Iraq', 'Iran', 'North Korea', 'Bulgaria', 'Lebanon', 'Africa',
       'Armenia,Azerbaijan', 'Kyrgyzstan', 'Switzerland',
       'Falkland Islands', 'Senegal,EUR', 'Ethiopia', 'Asia-Pacific',
       'Nepal,China', 'Mars', 'Democratic Repub

In [26]:
event_Countries = []

for i in oldData['Country'].unique():
    if "," in i:
        for j in i.split(","):
            event_Countries.append(j)
    else:
        event_Countries.append(i)
event_Countries = list(set(event_Countries))
print(event_Countries)

['Niger', 'Pakistan', 'Iran', 'Kazakhstan', 'Africa', 'U.S.', 'Philippines', 'International Space Station', 'Mars', 'Korean', 'North Korea', 'Libya', 'China', 'Afghanistan', 'Azerbaijan', 'Italy', 'Jupiter', 'Yemen', 'Colombia and Venezuela', 'South Sudan', 'Senegal', 'Belarus', 'Iraq', 'Kenya', 'Democratic Republic of the Congo', 'Bulgaria', 'Tunisia', 'Caribbean', 'Vietnam', 'International release', 'South Africa', 'Asia-Pacific', 'Syria', 'International', 'Kyrgyzstan', 'Israel', 'Bahrain', 'Bangladesh', 'Croatia', 'EUR', 'Venezuela', 'Falkland Islands', 'Cuba', 'Switzerland', 'Ethiopia', 'Chad', 'Haiti', 'Barbados', 'Nigeria', 'U.K.', 'Somalia', 'Singapore', 'New Zealand', 'Saudi Arabia', 'Thailand', 'Turkey', 'Malaysia', 'Mexico', 'Japan', 'Egypt', 'and others', 'Lebanon', 'Qatar', 'Indonesia', 'India', 'Australia', 'Russia', 'Nepal', 'Colombia', 'International waters', 'Tajikistan', 'Armenia', 'Norway']


In [27]:
oldData.to_csv('RawData/updatedTimeline.csv', index=False)

### ------------------------------------Time Series Filtering Ended------------------------------------

In [28]:
data_selected_df

Unnamed: 0,Algeria,Australia,Botswana,Brazil,Brunei,Canada,China,Czech Republic,Denmark,EUR,...,Singapore,South Africa,Sweden,Switzerland,Thailand,Trinidadian,U.A.E.,U.K.,U.S.,Uruguay
0,0.006710,0.520544,0.064140,0.184561,0.506341,0.515910,0.110659,0.029066,0.105253,0.785392,...,0.506341,0.046369,0.085704,0.721685,0.019929,0.112887,0.196236,1.064440,0.720675,0.024011
1,0.006723,0.522630,0.064223,0.179481,0.508786,0.518022,0.111183,0.028827,0.104409,0.778943,...,0.508786,0.045834,0.085161,0.717691,0.020065,0.112887,0.197377,1.063310,0.724868,0.024011
2,0.006723,0.516299,0.063713,0.180693,0.506380,0.514650,0.110623,0.028768,0.104368,0.778616,...,0.506380,0.045834,0.084140,0.718581,0.020014,0.112829,0.197368,1.062820,0.724833,0.023704
3,0.006725,0.508623,0.063135,0.179236,0.503045,0.513742,0.109555,0.029042,0.105244,0.785075,...,0.503045,0.044869,0.084140,0.722156,0.019900,0.112381,0.196698,1.052210,0.722373,0.023704
4,0.006717,0.510064,0.063352,0.178473,0.503849,0.510471,0.109636,0.029042,0.105173,0.784564,...,0.503849,0.045201,0.085012,0.722946,0.019936,0.112734,0.196697,1.054800,0.722368,0.023704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1480,0.005148,0.512476,0.060994,0.125551,0.523769,0.556632,0.112212,0.032066,0.108943,0.810152,...,0.523769,0.045137,0.079100,0.777566,0.021421,0.105541,0.195082,0.950087,0.715050,0.016118
1481,0.005150,0.509316,0.060897,0.125583,0.523903,0.553576,0.112324,0.032014,0.108607,0.807641,...,0.523903,0.045197,0.078342,0.775660,0.021391,0.105821,0.195082,0.945806,0.716438,0.016159
1482,0.005151,0.509225,0.061210,0.124794,0.524090,0.553808,0.112359,0.032035,0.108741,0.808617,...,0.524090,0.045170,0.078409,0.776261,0.021255,0.105660,0.194937,0.948326,0.715907,0.016121
1483,0.005145,0.510355,0.060888,0.125096,0.524203,0.556148,0.112259,0.032027,0.108735,0.808570,...,0.524203,0.045058,0.078187,0.775299,0.021188,0.105660,0.194822,0.952668,0.715485,0.016121


In [29]:
final_selected_features = []

for i in data_selected_df:
    if i in event_Countries:
        final_selected_features.append(i)
print(len(final_selected_features))

19


In [30]:
print(final_selected_features)

['Australia', 'China', 'EUR', 'India', 'Israel', 'Malaysia', 'Mexico', 'New Zealand', 'Norway', 'Philippines', 'Qatar', 'Russia', 'Saudi Arabia', 'Singapore', 'South Africa', 'Switzerland', 'Thailand', 'U.K.', 'U.S.']


In [31]:
final_df = data[final_selected_features]
final_df

Unnamed: 0_level_0,Australia,China,EUR,India,Israel,Malaysia,Mexico,New Zealand,Norway,Philippines,Qatar,Russia,Saudi Arabia,Singapore,South Africa,Switzerland,Thailand,U.K.,U.S.
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1/4/16,0.520544,0.110659,0.785392,0.010843,0.184175,0.166688,0.041530,0.489286,0.081409,0.015295,0.197988,0.009882,0.192180,0.506341,0.046369,0.721685,0.019929,1.064440,0.720675
1/5/16,0.522630,0.111183,0.778943,0.010893,0.184539,0.166924,0.041790,0.489286,0.081241,0.015382,0.199140,0.009503,0.193298,0.508786,0.045834,0.717691,0.020065,1.063310,0.724868
1/6/16,0.516299,0.110623,0.778616,0.010864,0.183921,0.165885,0.041559,0.485131,0.080916,0.015407,0.199130,0.009503,0.193289,0.506380,0.045834,0.718581,0.020014,1.062820,0.724833
1/7/16,0.508623,0.109555,0.785075,0.010796,0.183483,0.163655,0.040912,0.479945,0.080956,0.015386,0.198454,0.009503,0.192633,0.503045,0.044869,0.722156,0.019900,1.052210,0.722373
1/8/16,0.510064,0.109636,0.784564,0.010835,0.184184,0.165283,0.040292,0.478930,0.081042,0.015347,0.198453,0.009503,0.192631,0.503849,0.045201,0.722946,0.019936,1.054800,0.722368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12/17/21,0.512476,0.112212,0.810152,0.009387,0.229551,0.169825,0.034487,0.486091,0.079596,0.014262,0.196824,0.009698,0.191050,0.523769,0.045137,0.777566,0.021421,0.950087,0.715050
12/20/21,0.509316,0.112324,0.807641,0.009428,0.227296,0.169531,0.034550,0.482449,0.079247,0.014339,0.196824,0.009643,0.191050,0.523903,0.045197,0.775660,0.021391,0.945806,0.716438
12/21/21,0.509225,0.112359,0.808617,0.009471,0.226266,0.169989,0.034477,0.480517,0.079676,0.014331,0.196678,0.009698,0.190909,0.524090,0.045170,0.776261,0.021255,0.948326,0.715907
12/22/21,0.510355,0.112259,0.808570,0.009465,0.226062,0.169949,0.034477,0.483525,0.080365,0.014334,0.196562,0.009696,0.190796,0.524203,0.045058,0.775299,0.021188,0.952668,0.715485


In [32]:
final_df.to_csv('RawData/Final_filtered_features.csv', index=True)