In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import glob
from pathlib import Path
import numpy as np

## Map countries to remove duplication

Countries must be mapped as there are multiple representations of each country in the dataset.

In [47]:
country_mapping = {
    'CA': 'Canada',
    'AU': 'Australia',
    'CI': "Cote D'ivoire",
    'PK': 'Pakistan',
    'AF': 'Afghanistan',
    'KE': 'Kenya',
    'MQ': 'Martinique',
    'GB': 'United Kingdom',
    'SK': 'Slovakia',
    'CZ': 'Czech Republic',
    'DE': 'Germany',
    'FR': 'France',
    'HR': 'Croatia',
    'PL': 'Poland',
    'ZA': 'South Africa',
    'CH': 'Switzerland',
    'NG': 'Nigeria',
    'SE': 'Sweden',
    'IE': 'Ireland',
    'ES': 'Spain',
    'MX': 'Mexico',
    'IN': 'India',
    'LT': 'Lithuania',
    'GH': 'Ghana',
    'BD': 'Bangladesh',
    'SI': 'Slovenia',
    'HK': 'Hong Kong',
    'NO': 'Norway',
    'NL': 'Netherlands',
    "Cote D'ivoire": "Cote D'ivoire",
    'DO': 'Dominican Republic',
    'PT': 'Portugal',
    'PH': 'Philippines',
    'MY': 'Malaysia',
    'SG': 'Singapore',
    'AT': 'Austria',
    'IT': 'Italy',
    'DK': 'Denmark',
    'TR': 'Turkey',
    'BR': 'Brazil',
    'MA': 'Morocco',
    'TH': 'Thailand',
    'NZ': 'New Zealand',
    'JM': 'Jamaica',
    'KW': 'Kuwait',
    'VN': 'Vietnam',
    'QA': 'Qatar',
    'PE': 'Peru',
    'AL': 'Albania',
    'BB': 'Barbados',
    'RO': 'Romania',
    'BN': 'Brunei',
    'BH': 'Bahrain',
    'U.A.E.': 'United Arab Emirates',
    'RU': 'Russia',
    'CL': 'Chile',
    'MV': 'Maldives',
    'MT': 'Malta',
    'PG': 'Papua New Guinea',
    'UZ': 'Uzbekistan',
    'BG': 'Bulgaria',
    'CO': 'Colombia',
    'LR': 'Liberia',
    'AG': 'Antigua and Barbuda',
    'ZM': 'Zambia',
    'MU': 'Mauritius',
    'AI': 'Anguilla',
    'SA': 'Saudi Arabia',
    'TZ': 'Tanzania',
    'EE': 'Estonia',
    'KR': 'South Korea',
    'KH': 'Cambodia',
    'KY': 'Cayman Islands',
    'JO': 'Jordan',
    'BT': 'Bhutan',
    'MD': 'Moldova',
    'HT': 'Haiti',
    'VC': 'Saint Vincent and The Grenadine',
    'DM': 'Dominica',
    'ME': 'Montenegro',
    'MK': 'North Macedonia',
    'MZ': 'Mozambique',
    'HU': 'Hungary',
    'MN': 'Mongolia',
    'IS': 'Iceland',
    'AR': 'Argentina',
    'EC': 'Ecuador',
    'GU': 'Guam',
    'AW': 'Aruba',
    'PF': 'French Polynesia',
    'LS': 'Lesotho',
    'LU': 'Luxembourg',
    'KN': 'Saint Kitts and Nevis',
    'SX': 'Netherlands',
    'Sint Maarten (Dutch part)' : 'Netherlands',
    'VI': 'Virgin Islands (U.S.)',
    'PY': 'Paraguay',
    'FJ': 'Fiji',
    'GD': 'Grenada',
    'GY': 'Guyana',
    'MF': 'France',
    'TW': 'Taiwan',
    'MW': 'Malawi',
    'GF': 'French Guiana',
    'BJ': 'Benin',
    'GP': 'Guadeloupe',
    'TC': 'Turks and Caicos Islands',
    'BZ': 'Belize',
    'SV': 'El Salvador',
    'BO': 'Bolivia',
    'SN': 'Senegal',
    'GN': 'Guinea',
    'CR': 'Costa Rica',
    'ML': 'Mali',
    'BM': 'Bermuda',
    'BQ': 'Bonaire, Sint Eustatius and Saba',
    'AD': 'Andorra',
    'OM': 'Oman',
    'PM': 'Saint Pierre and Miquelon',
    'RE': 'Reunion',
    'LC': 'Saint Lucia',
    'NC': 'New Caledonia',
    'Kosovo': 'Kosovo',
    'Japan': 'Japan',
    'China': 'China',
    'Venezuela': 'Venezuela',
    'Trinidad and Tobago': 'Trinidad and Tobago',
    'United+Kingdom': 'United Kingdom',
    'Hong+Kong': 'Hong Kong',
    'South+Africa': 'South Africa',
    'Puerto Rico': 'Puerto Rico',
    'Sao Tome and Principe': 'Sao Tome and Principe',
    'French Polynesi': 'French Polynesia',
    'United Arab Emi': 'United Arab Emirates',
    'Hong Kong S.A.R.': 'Hong Kong',
    'Runion--trunc': 'Reunion',
    'Virgin Islands': 'Virgin Islands (U.S.)',
    'Iran': 'Iran',
    'Bosnia and Herz': 'Bosnia and Herzegovina',
    'Saint Vincent a': 'Saint Vincent and The Grenadine',
    'Papua New Guine': 'Papua New Guinea',
    'Myanmar': 'Myanmar',
    'Zimbabwe': 'Zimbabwe',
    'Sierra Leone': 'Sierra Leone',
    'Syria': 'Syria',
    'Trinidad and To': 'Trinidad and Tobago',
    'Palestine': 'Palestine',
    'Burundi': 'Burundi',
    'Laos': 'Laos',
    'Dominican Repub': 'Dominican Republic',
    'United States M': 'United States',
    'Antigua and Bar': 'Antigua and Barbuda',
    'Palau': 'Palau',
    'Djibouti': 'Djibouti',
    'Curaçao': 'Curacao',
    'Swaziland': 'Eswatini',
    'Palestinian Authority': 'Palestine',
    'Mauritania': 'Mauritania',
    'Monaco': 'Monaco',
    'Åland Islands': 'Aland Islands',
    'Gibraltar': 'Gibraltar',
    'Armenia': 'Armenia',
    'Seychelles': 'Seychelles',
    'Paraguay': 'Paraguay',
    'Equatorial Guin': 'Equatorial Guinea',
    'Kyrgyzstan': 'Kyrgyzstan',
    'Faroe Islands': 'Faroe Islands',
    'Antarctica': 'Antarctica',
    'United States': 'United States',
    'Wallis and Futu': 'Wallis and Futuna',
    'Eritrea': 'Eritrea',
    'Turks and Caico': 'Turks and Caicos Islands',
    'land Islands--trunc': 'land Islands',  # Truncated, assuming Åland Islands
    'Congo (Republic': 'Republic of the Congo',
    'Chad': 'Chad',
    'Samoa': 'Samoa',
    'Sao Tome and Pr': 'Sao Tome and Principe',
    'Tonga': 'Tonga',
    'Tahiti': 'French Polynesia',  # Assuming Tahiti is part of French Polynesia
    'Åland Islands': 'Aland Islands',
    'Nauru': 'Nauru',
    'Taiwan': 'Taiwan',
    'Solomon Islands': 'Solomon Islands',
    'Comoros': 'Comoros',
    'NigeriaHouse no 3 Pre': 'Nigeria',  # Unclear data, assuming Nigeria
    'Tajikistan': 'Tajikistan',
    'Nicaragua': 'Nicaragua',
    'Curaçao': 'Curacao',
    'Kiribati': 'Kiribati',
    'Saint Martin (F': 'Saint Martin',
    'Bonaire, Sint E': 'Bonaire, Sint Eustatius and Saba',
    'Saint Kitts and': 'Saint Kitts and Nevis',  # Incomplete entry, assuming Nevis
    'South Georgia a': 'South Georgia and The South Sandwich Islands',  # Incomplete entry, assuming South Sandwich Islands
    'British Indian': 'British Indian Ocean Territory',
    'Guernsey': 'Guernsey',
    'Liechtenstein': 'Liechtenstein',
    'Greenland': 'Greenland',
    'French Southern': 'France',
    'YT' : 'France',
    'Korea': 'South Korea',  # Assuming South Korea
    'BS': 'The Bahamas',
    'ID': 'Indonesia',
    'RÃ©union': 'France',  # Assuming Réunion is part of France
    'AE': 'United Arab Emirates'
}

In [48]:
# Read the CSV file into a DataFrame
login_dataframe = pd.read_csv("Dataset/login.csv")

# Remap the 'country' column using the country_mapping dictionary
login_dataframe['country'] = login_dataframe['country'].map(country_mapping).fillna(login_dataframe['country'])
login_dataframe['reg_date'] = pd.to_datetime(login_dataframe['reg_date'], unit='s')

login_dataframe.head()

Unnamed: 0,login,country,account_currency,reg_date
0,457547,Romania,EUR,2021-02-25 00:15:32
1,474589,Canada,CAD,2021-01-07 02:44:02
2,504321,Canada,CAD,2020-10-14 02:31:50
3,504322,Canada,USD,2020-10-15 04:35:45
4,504326,Canada,USD,2020-10-19 07:39:12


In [49]:
login_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40512 entries, 0 to 40511
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   login             40512 non-null  int64         
 1   country           40505 non-null  object        
 2   account_currency  40512 non-null  object        
 3   reg_date          40512 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 1.2+ MB


In [50]:
daily_reports_df = pd.read_csv("Dataset/daily_report.csv")
daily_reports_df['record_time'] = pd.to_datetime(daily_reports_df['record_time'])

## Load trades dataframe and merge with login

Only accounts that have registered within the timeframe of trades.csv are considered. This is because we do not have complete trading data for accounts before that and do not want to misrepresent data which may negatively influence a predictive model.

In [51]:
# Load and prepare the trades data frame
trades_dataframe = pd.read_csv("Dataset/trades.csv")
trades_dataframe['open_time'] = pd.to_datetime(trades_dataframe['open_time'], unit='s')
trades_dataframe['close_time'] = pd.to_datetime(trades_dataframe['close_time'], unit='s')


In [52]:
# Range of trades_dataframe
min_trade_date1 = trades_dataframe['open_time'].min()
max_trade_date1 = trades_dataframe['close_time'].max()
print(max_trade_date1 - min_trade_date1)

422 days 23:55:32


In [53]:
trades_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521777 entries, 0 to 4521776
Data columns (total 16 columns):
 #   Column       Dtype         
---  ------       -----         
 0   ticket       int64         
 1   login        int64         
 2   symbol       object        
 3   cmd          int64         
 4   volume       float64       
 5   open_time    datetime64[ns]
 6   open_price   float64       
 7   close_time   datetime64[ns]
 8   close_price  float64       
 9   tp           float64       
 10  sl           float64       
 11  reason       int64         
 12  commission   float64       
 13  swaps        float64       
 14  profit       float64       
 15  volume_usd   float64       
dtypes: datetime64[ns](2), float64(9), int64(4), object(1)
memory usage: 552.0+ MB


In [54]:
# Check no. unique accounts in trades
trades_dataframe['login'].nunique()

11976

In [55]:
# Identify the minimum and maximum trade dates
min_trade_date = trades_dataframe['open_time'].min()
max_trade_date = trades_dataframe['close_time'].max()

# Filter the login dataframe to include only accounts registered within the trade dates range
filtered_login_dataframe = login_dataframe[(login_dataframe['reg_date'] >= min_trade_date) & (login_dataframe['reg_date'] <= max_trade_date)]

# Merge the filtered login data with the trades data
merged_df = pd.merge(trades_dataframe, filtered_login_dataframe, on='login', how='inner')

# Combine Daily Charts

In [56]:
daily_charts = glob.glob("Dataset/daily_chart/*.csv")
all_charts = pd.DataFrame()

for chart in daily_charts:
    name = Path(chart).stem
    csv = pd.read_csv(chart, index_col='date', parse_dates=True)
    all_charts[name] = csv['close']

Convert all rates to be from AccountCurrency to USD and rename columns


In [57]:
usd_columns = [col for col in all_charts.columns if col.startswith('USD')]
all_charts_USD = all_charts.copy()
all_charts_USD[usd_columns] = 1 / all_charts_USD[usd_columns]
all_charts_USD.columns = [col.replace('USD', '') for col in all_charts_USD.columns]
all_charts_USD['USD'] = float(1)
all_charts_USD

Unnamed: 0_level_0,AUD,EUR,GBP,NZD,CAD,CHF,CNH,HKD,HUF,JPY,MXN,NOK,PLN,SEK,SGD,THB,TRY,ZAR,USD
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2023-01-03,0.67233,1.05472,1.19663,0.62510,0.732032,1.068810,0.144454,0.128003,0.002637,0.007633,0.051531,0.099209,0.225695,0.094656,0.743572,0.029099,0.053501,0.058799,1.0
2023-01-04,0.68296,1.06030,1.20527,0.62923,0.741988,1.076403,0.144975,0.127946,0.002682,0.007541,0.051634,0.099242,0.227394,0.095211,0.746798,0.029495,0.053402,0.059273,1.0
2023-01-05,0.67513,1.05207,1.18981,0.62292,0.737039,1.068079,0.145178,0.127996,0.002658,0.007496,0.051754,0.097625,0.224621,0.093806,0.743378,0.029386,0.053327,0.058268,1.0
2023-01-06,0.68674,1.06437,1.20924,0.63485,0.743859,1.079319,0.146431,0.128114,0.002702,0.007573,0.052284,0.100017,0.227017,0.095164,0.749299,0.029684,0.053385,0.058519,1.0
2023-01-09,0.69091,1.07288,1.21725,0.63708,0.747010,1.085882,0.147518,0.128157,0.002709,0.007583,0.052259,0.100845,0.228689,0.096313,0.751704,0.029948,0.053372,0.059013,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02-23,0.65621,1.08190,1.26710,0.61912,0.740960,1.134984,0.138903,0.127915,0.002827,0.006644,0.058499,0.094905,,0.096900,0.745773,0.027824,0.032938,0.051877,1.0
2024-02-26,0.65397,1.08503,1.26843,0.61720,0.740505,1.136454,0.138720,0.127817,0.002789,0.006636,0.058492,0.095128,,0.097266,0.743871,0.027873,0.032178,0.051857,1.0
2024-02-27,0.65426,1.08435,1.26837,0.61698,0.739262,1.138265,0.138654,0.127813,0.002775,0.006644,0.058609,0.094991,,0.097055,0.744120,0.027882,0.032107,0.052390,1.0
2024-02-28,0.64935,1.08361,1.26595,0.60958,0.736469,1.137864,0.138644,0.127753,0.002756,0.006636,0.058509,0.094397,,0.096729,0.742876,0.027814,0.032052,0.051869,1.0


# Converting Account Currency to USD

In [58]:
login_trades_rates = merged_df
login_trades_rates['trade_date'] = pd.to_datetime(pd.to_datetime(login_trades_rates['open_time']).dt.date)

all_charts_USD_stacked = all_charts_USD.stack()
login_trades_rates = pd.merge(login_trades_rates, all_charts_USD_stacked.rename('usd_rate'), how='left', left_on=['trade_date', 'account_currency'], right_index=True)

conversion_columns = ['commission', 'swaps', 'profit']
login_trades_USD = login_trades_rates

for column in conversion_columns:
    login_trades_USD[column] = login_trades_USD [column] * login_trades_USD['usd_rate']

login_trades_USD

Unnamed: 0,ticket,login,symbol,cmd,volume,open_time,open_price,close_time,close_price,tp,...,reason,commission,swaps,profit,volume_usd,country,account_currency,reg_date,trade_date,usd_rate
0,68880703,7062462,XAUUSD,0,0.01,2024-02-13 17:05:41,1991.35,2024-02-15 16:05:14,2003.01,0.0,...,1,0.0,-1.199330,11.597089,3994.36,Latvia,EUR,2023-12-22 18:51:59,2024-02-13,1.07083
1,68880910,7062462,XAUUSD,0,0.02,2024-02-13 17:08:49,1990.30,2024-02-15 16:05:10,2002.76,0.0,...,1,0.0,-2.409368,24.800423,7986.12,Latvia,EUR,2023-12-22 18:51:59,2024-02-13,1.07083
2,68879706,7062462,XAUUSD,0,0.02,2024-02-13 17:00:10,1995.38,2024-02-15 14:49:53,1998.78,0.0,...,1,0.0,-2.409368,6.778354,7988.32,Latvia,EUR,2023-12-22 18:51:59,2024-02-13,1.07083
3,68873155,7062462,XAUUSD,0,0.01,2024-02-13 16:00:08,2003.21,2024-02-15 16:05:29,2002.88,0.0,...,1,0.0,-1.199330,-0.321249,4006.09,Latvia,EUR,2023-12-22 18:51:59,2024-02-13,1.07083
4,68985063,7062462,DJ30,1,0.01,2024-02-15 10:54:56,38495.30,2024-02-15 10:54:58,38496.60,0.0,...,1,0.0,0.000000,-0.129259,7699.19,Latvia,EUR,2023-12-22 18:51:59,2024-02-15,1.07716
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3124162,69488279,814228,NAS100,0,0.05,2024-02-29 16:44:52,18012.10,2024-02-29 16:49:00,18012.60,18014.0,...,5,0.0,0.000000,0.250000,18012.35,Canada,USD,2023-06-05 19:41:15,2024-02-29,1.00000
3124163,69488610,814228,NAS100,0,0.05,2024-02-29 16:53:26,18025.30,2024-02-29 16:56:00,18028.35,18028.0,...,5,0.0,0.000000,1.530000,18026.82,Canada,USD,2023-06-05 19:41:15,2024-02-29,1.00000
3124164,69490039,814228,NAS100,1,0.05,2024-02-29 17:10:05,18008.40,2024-02-29 17:10:26,17993.75,0.0,...,5,0.0,0.000000,7.330000,18001.08,Canada,USD,2023-06-05 19:41:15,2024-02-29,1.00000
3124165,69492259,814228,NAS100,1,0.05,2024-02-29 17:42:52,17986.35,2024-02-29 17:43:26,17976.50,17976.0,...,5,0.0,0.000000,4.930000,17981.42,Canada,USD,2023-06-05 19:41:15,2024-02-29,1.00000


In [59]:
merged_df = login_trades_USD

## Feature engineering and merging

TODO: justify feature engineering

In [60]:
def infer_tp_sl_hit(row):
    if row['cmd'] == 0:  # Buy trade
        if row['close_price'] >= row['tp'] and row['tp'] > 0:
            return 'tp_hit'
        elif row['close_price'] <= row['sl'] and row['sl'] > 0:
            return 'sl_hit'
    elif row['cmd'] == 1:  # Sell trade
        if row['close_price'] <= row['tp'] and row['tp'] > 0:
            return 'tp_hit'
        elif row['close_price'] >= row['sl'] and row['sl'] > 0:
            return 'sl_hit'
    return 'none'

In [61]:
# Count the number of trades per account
total_trades_per_login = merged_df.groupby('login')['ticket'].count()

# Count the number and percentage of buy trades per account
buy_trades_per_login = merged_df[merged_df['cmd'] == 0].groupby('login')['ticket'].count()
percentage_buys = (buy_trades_per_login / total_trades_per_login * 100).fillna(0)

# Calculate various mean averages
average_volume_per_login = merged_df.groupby('login')['volume'].mean() 
average_volume_usd_per_login = merged_df.groupby('login')['volume_usd'].mean()
average_commission_per_login = merged_df.groupby('login')['commission'].mean()
average_swaps_per_login = merged_df.groupby('login')['swaps'].mean()
average_profit_per_login = merged_df.groupby('login')['profit'].mean()

# Calculate ratio of profitable trades
profitable_trades = merged_df[merged_df['profit'] > 0]
ratio_profitable_trades = profitable_trades.groupby('login').size() / total_trades_per_login.replace(0, pd.NA)
# ratio_profitable_trades = ratio_profitable_trades.fillna(0)

# Calculate profit and loss variability per account
profit_loss_variability = merged_df.groupby('login')['profit'].std()
# profit_loss_variability = profit_loss_variability.fillna(0)

# Calculate average trade duration per account
merged_df['trade_duration'] = (merged_df['close_time'] - merged_df['open_time']).dt.total_seconds()
average_trade_duration = merged_df.groupby('login')['trade_duration'].mean()

# Calculate average DPM per account
merged_df['DPM'] = merged_df['profit'] / (merged_df['volume_usd'] / 1e6)  # Converting volume from USD to million USD
average_dpm_per_login = merged_df.groupby('login')['DPM'].mean()

# Find the most common reason per account
reason_per_login = merged_df.groupby('login')['reason'].apply(lambda x: x.value_counts().idxmax())

# Trading Product Diversity: Total unique symbols traded per account
unique_symbols_traded = merged_df.groupby('login')['symbol'].nunique()

# Peak Trading Times: Most frequent trading hour per account
merged_df['trade_hour'] = merged_df['open_time'].dt.hour
peak_trading_times = merged_df.groupby('login')['trade_hour'].agg(lambda x: x.value_counts().idxmax())


# TP/SL calculations
# Add column to dataset whether tp or sl has been hit
merged_df['tp_sl_hit'] = merged_df.apply(infer_tp_sl_hit, axis=1)

# Calculate the TP and SL hit frequencies for each account
tp_hits = merged_df[merged_df['tp_sl_hit'] == 'tp_hit'].groupby('login').size()
sl_hits = merged_df[merged_df['tp_sl_hit'] == 'sl_hit'].groupby('login').size()

# Calculate the TP/SL hit frequency ratio (ensure no division by zero)
# Replace 0 with a small number to avoid division by zero or use np.where to handle 0 cases
tp_sl_hit_frequency_ratio = (tp_hits / sl_hits.replace(0, 1)).fillna(0)

# Calculate the average profit for trades where TP was hit
average_profit_tp = merged_df[merged_df['tp_sl_hit'] == 'tp_hit'].groupby('login')['profit'].mean()

# Calculate the average loss for trades where SL was hit
average_loss_sl = merged_df[merged_df['tp_sl_hit'] == 'sl_hit'].groupby('login')['profit'].mean()

# Calculate the Reward-to-Risk Ratio
# Note: Ensure no division by zero
reward_to_risk_ratio = (average_profit_tp / -average_loss_sl.replace(0, pd.NA)).fillna(0)


# Compile these metrics into a single dataframe
result_dataframe = pd.DataFrame({
    'Total_Trades': total_trades_per_login,
    'Buy_Percentage': percentage_buys,
    'Average_Volume': average_volume_per_login,
    'Average_Volume_USD': average_volume_usd_per_login,
    'Average_DPM': average_dpm_per_login,
    'Unique_Symbols_Traded': unique_symbols_traded,
    'Peak_Trading_Times': peak_trading_times,
    'Ratio_Profitable_Trades': ratio_profitable_trades,
    'Profit_Loss_Variability': profit_loss_variability,
    'Average_Trade_Duration': average_trade_duration,
    'TP/SL Hit Ratio': tp_sl_hit_frequency_ratio,
    'Reward_Risk_Ratio': reward_to_risk_ratio,
    'Most_Common_Trading_Method': reason_per_login,
    'Average_Commission': average_commission_per_login,
    'Average_Swaps': average_swaps_per_login,
    'Average_Profit': average_profit_per_login
})

In [62]:
final_dataset = pd.merge(login_dataframe, result_dataframe, on='login', how='inner')

In [63]:
final_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8167 entries, 0 to 8166
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   login                       8167 non-null   int64         
 1   country                     8167 non-null   object        
 2   account_currency            8167 non-null   object        
 3   reg_date                    8167 non-null   datetime64[ns]
 4   Total_Trades                8167 non-null   int64         
 5   Buy_Percentage              8167 non-null   float64       
 6   Average_Volume              8167 non-null   float64       
 7   Average_Volume_USD          8167 non-null   float64       
 8   Average_DPM                 8167 non-null   float64       
 9   Unique_Symbols_Traded       8167 non-null   int64         
 10  Peak_Trading_Times          8167 non-null   int32         
 11  Ratio_Profitable_Trades     7373 non-null   float64     

In [64]:
# Print basic statistics for numerical features
result_dataframe.describe()

Unnamed: 0,Total_Trades,Buy_Percentage,Average_Volume,Average_Volume_USD,Average_DPM,Unique_Symbols_Traded,Peak_Trading_Times,Ratio_Profitable_Trades,Profit_Loss_Variability,Average_Trade_Duration,TP/SL Hit Ratio,Reward_Risk_Ratio,Most_Common_Trading_Method,Average_Commission,Average_Swaps,Average_Profit
count,8167.0,8167.0,8167.0,8167.0,8167.0,8167.0,8167.0,7373.0,7601.0,8167.0,5823.0,5823.0,8167.0,8167.0,8167.0,8167.0
mean,382.535448,46.059584,0.364016,191021300.0,-110.468719,6.827599,13.742011,0.560796,199.168592,70298.49,2.644592,1.795638,2.568507,-0.370851,-0.013859,-6.158282
std,1279.210863,25.527246,1.677237,1580669000.0,3222.543512,7.976509,4.653871,0.222822,2418.100361,208748.7,29.215382,224.755593,2.312406,3.267827,28.043478,1252.739515
min,1.0,0.0,0.005,1.04,-109247.458671,1.0,0.0,0.006477,0.0,0.0,0.0,-8311.30564,0.0,-108.598893,-1173.06258,-88958.301765
25%,13.0,32.535721,0.025883,23005.25,-302.590492,1.0,10.0,0.4,8.360587,5207.117,0.0,0.0,0.0,0.0,-0.090676,-8.587346
50%,55.0,48.247978,0.066667,433612.0,-47.344006,4.0,15.0,0.566038,26.270862,19141.99,0.012821,0.0,1.0,0.0,-0.001241,-1.070682
75%,249.0,59.223301,0.221082,28193480.0,63.607838,9.0,17.0,0.71875,81.362847,63090.6,0.4,1.059834,5.0,0.0,0.0,0.880951
max,46509.0,100.0,52.016471,68274630000.0,112017.955519,80.0,23.0,1.0,125091.168209,6793694.0,1245.0,9557.708333,6.0,0.0,1045.468282,25268.0


In [65]:
nan_counts = final_dataset.isna().sum()
print("Count of NaN values per column:")
print(nan_counts)

Count of NaN values per column:
login                            0
country                          0
account_currency                 0
reg_date                         0
Total_Trades                     0
Buy_Percentage                   0
Average_Volume                   0
Average_Volume_USD               0
Average_DPM                      0
Unique_Symbols_Traded            0
Peak_Trading_Times               0
Ratio_Profitable_Trades        794
Profit_Loss_Variability        566
Average_Trade_Duration           0
TP/SL Hit Ratio               2344
Reward_Risk_Ratio             2344
Most_Common_Trading_Method       0
Average_Commission               0
Average_Swaps                    0
Average_Profit                   0
dtype: int64


In [66]:
final_dataset

Unnamed: 0,login,country,account_currency,reg_date,Total_Trades,Buy_Percentage,Average_Volume,Average_Volume_USD,Average_DPM,Unique_Symbols_Traded,Peak_Trading_Times,Ratio_Profitable_Trades,Profit_Loss_Variability,Average_Trade_Duration,TP/SL Hit Ratio,Reward_Risk_Ratio,Most_Common_Trading_Method,Average_Commission,Average_Swaps,Average_Profit
0,524974,Switzerland,USD,2023-05-17 03:13:02,143,48.251748,0.046364,1.816077e+04,96.362040,1,17,0.979021,1.805020,4123.216783,0.000000,0.000000,1,0.000000,-0.022238,1.667203
1,524978,Austria,EUR,2023-06-07 05:58:36,1392,47.485632,1.230632,4.044965e+08,-3.415139,30,17,0.762931,316.119097,36404.811782,3.375000,0.086763,1,-6.935588,-1.481077,25.668214
2,524979,France,USD,2023-06-07 06:17:30,2194,49.635369,0.013943,6.724644e+03,-264.410462,4,17,0.718323,11.676818,57056.876937,0.087081,-0.549598,5,0.000000,-0.111285,-0.837867
3,524984,Singapore,USD,2023-10-31 08:34:08,244,38.934426,0.104262,1.836608e+06,-53.917476,6,16,0.606557,42.988956,20359.959016,1.923077,-0.155076,1,0.000000,-0.073893,-0.697910
4,760487,Singapore,SGD,2023-01-04 08:48:24,69,15.942029,0.012609,2.783882e+03,-3403.656636,1,17,0.463768,12.396847,46984.231884,0.000000,0.000000,1,0.000000,0.002172,-10.232797
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8162,88945036,Nigeria,USD,2023-05-01 04:36:26,3555,51.195499,0.145440,1.789552e+04,78.987175,4,16,0.801688,56.167597,82311.890295,0.000000,0.000000,1,-1.018003,-0.249108,5.763356
8163,88945038,Singapore,SGD,2023-05-05 11:25:20,1306,58.805513,0.078913,1.824879e+04,140.220508,8,17,0.882083,63.497096,77044.003063,53.785714,0.225533,1,-0.700736,-0.125105,-0.125379
8164,1000054825,Singapore,SGD,2023-09-25 12:27:21,829,32.086852,0.039011,2.437941e+07,10.230354,8,19,0.872135,120.096005,28549.583836,74.000000,1083.230382,5,0.000000,-0.268097,-4.613569
8165,1000054838,Saudi Arabia,USD,2023-09-26 17:11:57,4125,56.533333,0.105981,4.312099e+04,-48.437138,2,15,0.684606,48.027839,3697.880242,0.043292,-1.329755,1,0.000000,-0.026504,-2.182613


## Longevity calculations and merging

Longevity is defined as close time of last trade minus open time of first trade for a given account (definition confirmed by Carol). 

For longevity, it's also important to consider whether an account is actively trading or is inactive. An account is considered 'active' if it trades within the mosty recent month of the dataset, and inactive if it doesn't. Accounts that have registered in the most recent month of the dataset have been removed to avoid confusion.

We do not want to label an account that becomes inactive after 1 month the same way as an account that has actively traded for 1 month, but is still ongoing. We can't define longevity for these 'active' accounts as we do not know when they will stop trading. Therefore, it stands to reason that due to having a known duration, 'inactive' accounts are better indicators of longevity, at least in the short term.

Because of this, the dataset is binned into 5 categories, where only 'inactive' accounts are included up until 360 days (trades.csv has a range of 422 days). For '360+' both active and inactive are included as this bin is open ended.

In [67]:
# Calculate the first open time and last close time for each account
first_open_time_per_login = merged_df.groupby('login')['open_time'].min()
last_close_time_per_login = merged_df.groupby('login')['close_time'].max()

# Calculate longevity as the difference in days between the last close time and first open time
longevity_per_login = (last_close_time_per_login - first_open_time_per_login).dt.days

# Define the most recent month based on the latest trade in the dataset
most_recent_month_start = merged_df['close_time'].max().replace(day=1)

# Determine active accounts (trading in the most recent month)
active_accounts = merged_df[merged_df['close_time'] >= most_recent_month_start].groupby('login').size().index

# Mark accounts as active or inactive
longevity_per_login = longevity_per_login.to_frame(name='longevity')
longevity_per_login['active'] = longevity_per_login.index.isin(active_accounts)

# Exclude accounts that were registered in the most recent month
valid_accounts = login_dataframe[login_dataframe['reg_date'] < most_recent_month_start]

# Bin the longevity values
bins = [-1, 30, 90, 180, 270, 360, float('inf')]
longevity_per_login['longevity_bin'] = pd.cut(longevity_per_login['longevity'], bins=bins, labels=False)

# Exclude active accounts from all bins except for 360+
longevity_per_login = longevity_per_login[(longevity_per_login['active'] == False) | (longevity_per_login['longevity_bin'] == 5)]

In [68]:
# Merge longevity information with result_dataframe
result_dataframe = final_dataset.reset_index().merge(longevity_per_login, on='login', how='inner')

In [69]:
# Calculate trading frequency based on total number of trades and longevity
result_dataframe['Trading_Frequency'] = result_dataframe['Total_Trades'] / result_dataframe['longevity']

# Ensure that any potential divide by zero errors are handled
result_dataframe['Trading_Frequency'] = result_dataframe['Trading_Frequency'].fillna(0)

# Replace inf values with 0
result_dataframe['Trading_Frequency'] = result_dataframe['Trading_Frequency'].replace([np.inf, -np.inf], 0)

In [70]:
result_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5725 entries, 0 to 5724
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   index                       5725 non-null   int64         
 1   login                       5725 non-null   int64         
 2   country                     5725 non-null   object        
 3   account_currency            5725 non-null   object        
 4   reg_date                    5725 non-null   datetime64[ns]
 5   Total_Trades                5725 non-null   int64         
 6   Buy_Percentage              5725 non-null   float64       
 7   Average_Volume              5725 non-null   float64       
 8   Average_Volume_USD          5725 non-null   float64       
 9   Average_DPM                 5725 non-null   float64       
 10  Unique_Symbols_Traded       5725 non-null   int64         
 11  Peak_Trading_Times          5725 non-null   int32       

Filtering accounts in the ways described above has reduced our unique accounts from 8167 to 5724

In [72]:
result_dataframe

Unnamed: 0,index,login,country,account_currency,reg_date,Total_Trades,Buy_Percentage,Average_Volume,Average_Volume_USD,Average_DPM,...,TP/SL Hit Ratio,Reward_Risk_Ratio,Most_Common_Trading_Method,Average_Commission,Average_Swaps,Average_Profit,longevity,active,longevity_bin,Trading_Frequency
0,1,524978,Austria,EUR,2023-06-07 05:58:36,1392,47.485632,1.230632,4.044965e+08,-3.415139,...,3.375000,0.086763,1,-6.935588,-1.481077,25.668214,142,False,2,9.802817
1,2,524979,France,USD,2023-06-07 06:17:30,2194,49.635369,0.013943,6.724644e+03,-264.410462,...,0.087081,-0.549598,5,0.000000,-0.111285,-0.837867,107,False,2,20.504673
2,4,760487,Singapore,SGD,2023-01-04 08:48:24,69,15.942029,0.012609,2.783882e+03,-3403.656636,...,0.000000,0.000000,1,0.000000,0.002172,-10.232797,13,False,0,5.307692
3,5,804664,Malaysia,USD,2023-09-24 07:02:49,85,63.529412,0.018706,5.490400e+03,17.604088,...,,,5,0.000000,-0.023176,-0.524588,9,False,0,9.444444
4,6,804687,Australia,AUD,2023-01-10 10:26:00,484,51.239669,0.019773,5.271755e+05,-45.416080,...,0.002283,55.989328,5,-0.101129,-0.009749,-0.210692,372,False,5,1.301075
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5720,8157,88944953,Canada,CAD,2023-01-12 02:52:40,54,37.037037,0.041667,1.621176e+07,-24.464597,...,,,5,0.000000,-0.005931,-0.134167,381,True,5,0.141732
5721,8159,88944971,Malaysia,USD,2023-02-08 08:52:48,442,46.153846,1.389367,1.249156e+09,-298.590702,...,6.823529,0.195671,1,0.000000,-5.085113,-51.760588,100,False,2,4.420000
5722,8161,88945034,Singapore,USD,2023-04-12 13:01:26,479,40.083507,0.043278,1.522641e+07,-70.501778,...,2.760870,0.623201,1,-0.256347,-0.000271,-1.519937,46,False,1,10.413043
5723,8163,88945038,Singapore,SGD,2023-05-05 11:25:20,1306,58.805513,0.078913,1.824879e+04,140.220508,...,53.785714,0.225533,1,-0.700736,-0.125105,-0.125379,253,False,3,5.162055


In [73]:
result_dataframe['longevity_bin'].value_counts()

longevity_bin
0    3575
1    1216
2     595
3     214
5      69
4      56
Name: count, dtype: int64

Distribution of longevity. Be aware that 5 appears before 4, because there are more accounts in that bin (most likely due to including both active and inactive accounts)

In [74]:
# Filter rows where 'longevity_bin' is 5
filtered_df = result_dataframe[result_dataframe['longevity_bin'] == 5]

# Count the occurrences of True and False in the 'active' column for the filtered rows
active_counts = filtered_df['active'].value_counts()

# Display the counts
print(active_counts)

active
True     63
False     6
Name: count, dtype: int64


Distribution of '360+' days by activity status.

## Merge result_dataframe with daily_report

In [75]:
# Group by 'login' and the month of 'record_time' to prepare for aggregation
daily_reports_df['month_year'] = daily_reports_df['record_time'].dt.to_period('M')

In [76]:
# Group by login and month_year and calculate average net_deposit and credit
monthly_averages = daily_reports_df.groupby(['login', 'month_year']).agg(
    average_net_deposit=('net_deposit', 'mean'),
    average_credit=('credit', 'mean')
).reset_index()

# Aggregate these monthly averages across all months to get an overall monthly average for each login
overall_monthly_averages = monthly_averages.groupby('login').agg(
    average_net_deposit=('average_net_deposit', 'mean'),
    average_credit=('average_credit', 'mean')
)

In [77]:
result_dataframe = result_dataframe.merge(overall_monthly_averages, on='login', how='left')

In [78]:
result_dataframe

Unnamed: 0,index,login,country,account_currency,reg_date,Total_Trades,Buy_Percentage,Average_Volume,Average_Volume_USD,Average_DPM,...,Most_Common_Trading_Method,Average_Commission,Average_Swaps,Average_Profit,longevity,active,longevity_bin,Trading_Frequency,average_net_deposit,average_credit
0,1,524978,Austria,EUR,2023-06-07 05:58:36,1392,47.485632,1.230632,4.044965e+08,-3.415139,...,1,-6.935588,-1.481077,25.668214,142,False,2,9.802817,118.543572,0.0
1,2,524979,France,USD,2023-06-07 06:17:30,2194,49.635369,0.013943,6.724644e+03,-264.410462,...,5,0.000000,-0.111285,-0.837867,107,False,2,20.504673,7.316443,0.0
2,4,760487,Singapore,SGD,2023-01-04 08:48:24,69,15.942029,0.012609,2.783882e+03,-3403.656636,...,1,0.000000,0.002172,-10.232797,13,False,0,5.307692,2.648954,0.0
3,5,804664,Malaysia,USD,2023-09-24 07:02:49,85,63.529412,0.018706,5.490400e+03,17.604088,...,5,0.000000,-0.023176,-0.524588,9,False,0,9.444444,0.332571,0.0
4,6,804687,Australia,AUD,2023-01-10 10:26:00,484,51.239669,0.019773,5.271755e+05,-45.416080,...,5,-0.101129,-0.009749,-0.210692,372,False,5,1.301075,2.947076,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5720,8157,88944953,Canada,CAD,2023-01-12 02:52:40,54,37.037037,0.041667,1.621176e+07,-24.464597,...,5,0.000000,-0.005931,-0.134167,381,True,5,0.141732,0.178571,0.0
5721,8159,88944971,Malaysia,USD,2023-02-08 08:52:48,442,46.153846,1.389367,1.249156e+09,-298.590702,...,1,0.000000,-5.085113,-51.760588,100,False,2,4.420000,1.773400,0.0
5722,8161,88945034,Singapore,USD,2023-04-12 13:01:26,479,40.083507,0.043278,1.522641e+07,-70.501778,...,1,-0.256347,-0.000271,-1.519937,46,False,1,10.413043,0.000000,0.0
5723,8163,88945038,Singapore,SGD,2023-05-05 11:25:20,1306,58.805513,0.078913,1.824879e+04,140.220508,...,1,-0.700736,-0.125105,-0.125379,253,False,3,5.162055,0.000000,0.0


## Final pre-processing

Simple pre-processing including converting less important trading methods to 'Other' and then reverse encoding, filling NaN values, and reording columns.

TODO: final pre-processing

In [79]:
result_dataframe['Most_Common_Trading_Method'].value_counts()

Most_Common_Trading_Method
5    2527
0    1957
1    1088
6     142
3      10
4       1
Name: count, dtype: int64

In [80]:
# Filter out the dataset
result_dataframe['Most_Common_Trading_Method'] = result_dataframe['Most_Common_Trading_Method'].apply(lambda x: x if x in [0, 1, 5] else 7)

# Rename the codes to strings
result_dataframe['Trading_Method'] = result_dataframe['Most_Common_Trading_Method'].map({0: 'Client', 1: 'Expert', 5: 'Mobile', 7:'Other'})
result_dataframe.drop('Most_Common_Trading_Method', axis=1, inplace=True)

In [81]:
# Remove reg_date
# Remove Total_Trades
# Move trading frequency to front, and active to back
result_dataframe.drop(['index', 'reg_date'], axis=1, inplace=True)

In [82]:
nan_counts = result_dataframe.isna().sum()
print("Count of NaN values per column:")
print(nan_counts)

Count of NaN values per column:
login                         0
country                       0
account_currency              0
Total_Trades                  0
Buy_Percentage                0
Average_Volume                0
Average_Volume_USD            0
Average_DPM                   0
Unique_Symbols_Traded         0
Peak_Trading_Times            0
Ratio_Profitable_Trades     756
Profit_Loss_Variability     556
Average_Trade_Duration        0
TP/SL Hit Ratio            1970
Reward_Risk_Ratio          1970
Average_Commission            0
Average_Swaps                 0
Average_Profit                0
longevity                     0
active                        0
longevity_bin                 0
Trading_Frequency             0
average_net_deposit           1
average_credit                1
Trading_Method                0
dtype: int64


In [83]:
result_dataframe.fillna(0, inplace=True)

In [84]:
result_dataframe['country'] = result_dataframe['country'].astype('category')
result_dataframe['account_currency'] = result_dataframe['account_currency'].astype('category')
result_dataframe['Trading_Method'] = result_dataframe['Trading_Method'].astype('category')
result_dataframe['Peak_Trading_Times'] = result_dataframe['Peak_Trading_Times'].astype('category')

In [85]:
cols = list(result_dataframe.columns)

cols.remove('Trading_Frequency')
cols.remove('active')

# Insert 'Trading Frequency' at the new position
cols.insert(3, 'Trading_Frequency')

# Insert 'Active' at the new position
cols.insert(23, 'active')

cols = [col for col in cols if col not in ['longevity', 'longevity_bin']]
cols += ['longevity', 'longevity_bin']

# Reorder the DataFrame based on the new column list
final_dataset = result_dataframe[cols]

In [86]:
final_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5725 entries, 0 to 5724
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   login                    5725 non-null   int64   
 1   country                  5725 non-null   category
 2   account_currency         5725 non-null   category
 3   Trading_Frequency        5725 non-null   float64 
 4   Total_Trades             5725 non-null   int64   
 5   Buy_Percentage           5725 non-null   float64 
 6   Average_Volume           5725 non-null   float64 
 7   Average_Volume_USD       5725 non-null   float64 
 8   Average_DPM              5725 non-null   float64 
 9   Unique_Symbols_Traded    5725 non-null   int64   
 10  Peak_Trading_Times       5725 non-null   category
 11  Ratio_Profitable_Trades  5725 non-null   float64 
 12  Profit_Loss_Variability  5725 non-null   float64 
 13  Average_Trade_Duration   5725 non-null   float64 
 14  TP/SL Hi

In [91]:
final_dataset.describe()

Unnamed: 0,login,Trading_Frequency,Total_Trades,Buy_Percentage,Average_Volume,Average_Volume_USD,Average_DPM,Unique_Symbols_Traded,Ratio_Profitable_Trades,Profit_Loss_Variability,Average_Trade_Duration,TP/SL Hit Ratio,Reward_Risk_Ratio,Average_Commission,Average_Swaps,Average_Profit,average_net_deposit,average_credit,longevity,longevity_bin
count,5725.0,5725.0,5725.0,5725.0,5725.0,5725.0,5725.0,5725.0,5725.0,5725.0,5725.0,5725.0,5725.0,5725.0,5725.0,5725.0,5725.0,5725.0,5725.0,5725.0
mean,4260059.0,6.799319,242.507424,44.600729,0.386369,228386400.0,-145.50304,5.700262,0.47258,159.869197,66282.52,1.763087,1.714086,-0.410879,0.184834,-9.806129,15.091317,57.041656,45.232664,0.63179
std,16545730.0,18.322967,1023.377574,28.085715,1.621218,1838682000.0,3692.263888,6.881594,0.283212,1696.792091,209464.5,27.804664,223.521743,3.629525,31.930857,720.966109,103.767705,242.761659,70.013825,1.011621
min,524978.0,0.0,1.0,0.0,0.005,1.04,-109247.458671,1.0,0.0,0.0,0.0,0.0,-8311.30564,-108.598893,-1173.06258,-38585.491238,-2423.933085,0.0,0.0,0.0
25%,814813.0,0.366667,8.0,26.402189,0.028333,26135.92,-467.261699,1.0,0.285714,5.781964,4441.059,0.0,0.0,0.0,-0.060256,-16.592206,0.303777,0.0,2.0,0.0
50%,821583.0,1.876712,33.0,46.650718,0.078333,366373.8,-71.891849,3.0,0.5,23.995345,15700.0,0.0,0.0,0.0,0.0,-1.78125,2.597403,0.0,15.0,0.0
75%,7056823.0,5.833333,142.0,60.0,0.27,22001970.0,44.698309,7.0,0.666667,83.2799,58411.14,0.090909,0.199083,0.0,0.000486,0.220833,8.539722,2.0,60.0,1.0
max,1000055000.0,491.533333,46509.0,100.0,39.285714,68274630000.0,112017.955519,65.0,1.0,87148.616792,6793694.0,1245.0,9557.708333,0.0,1045.468282,15005.0,3721.574881,8000.0,415.0,5.0


In [88]:
for col in final_dataset.columns:
    # Check if the column is of type 'category'
    if final_dataset[col].dtype.name == 'category':
        # Display the value counts for categorical columns
        print("Value Counts for", col, ":")
        print(final_dataset[col].value_counts())
        print()

Value Counts for country :
country
Indonesia                          1209
Vietnam                             761
France                              478
United Kingdom                      474
Canada                              421
                                   ... 
Russia                                1
Dominica                              1
Saint Lucia                           1
Saint Vincent and The Grenadine       1
Zimbabwe                              1
Name: count, Length: 140, dtype: int64

Value Counts for account_currency :
account_currency
USD    3901
EUR     983
GBP     425
CAD     207
AUD      80
NZD      67
SGD      54
CHF       8
Name: count, dtype: int64

Value Counts for Peak_Trading_Times :
Peak_Trading_Times
16    888
15    819
17    722
10    459
11    317
9     276
18    257
14    238
12    238
13    183
20    153
8     152
19    150
4     135
5     130
21    102
22     86
7      85
3      80
6      73
1      57
23     49
2      44
0      32
Name: count,

In [89]:
final_dataset

Unnamed: 0,login,country,account_currency,Trading_Frequency,Total_Trades,Buy_Percentage,Average_Volume,Average_Volume_USD,Average_DPM,Unique_Symbols_Traded,...,Reward_Risk_Ratio,Average_Commission,Average_Swaps,Average_Profit,average_net_deposit,average_credit,active,Trading_Method,longevity,longevity_bin
0,524978,Austria,EUR,9.802817,1392,47.485632,1.230632,4.044965e+08,-3.415139,30,...,0.086763,-6.935588,-1.481077,25.668214,118.543572,0.0,False,Expert,142,2
1,524979,France,USD,20.504673,2194,49.635369,0.013943,6.724644e+03,-264.410462,4,...,-0.549598,0.000000,-0.111285,-0.837867,7.316443,0.0,False,Mobile,107,2
2,760487,Singapore,SGD,5.307692,69,15.942029,0.012609,2.783882e+03,-3403.656636,1,...,0.000000,0.000000,0.002172,-10.232797,2.648954,0.0,False,Expert,13,0
3,804664,Malaysia,USD,9.444444,85,63.529412,0.018706,5.490400e+03,17.604088,6,...,0.000000,0.000000,-0.023176,-0.524588,0.332571,0.0,False,Mobile,9,0
4,804687,Australia,AUD,1.301075,484,51.239669,0.019773,5.271755e+05,-45.416080,17,...,55.989328,-0.101129,-0.009749,-0.210692,2.947076,0.0,False,Mobile,372,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5720,88944953,Canada,CAD,0.141732,54,37.037037,0.041667,1.621176e+07,-24.464597,8,...,0.000000,0.000000,-0.005931,-0.134167,0.178571,0.0,True,Mobile,381,5
5721,88944971,Malaysia,USD,4.420000,442,46.153846,1.389367,1.249156e+09,-298.590702,14,...,0.195671,0.000000,-5.085113,-51.760588,1.773400,0.0,False,Expert,100,2
5722,88945034,Singapore,USD,10.413043,479,40.083507,0.043278,1.522641e+07,-70.501778,25,...,0.623201,-0.256347,-0.000271,-1.519937,0.000000,0.0,False,Expert,46,1
5723,88945038,Singapore,SGD,5.162055,1306,58.805513,0.078913,1.824879e+04,140.220508,8,...,0.225533,-0.700736,-0.125105,-0.125379,0.000000,0.0,False,Expert,253,3


## Export

In [90]:
final_dataset.to_csv('Dataset/output_dataset.csv', index=False)