In [116]:
import pandas as pd
import matplotlib.pyplot as plt
import glob
from pathlib import Path

## Map countries to remove duplication

Countries must be mapped as there are multiple representations of each country in the dataset.

In [117]:
country_mapping = {
    'CA': 'Canada',
    'AU': 'Australia',
    'CI': "Cote D'ivoire",
    'PK': 'Pakistan',
    'AF': 'Afghanistan',
    'KE': 'Kenya',
    'MQ': 'Martinique',
    'GB': 'United Kingdom',
    'SK': 'Slovakia',
    'CZ': 'Czech Republic',
    'DE': 'Germany',
    'FR': 'France',
    'HR': 'Croatia',
    'PL': 'Poland',
    'ZA': 'South Africa',
    'CH': 'Switzerland',
    'NG': 'Nigeria',
    'SE': 'Sweden',
    'IE': 'Ireland',
    'ES': 'Spain',
    'MX': 'Mexico',
    'IN': 'India',
    'LT': 'Lithuania',
    'GH': 'Ghana',
    'BD': 'Bangladesh',
    'SI': 'Slovenia',
    'HK': 'Hong Kong',
    'NO': 'Norway',
    'NL': 'Netherlands',
    "Cote D'ivoire": "Cote D'ivoire",
    'DO': 'Dominican Republic',
    'PT': 'Portugal',
    'PH': 'Philippines',
    'MY': 'Malaysia',
    'SG': 'Singapore',
    'AT': 'Austria',
    'IT': 'Italy',
    'DK': 'Denmark',
    'TR': 'Turkey',
    'BR': 'Brazil',
    'MA': 'Morocco',
    'TH': 'Thailand',
    'NZ': 'New Zealand',
    'JM': 'Jamaica',
    'KW': 'Kuwait',
    'VN': 'Vietnam',
    'QA': 'Qatar',
    'PE': 'Peru',
    'AL': 'Albania',
    'BB': 'Barbados',
    'RO': 'Romania',
    'BN': 'Brunei',
    'BH': 'Bahrain',
    'U.A.E.': 'United Arab Emirates',
    'RU': 'Russia',
    'CL': 'Chile',
    'MV': 'Maldives',
    'MT': 'Malta',
    'PG': 'Papua New Guinea',
    'UZ': 'Uzbekistan',
    'BG': 'Bulgaria',
    'CO': 'Colombia',
    'LR': 'Liberia',
    'AG': 'Antigua and Barbuda',
    'ZM': 'Zambia',
    'MU': 'Mauritius',
    'AI': 'Anguilla',
    'SA': 'Saudi Arabia',
    'TZ': 'Tanzania',
    'EE': 'Estonia',
    'KR': 'South Korea',
    'KH': 'Cambodia',
    'KY': 'Cayman Islands',
    'JO': 'Jordan',
    'BT': 'Bhutan',
    'MD': 'Moldova',
    'HT': 'Haiti',
    'VC': 'Saint Vincent and The Grenadine',
    'DM': 'Dominica',
    'ME': 'Montenegro',
    'MK': 'North Macedonia',
    'MZ': 'Mozambique',
    'HU': 'Hungary',
    'MN': 'Mongolia',
    'IS': 'Iceland',
    'AR': 'Argentina',
    'EC': 'Ecuador',
    'GU': 'Guam',
    'AW': 'Aruba',
    'PF': 'French Polynesia',
    'LS': 'Lesotho',
    'LU': 'Luxembourg',
    'KN': 'Saint Kitts and Nevis',
    'SX': 'Netherlands',
    'Sint Maarten (Dutch part)' : 'Netherlands',
    'VI': 'Virgin Islands (U.S.)',
    'PY': 'Paraguay',
    'FJ': 'Fiji',
    'GD': 'Grenada',
    'GY': 'Guyana',
    'MF': 'France',
    'TW': 'Taiwan',
    'MW': 'Malawi',
    'GF': 'French Guiana',
    'BJ': 'Benin',
    'GP': 'Guadeloupe',
    'TC': 'Turks and Caicos Islands',
    'BZ': 'Belize',
    'SV': 'El Salvador',
    'BO': 'Bolivia',
    'SN': 'Senegal',
    'GN': 'Guinea',
    'CR': 'Costa Rica',
    'ML': 'Mali',
    'BM': 'Bermuda',
    'BQ': 'Bonaire, Sint Eustatius and Saba',
    'AD': 'Andorra',
    'OM': 'Oman',
    'PM': 'Saint Pierre and Miquelon',
    'RE': 'Reunion',
    'LC': 'Saint Lucia',
    'NC': 'New Caledonia',
    'Kosovo': 'Kosovo',
    'Japan': 'Japan',
    'China': 'China',
    'Venezuela': 'Venezuela',
    'Trinidad and Tobago': 'Trinidad and Tobago',
    'United+Kingdom': 'United Kingdom',
    'Hong+Kong': 'Hong Kong',
    'South+Africa': 'South Africa',
    'Puerto Rico': 'Puerto Rico',
    'Sao Tome and Principe': 'Sao Tome and Principe',
    'French Polynesi': 'French Polynesia',
    'United Arab Emi': 'United Arab Emirates',
    'Hong Kong S.A.R.': 'Hong Kong',
    'Runion--trunc': 'Reunion',
    'Virgin Islands': 'Virgin Islands (U.S.)',
    'Iran': 'Iran',
    'Bosnia and Herz': 'Bosnia and Herzegovina',
    'Saint Vincent a': 'Saint Vincent and The Grenadine',
    'Papua New Guine': 'Papua New Guinea',
    'Myanmar': 'Myanmar',
    'Zimbabwe': 'Zimbabwe',
    'Sierra Leone': 'Sierra Leone',
    'Syria': 'Syria',
    'Trinidad and To': 'Trinidad and Tobago',
    'Palestine': 'Palestine',
    'Burundi': 'Burundi',
    'Laos': 'Laos',
    'Dominican Repub': 'Dominican Republic',
    'United States M': 'United States',
    'Antigua and Bar': 'Antigua and Barbuda',
    'Palau': 'Palau',
    'Djibouti': 'Djibouti',
    'Curaçao': 'Curacao',
    'Swaziland': 'Eswatini',
    'Palestinian Authority': 'Palestine',
    'Mauritania': 'Mauritania',
    'Monaco': 'Monaco',
    'Åland Islands': 'Aland Islands',
    'Gibraltar': 'Gibraltar',
    'Armenia': 'Armenia',
    'Seychelles': 'Seychelles',
    'Paraguay': 'Paraguay',
    'Equatorial Guin': 'Equatorial Guinea',
    'Kyrgyzstan': 'Kyrgyzstan',
    'Faroe Islands': 'Faroe Islands',
    'Antarctica': 'Antarctica',
    'United States': 'United States',
    'Wallis and Futu': 'Wallis and Futuna',
    'Eritrea': 'Eritrea',
    'Turks and Caico': 'Turks and Caicos Islands',
    'land Islands--trunc': 'land Islands',  # Truncated, assuming Åland Islands
    'Congo (Republic': 'Republic of the Congo',
    'Chad': 'Chad',
    'Samoa': 'Samoa',
    'Sao Tome and Pr': 'Sao Tome and Principe',
    'Tonga': 'Tonga',
    'Tahiti': 'French Polynesia',  # Assuming Tahiti is part of French Polynesia
    'Åland Islands': 'Aland Islands',
    'Nauru': 'Nauru',
    'Taiwan': 'Taiwan',
    'Solomon Islands': 'Solomon Islands',
    'Comoros': 'Comoros',
    'NigeriaHouse no 3 Pre': 'Nigeria',  # Unclear data, assuming Nigeria
    'Tajikistan': 'Tajikistan',
    'Nicaragua': 'Nicaragua',
    'Curaçao': 'Curacao',
    'Kiribati': 'Kiribati',
    'Saint Martin (F': 'Saint Martin',
    'Bonaire, Sint E': 'Bonaire, Sint Eustatius and Saba',
    'Saint Kitts and': 'Saint Kitts and Nevis',  # Incomplete entry, assuming Nevis
    'South Georgia a': 'South Georgia and The South Sandwich Islands',  # Incomplete entry, assuming South Sandwich Islands
    'British Indian': 'British Indian Ocean Territory',
    'Guernsey': 'Guernsey',
    'Liechtenstein': 'Liechtenstein',
    'Greenland': 'Greenland',
    'French Southern': 'France',
    'YT' : 'France',
    'Korea': 'South Korea',  # Assuming South Korea
    'BS': 'The Bahamas',
    'ID': 'Indonesia',
    'RÃ©union': 'France',  # Assuming Réunion is part of France
    'AE': 'United Arab Emirates'
}

In [118]:
# Read the CSV file into a DataFrame
login_dataframe = pd.read_csv("Dataset/login.csv")

# Remap the 'country' column using the country_mapping dictionary
login_dataframe['country'] = login_dataframe['country'].map(country_mapping).fillna(login_dataframe['country'])
login_dataframe['reg_date'] = pd.to_datetime(login_dataframe['reg_date'], unit='s')

login_dataframe.head()

Unnamed: 0,login,country,account_currency,reg_date
0,457547,Romania,EUR,2021-02-25 00:15:32
1,474589,Canada,CAD,2021-01-07 02:44:02
2,504321,Canada,CAD,2020-10-14 02:31:50
3,504322,Canada,USD,2020-10-15 04:35:45
4,504326,Canada,USD,2020-10-19 07:39:12


In [119]:
login_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40512 entries, 0 to 40511
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   login             40512 non-null  int64         
 1   country           40505 non-null  object        
 2   account_currency  40512 non-null  object        
 3   reg_date          40512 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 1.2+ MB


## Load trades dataframe and merge with login

Only accounts that have registered within the timeframe of trades.csv are considered. This is because we do not have complete trading data for accounts before that and do not want to misrepresent data which may negatively influence a predictive model.

In [120]:
# Load and prepare the trades data frame
trades_dataframe = pd.read_csv("Dataset/trades.csv")
trades_dataframe['open_time'] = pd.to_datetime(trades_dataframe['open_time'], unit='s')
trades_dataframe['close_time'] = pd.to_datetime(trades_dataframe['close_time'], unit='s')


In [121]:
# Range of trades_dataframe
min_trade_date1 = trades_dataframe['open_time'].min()
max_trade_date1 = trades_dataframe['close_time'].max()
print(max_trade_date1 - min_trade_date1)

422 days 23:55:32


In [122]:
trades_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521777 entries, 0 to 4521776
Data columns (total 16 columns):
 #   Column       Dtype         
---  ------       -----         
 0   ticket       int64         
 1   login        int64         
 2   symbol       object        
 3   cmd          int64         
 4   volume       float64       
 5   open_time    datetime64[ns]
 6   open_price   float64       
 7   close_time   datetime64[ns]
 8   close_price  float64       
 9   tp           float64       
 10  sl           float64       
 11  reason       int64         
 12  commission   float64       
 13  swaps        float64       
 14  profit       float64       
 15  volume_usd   float64       
dtypes: datetime64[ns](2), float64(9), int64(4), object(1)
memory usage: 552.0+ MB


In [123]:
# Check no. unique accounts in trades
trades_dataframe['login'].nunique()

11976

In [124]:
# Identify the minimum and maximum trade dates
min_trade_date = trades_dataframe['open_time'].min()
max_trade_date = trades_dataframe['close_time'].max()

# Filter the login dataframe to include only accounts registered within the trade dates range
filtered_login_dataframe = login_dataframe[(login_dataframe['reg_date'] >= min_trade_date) & (login_dataframe['reg_date'] <= max_trade_date)]

# Merge the filtered login data with the trades data
merged_df = pd.merge(trades_dataframe, filtered_login_dataframe, on='login', how='inner')

# Combine Daily Charts

In [125]:
daily_charts = glob.glob("Dataset/daily_chart/*.csv")
all_charts = pd.DataFrame()

for chart in daily_charts:
    name = Path(chart).stem
    csv = pd.read_csv(chart, index_col='date', parse_dates=True)
    all_charts[name] = csv['close']

Convert all rates to be from AccountCurrency to USD and rename columns


In [126]:
usd_columns = [col for col in all_charts.columns if col.startswith('USD')]
all_charts_USD = all_charts.copy()
all_charts_USD[usd_columns] = 1 / all_charts_USD[usd_columns]
all_charts_USD.columns = [col.replace('USD', '') for col in all_charts_USD.columns]
all_charts_USD

Unnamed: 0_level_0,AUD,EUR,GBP,NZD,CAD,CHF,CNH,HKD,HUF,JPY,MXN,NOK,PLN,SEK,SGD,THB,TRY,ZAR
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2023-01-03,0.67233,1.05472,1.19663,0.62510,0.732032,1.068810,0.144454,0.128003,0.002637,0.007633,0.051531,0.099209,0.225695,0.094656,0.743572,0.029099,0.053501,0.058799
2023-01-04,0.68296,1.06030,1.20527,0.62923,0.741988,1.076403,0.144975,0.127946,0.002682,0.007541,0.051634,0.099242,0.227394,0.095211,0.746798,0.029495,0.053402,0.059273
2023-01-05,0.67513,1.05207,1.18981,0.62292,0.737039,1.068079,0.145178,0.127996,0.002658,0.007496,0.051754,0.097625,0.224621,0.093806,0.743378,0.029386,0.053327,0.058268
2023-01-06,0.68674,1.06437,1.20924,0.63485,0.743859,1.079319,0.146431,0.128114,0.002702,0.007573,0.052284,0.100017,0.227017,0.095164,0.749299,0.029684,0.053385,0.058519
2023-01-09,0.69091,1.07288,1.21725,0.63708,0.747010,1.085882,0.147518,0.128157,0.002709,0.007583,0.052259,0.100845,0.228689,0.096313,0.751704,0.029948,0.053372,0.059013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02-23,0.65621,1.08190,1.26710,0.61912,0.740960,1.134984,0.138903,0.127915,0.002827,0.006644,0.058499,0.094905,,0.096900,0.745773,0.027824,0.032938,0.051877
2024-02-26,0.65397,1.08503,1.26843,0.61720,0.740505,1.136454,0.138720,0.127817,0.002789,0.006636,0.058492,0.095128,,0.097266,0.743871,0.027873,0.032178,0.051857
2024-02-27,0.65426,1.08435,1.26837,0.61698,0.739262,1.138265,0.138654,0.127813,0.002775,0.006644,0.058609,0.094991,,0.097055,0.744120,0.027882,0.032107,0.052390
2024-02-28,0.64935,1.08361,1.26595,0.60958,0.736469,1.137864,0.138644,0.127753,0.002756,0.006636,0.058509,0.094397,,0.096729,0.742876,0.027814,0.032052,0.051869


# Converting Account Currency to USD

In [127]:
# Step 1: Identify rows where account_currency is not USD
non_usd_rows = merged_df[merged_df['account_currency'] != 'USD']

# Step 2: Merge non_usd_rows with all_charts_USD to get conversion rates
merged_data = non_usd_rows.merge(all_charts_USD, how='left', left_on='open_time', right_index=True)

# Step 3: Multiply values in specified columns with conversion rates
conversion_columns = ['commission', 'swaps', 'profit']
converted_rows = pd.DataFrame(index=merged_data.index)

# Step 4: Copy over non-converted columns to the login_report_USD DataFrame
non_conversion_columns = [col for col in merged_df.columns if col not in conversion_columns]
for column in non_conversion_columns:
    converted_rows[column] = merged_data[column]
for column in conversion_columns:
    converted_rows[column] = merged_data[column.replace('USD', '')] * merged_data[column]

# Step 5: Identify rows where account_currency is USD
usd_rows = merged_df[merged_df['account_currency'] == 'USD']

#Step 6: Combine converted and usd rows
merged_df = pd.concat([converted_rows] + [usd_rows], axis=0)

## Feature engineering and merging

TODO: justify feature engineering

In [128]:
# Group by 'login' to get the count of tickets
total_tickets_per_login = merged_df.groupby('login')['ticket'].count()

# Calculate the number and percentage of buy tickets
buy_tickets_per_login = merged_df[merged_df['cmd'] == 0].groupby('login')['ticket'].size().astype('int64').fillna(0)
percentage_buys = (buy_tickets_per_login / total_tickets_per_login) * 100

# Calculate averages for various trade parameters
average_volume_per_login = merged_df.groupby('login')['volume'].mean()
average_volume_usd_per_login = merged_df.groupby('login')['volume_usd'].mean()
average_open_price_per_login = merged_df.groupby('login')['open_price'].mean()
average_close_price_per_login = merged_df.groupby('login')['close_price'].mean()
average_tp_per_login = merged_df.groupby('login')['tp'].mean()
average_sl_per_login = merged_df.groupby('login')['sl'].mean()
average_commission_per_login = merged_df.groupby('login')['commission'].mean()
average_swaps_per_login = merged_df.groupby('login')['swaps'].mean()
average_profit_per_login = merged_df.groupby('login')['profit'].mean()

# Find the most common reason per account
reason_per_login = merged_df.groupby('login')['reason'].apply(lambda x: x.value_counts().idxmax())

# Compile these metrics into a single dataframe
result_dataframe = pd.DataFrame({
    'Total_Trades': total_tickets_per_login,
    'Buy_Percentage': percentage_buys,
    'Average_Volume': average_volume_per_login,
    'Average_Volume_USD': average_volume_usd_per_login,
    'Average_Open_Price': average_open_price_per_login,
    'Average_Close_Price': average_close_price_per_login,
    'Average_TP': average_tp_per_login,
    'Average_SL': average_sl_per_login,
    'Most_Common_Trading_Method': reason_per_login,
    'Average_Commission': average_commission_per_login,
    'Average_Swaps': average_swaps_per_login,
    'Average_Profit': average_profit_per_login
})

In [129]:
final_dataset = pd.merge(login_dataframe, result_dataframe, on='login', how='inner')

In [130]:
final_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8167 entries, 0 to 8166
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   login                       8167 non-null   int64         
 1   country                     8167 non-null   object        
 2   account_currency            8167 non-null   object        
 3   reg_date                    8167 non-null   datetime64[ns]
 4   Total_Trades                8167 non-null   int64         
 5   Buy_Percentage              7312 non-null   float64       
 6   Average_Volume              8167 non-null   float64       
 7   Average_Volume_USD          8167 non-null   float64       
 8   Average_Open_Price          8167 non-null   float64       
 9   Average_Close_Price         8167 non-null   float64       
 10  Average_TP                  8167 non-null   float64       
 11  Average_SL                  8167 non-null   float64     

## Longevity calculations and merging

Longevity is defined as close time of last trade minus open time of first trade for a given account (definition confirmed by Carol). 

For longevity, it's also important to consider whether an account is actively trading or is inactive. An account is considered 'active' if it trades within the mosty recent month of the dataset, and inactive if it doesn't. Accounts that have registered in the most recent month of the dataset have been removed to avoid confusion.

We do not want to label an account that becomes inactive after 1 month the same way as an account that has actively traded for 1 month, but is still ongoing. We can't define longevity for these 'active' accounts as we do not know when they will stop trading. Therefore, it stands to reason that due to having a known duration, 'inactive' accounts are better indicators of longevity, at least in the short term.

Because of this, the dataset is binned into 5 categories, where only 'inactive' accounts are included up until 360 days (trades.csv has a range of 422 days). For '360+' both active and inactive are included as this bin is open ended.

In [131]:
# Calculate the first open time and last close time for each account
first_open_time_per_login = merged_df.groupby('login')['open_time'].min()
last_close_time_per_login = merged_df.groupby('login')['close_time'].max()

# Calculate longevity as the difference in days between the last close time and first open time
longevity_per_login = (last_close_time_per_login - first_open_time_per_login).dt.days

# Define the most recent month based on the latest trade in the dataset
most_recent_month_start = merged_df['close_time'].max().replace(day=1)

# Determine active accounts (trading in the most recent month)
active_accounts = merged_df[merged_df['close_time'] >= most_recent_month_start].groupby('login').size().index

# Mark accounts as active or inactive
longevity_per_login = longevity_per_login.to_frame(name='longevity')
longevity_per_login['active'] = longevity_per_login.index.isin(active_accounts)

# Exclude accounts that were registered in the most recent month
valid_accounts = login_dataframe[login_dataframe['reg_date'] < most_recent_month_start]

# Bin the longevity values
bins = [-1, 30, 90, 180, 270, 360, float('inf')]
longevity_per_login['longevity_bin'] = pd.cut(longevity_per_login['longevity'], bins=bins, labels=False)

# Exclude active accounts from all bins except for 360+
longevity_per_login = longevity_per_login[(longevity_per_login['active'] == False) | (longevity_per_login['longevity_bin'] == 5)]

In [132]:
# Merge longevity information with result_dataframe
result_dataframe = final_dataset.reset_index().merge(longevity_per_login, on='login', how='inner')

In [133]:
result_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5725 entries, 0 to 5724
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   index                       5725 non-null   int64         
 1   login                       5725 non-null   int64         
 2   country                     5725 non-null   object        
 3   account_currency            5725 non-null   object        
 4   reg_date                    5725 non-null   datetime64[ns]
 5   Total_Trades                5725 non-null   int64         
 6   Buy_Percentage              4914 non-null   float64       
 7   Average_Volume              5725 non-null   float64       
 8   Average_Volume_USD          5725 non-null   float64       
 9   Average_Open_Price          5725 non-null   float64       
 10  Average_Close_Price         5725 non-null   float64       
 11  Average_TP                  5725 non-null   float64     

Filtering accounts in the ways described above has reduced our unique accounts from 8167 to 5724

In [134]:
result_dataframe

Unnamed: 0,index,login,country,account_currency,reg_date,Total_Trades,Buy_Percentage,Average_Volume,Average_Volume_USD,Average_Open_Price,Average_Close_Price,Average_TP,Average_SL,Most_Common_Trading_Method,Average_Commission,Average_Swaps,Average_Profit,longevity,active,longevity_bin
0,1,524978,Austria,EUR,2023-06-07 05:58:36,1392,47.485632,1.230632,4.044965e+08,513.912144,513.898092,297.756577,168.537030,1,131.075619,101.564667,85080.731043,142,False,2
1,2,524979,France,USD,2023-06-07 06:17:30,2194,49.635369,0.013943,6.724644e+03,20467.746053,20466.447288,2818.566659,12916.827370,5,0.000000,-0.111285,-0.837867,107,False,2
2,4,760487,Singapore,SGD,2023-01-04 08:48:24,69,15.942029,0.012609,2.783882e+03,11010.476377,11077.594638,10337.456522,168.292464,1,0.000000,0.000046,456.025209,13,False,0
3,5,804664,Malaysia,USD,2023-09-24 07:02:49,85,63.529412,0.018706,5.490400e+03,1265.465529,1266.228878,0.000000,0.000000,5,0.000000,-0.023176,-0.524588,9,False,0
4,6,804687,Australia,AUD,2023-01-10 10:26:00,484,51.239669,0.019773,5.271755e+05,433.944222,431.902211,323.711256,431.677983,5,0.034984,0.012311,29.334563,372,False,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5720,8157,88944953,Canada,CAD,2023-01-12 02:52:40,54,37.037037,0.041667,1.621176e+07,648.476864,648.428424,1.797778,0.020074,5,0.000000,0.019602,130.600970,381,True,5
5721,8159,88944971,Malaysia,USD,2023-02-08 08:52:48,442,46.153846,1.389367,1.249156e+09,23.466324,23.470589,0.402277,4.315796,1,0.000000,-5.085113,-51.760588,100,False,2
5722,8161,88945034,Singapore,USD,2023-04-12 13:01:26,479,40.083507,0.043278,1.522641e+07,1549.084244,1548.833586,842.501651,496.148427,1,-0.256347,-0.000271,-1.519937,46,False,1
5723,8163,88945038,Singapore,SGD,2023-05-05 11:25:20,1306,58.805513,0.078913,1.824879e+04,1.225047,1.225408,1.193976,0.243870,1,5.335101,1.792770,7356.851653,253,False,3


In [135]:
result_dataframe['longevity_bin'].value_counts()

longevity_bin
0    3575
1    1216
2     595
3     214
5      69
4      56
Name: count, dtype: int64

Distribution of longevity. Be aware that 5 appears before 4, because there are more accounts in that bin (most likely due to including both active and inactive accounts)

In [136]:
# Filter rows where 'longevity_bin' is 5
filtered_df = result_dataframe[result_dataframe['longevity_bin'] == 5]

# Count the occurrences of True and False in the 'active' column for the filtered rows
active_counts = filtered_df['active'].value_counts()

# Display the counts
print(active_counts)

active
True     63
False     6
Name: count, dtype: int64


Distribution of '360+' days by activity status.

## Final pre-processing

Simple pre-processing including converting less important trading methods to 'Other' and then reverse encoding, filling NaN values, and reording columns.

TODO: final pre-processing

In [137]:
(result_dataframe['Total_Trades'] == 0).sum()

0

In [138]:
result_dataframe['Most_Common_Trading_Method'].value_counts()

Most_Common_Trading_Method
5    2527
0    1957
1    1088
6     142
3      10
4       1
Name: count, dtype: int64

In [139]:
# Filter out the dataset
result_dataframe['Most_Common_Trading_Method'] = result_dataframe['Most_Common_Trading_Method'].apply(lambda x: x if x in [0, 1, 5] else 7)

# Rename the codes to strings
result_dataframe['Trading_Method'] = result_dataframe['Most_Common_Trading_Method'].map({0: 'Client', 1: 'Expert', 5: 'Mobile', 7:'Other'})

# Fill NaN values with appropriate strategy (mean, median, etc.)
result_dataframe.fillna(0, inplace=True)

In [140]:
# Get a list of all column names except 'longevity' and 'longevity_bin'
cols = [col for col in result_dataframe.columns if col not in ['longevity', 'longevity_bin']]

# Append 'longevity' and 'longevity_bin' to the end of the column list
cols += ['longevity', 'longevity_bin']

# Reorder the DataFrame based on the new column list
final_dataset = result_dataframe[cols]

In [141]:
final_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5725 entries, 0 to 5724
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   index                       5725 non-null   int64         
 1   login                       5725 non-null   int64         
 2   country                     5725 non-null   object        
 3   account_currency            5725 non-null   object        
 4   reg_date                    5725 non-null   datetime64[ns]
 5   Total_Trades                5725 non-null   int64         
 6   Buy_Percentage              5725 non-null   float64       
 7   Average_Volume              5725 non-null   float64       
 8   Average_Volume_USD          5725 non-null   float64       
 9   Average_Open_Price          5725 non-null   float64       
 10  Average_Close_Price         5725 non-null   float64       
 11  Average_TP                  5725 non-null   float64     

In [142]:
final_dataset

Unnamed: 0,index,login,country,account_currency,reg_date,Total_Trades,Buy_Percentage,Average_Volume,Average_Volume_USD,Average_Open_Price,...,Average_TP,Average_SL,Most_Common_Trading_Method,Average_Commission,Average_Swaps,Average_Profit,active,Trading_Method,longevity,longevity_bin
0,1,524978,Austria,EUR,2023-06-07 05:58:36,1392,47.485632,1.230632,4.044965e+08,513.912144,...,297.756577,168.537030,1,131.075619,101.564667,85080.731043,False,Expert,142,2
1,2,524979,France,USD,2023-06-07 06:17:30,2194,49.635369,0.013943,6.724644e+03,20467.746053,...,2818.566659,12916.827370,5,0.000000,-0.111285,-0.837867,False,Mobile,107,2
2,4,760487,Singapore,SGD,2023-01-04 08:48:24,69,15.942029,0.012609,2.783882e+03,11010.476377,...,10337.456522,168.292464,1,0.000000,0.000046,456.025209,False,Expert,13,0
3,5,804664,Malaysia,USD,2023-09-24 07:02:49,85,63.529412,0.018706,5.490400e+03,1265.465529,...,0.000000,0.000000,5,0.000000,-0.023176,-0.524588,False,Mobile,9,0
4,6,804687,Australia,AUD,2023-01-10 10:26:00,484,51.239669,0.019773,5.271755e+05,433.944222,...,323.711256,431.677983,5,0.034984,0.012311,29.334563,False,Mobile,372,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5720,8157,88944953,Canada,CAD,2023-01-12 02:52:40,54,37.037037,0.041667,1.621176e+07,648.476864,...,1.797778,0.020074,5,0.000000,0.019602,130.600970,True,Mobile,381,5
5721,8159,88944971,Malaysia,USD,2023-02-08 08:52:48,442,46.153846,1.389367,1.249156e+09,23.466324,...,0.402277,4.315796,1,0.000000,-5.085113,-51.760588,False,Expert,100,2
5722,8161,88945034,Singapore,USD,2023-04-12 13:01:26,479,40.083507,0.043278,1.522641e+07,1549.084244,...,842.501651,496.148427,1,-0.256347,-0.000271,-1.519937,False,Expert,46,1
5723,8163,88945038,Singapore,SGD,2023-05-05 11:25:20,1306,58.805513,0.078913,1.824879e+04,1.225047,...,1.193976,0.243870,1,5.335101,1.792770,7356.851653,False,Expert,253,3


## Export

In [143]:
final_dataset.to_csv('Dataset/output_dataset.csv', index=False)