In [1]:
!pip install pyarrow



In [2]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import io
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets

In [3]:
dfevent=pd.read_parquet('add_event.parquet')
dftrans=pd.read_parquet('add_trans.parquet')
dfoffer=pd.read_parquet('offer_metadata.parquet')
dftest=pd.read_parquet('test_data.parquet')
dftrain=pd.read_parquet('train_data.parquet')

In [4]:
dfevent.head()

Unnamed: 0,id2,id3,id6,id4,id7
0,2431360,618619,Tiles,2023-10-22 08:08:17.768,
1,2431360,363153,Tiles,2023-10-22 08:08:18.921,
2,2431360,97193,Tiles,2023-10-22 08:08:17.765,
3,2431360,654444,Tiles,2023-10-22 08:08:17.737,
4,2431360,32325,Tiles,2023-10-22 08:08:17.812,


In [5]:
dfoffer.head()

Unnamed: 0,id3,id9,f375,f376,f377,id10,id11,f378,f374,id8,id12,id13
0,70687,FO5O,2,5.0,,1,,N,,,2018-01-01 00:00:00,2099-12-31 23:59:59
1,900002526,UGE,2,100.0,,1,,N,,,2014-10-20 00:00:00,2099-12-31 23:59:59
2,900002864,UTP,1,100.0,,1,,N,,,2016-07-19 00:00:00,2099-12-31 23:59:59
3,19508,o,2,,,1,,N,,,2019-06-02 17:00:00,2028-12-31 16:59:59
4,35903,o,2,,,1,,N,,,2019-06-02 17:00:00,2028-12-31 16:59:59


In [6]:
dfoffer['f375'].describe()

count    4164.000000
mean        1.321806
std         0.467225
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         2.000000
Name: f375, dtype: float64

In [7]:
dftrans.head()

Unnamed: 0,id2,f367,f368,f369,f370,f371,f372,id8,f374
0,2896709,15.6,PBR,D,2023-10-16,19:16:52,202310,59639998,DSE
1,2855047,6.4,PR,D,2023-10-14,13:01:16,202310,59639998,DSE
2,2497175,13.99,PBR,D,2023-10-14,00:31:48,202310,59639998,DSE
3,2655364,15.14,PGC,D,2023-10-13,12:37:25,202310,59639998,DSE
4,2855047,2.12,PR,D,2023-10-09,16:51:21,202310,59639998,DSE


In [8]:
# Total impressions and clicks per offer
click_stats = dfevent.groupby('id3').agg(
    impressions=('id4', 'count'),
    clicks=('id7', lambda x: x.notna().sum())
).reset_index()

# Click-through rate (CTR) per offer
click_stats['click_rate'] = click_stats['clicks'] / click_stats['impressions']

# Merge to dftrain based on id3
dftrain = dftrain.merge(click_stats[['id3', 'click_rate']], on='id3', how='left')


In [9]:
dftest = dftest.merge(click_stats[['id3', 'click_rate']], on='id3', how='left')

In [10]:
# Clean NaT timestamps
dfevent['clicked'] = dfevent['id7'].notna().astype(int)

# Group by (offer, placement) to calculate click rate
placement_ctr_per_offer = dfevent.groupby(['id3', 'id6']).agg(
    impressions=('id4', 'count'),
    clicks=('clicked', 'sum')
).reset_index()

# Calculate CTR
placement_ctr_per_offer['placement_ctr_offer'] = placement_ctr_per_offer['clicks'] / placement_ctr_per_offer['impressions']


In [11]:
# Aggregate CTR across placements to get an offer-level stat
placement_ctr_offer_summary = placement_ctr_per_offer.groupby('id3').agg(
    mean_placement_ctr=('placement_ctr_offer', 'mean'),
    max_placement_ctr=('placement_ctr_offer', 'max')
).reset_index()


In [12]:
dftrain = dftrain.merge(placement_ctr_offer_summary, on='id3', how='left')


In [13]:
dftest = dftest.merge(placement_ctr_offer_summary, on='id3', how='left')


In [14]:
# Offer duration in days
dfoffer['id3'] = dfoffer['id3'].astype(str)
dftrain['id3'] = dftrain['id3'].astype(str)
dfoffer['id12'] = pd.to_datetime(dfoffer['id12'])
dfoffer['id13'] = pd.to_datetime(dfoffer['id13'])
dfoffer['offer_duration'] = (dfoffer['id13'] - dfoffer['id12']).dt.days
# Ensure numeric
dfoffer['f375'] = pd.to_numeric(dfoffer['f375'], errors='coerce')
dfoffer['f376'] = pd.to_numeric(dfoffer['f376'], errors='coerce')

# New feature: Perceived Offer Value Score
# Example: Multiply redemption frequency with discount rate
dfoffer['offer_value_score'] = dfoffer['f375'] * dfoffer['f376']
# Optional: normalize between 0 and 1
dfoffer['offer_value_score_norm'] = (
    dfoffer['offer_value_score'] - dfoffer['offer_value_score'].min()
) / (dfoffer['offer_value_score'].max() - dfoffer['offer_value_score'].min())




In [15]:
# Reduce dfoffer to only necessary columns
dfoffer_small = dfoffer[['id3', 'offer_value_score_norm', 'offer_duration', 'id8']].copy()


In [16]:
dfoffer_small['offer_value_score_norm'] = pd.to_numeric(dfoffer_small['offer_value_score_norm'], downcast='float')
dfoffer_small['offer_duration'] = pd.to_numeric(dfoffer_small['offer_duration'], downcast='integer')
dfoffer_small['id8'] = pd.to_numeric(dfoffer_small['id8'], downcast='integer', errors='coerce')


In [17]:
dftrain_subset = dftrain[['id3']].copy()
dftrain_subset = dftrain_subset.sample(n=100_000, random_state=42)  # Test on 100K rows


In [18]:
dftrain_subset = dftrain_subset.merge(dfoffer_small, on='id3', how='left')


In [19]:
dfoffer_small = dfoffer[['id3', 'offer_value_score_norm', 'offer_duration', 'id8']].copy()
dfoffer_small['offer_value_score_norm'] = pd.to_numeric(dfoffer_small['offer_value_score_norm'], downcast='float')
dfoffer_small['offer_duration'] = pd.to_numeric(dfoffer_small['offer_duration'], downcast='integer')
dfoffer_small['id8'] = pd.to_numeric(dfoffer_small['id8'], downcast='integer', errors='coerce')

dftrain['id3'] = dftrain['id3'].astype(str)
dfoffer_small['id3'] = dfoffer_small['id3'].astype(str)




In [20]:
import pandas as pd

# --------- Step 1: Prepare Offer Data (small & numeric) ---------
dfoffer_small = dfoffer[['id3', 'f375', 'f376', 'id8', 'id12', 'id13']].copy()

# Clean types
dfoffer_small['id3'] = dfoffer_small['id3'].astype(str)
dfoffer_small['f375'] = pd.to_numeric(dfoffer_small['f375'], errors='coerce')
dfoffer_small['f376'] = pd.to_numeric(dfoffer_small['f376'], errors='coerce')
dfoffer_small['id8'] = pd.to_numeric(dfoffer_small['id8'], errors='coerce', downcast='integer')

# Compute additional features
dfoffer_small['id12'] = pd.to_datetime(dfoffer_small['id12'])
dfoffer_small['id13'] = pd.to_datetime(dfoffer_small['id13'])
dfoffer_small['offer_duration'] = (dfoffer_small['id13'] - dfoffer_small['id12']).dt.days
dfoffer_small['offer_value_score'] = dfoffer_small['f375'] * dfoffer_small['f376']

# Normalize score
min_score = dfoffer_small['offer_value_score'].min()
max_score = dfoffer_small['offer_value_score'].max()
dfoffer_small['offer_value_score_norm'] = (dfoffer_small['offer_value_score'] - min_score) / (max_score - min_score)

# Keep only useful columns
dfoffer_small = dfoffer_small[['id3', 'offer_duration', 'offer_value_score_norm', 'id8']]

# --------- Step 2: Merge in Batches ---------
dftrain['id3'] = dftrain['id3'].astype(str)

# Config
batch_size = 100_00  # adjust this depending on RAM
n = len(dftrain)
merged_batches = []

print(f"Total rows: {n} | Processing in batches of {batch_size}")

for i in range(0, n, batch_size):
    print(f"Merging batch {i} to {min(i + batch_size, n)}...")
    df_batch = dftrain.iloc[i:i+batch_size].copy()
    df_merged = df_batch.merge(dfoffer_small, on='id3', how='left')
    merged_batches.append(df_merged)

# --------- Step 3: Concatenate Results ---------
dftrain_merged = pd.concat(merged_batches, ignore_index=True)

print("✅ Merge complete. Final shape:", dftrain_merged.shape)


Total rows: 770164 | Processing in batches of 10000
Merging batch 0 to 10000...
Merging batch 10000 to 20000...
Merging batch 20000 to 30000...
Merging batch 30000 to 40000...
Merging batch 40000 to 50000...
Merging batch 50000 to 60000...
Merging batch 60000 to 70000...
Merging batch 70000 to 80000...
Merging batch 80000 to 90000...
Merging batch 90000 to 100000...
Merging batch 100000 to 110000...
Merging batch 110000 to 120000...
Merging batch 120000 to 130000...
Merging batch 130000 to 140000...
Merging batch 140000 to 150000...
Merging batch 150000 to 160000...
Merging batch 160000 to 170000...
Merging batch 170000 to 180000...
Merging batch 180000 to 190000...
Merging batch 190000 to 200000...
Merging batch 200000 to 210000...
Merging batch 210000 to 220000...
Merging batch 220000 to 230000...
Merging batch 230000 to 240000...
Merging batch 240000 to 250000...
Merging batch 250000 to 260000...
Merging batch 260000 to 270000...
Merging batch 270000 to 280000...
Merging batch 28000

In [21]:
import pandas as pd

# --------- Step 1: Prepare Offer Data (small & numeric) ---------
dfoffer_small = dfoffer[['id3', 'f375', 'f376', 'id8', 'id12', 'id13']].copy()

# Clean types
dfoffer_small['id3'] = dfoffer_small['id3'].astype(str)
dfoffer_small['f375'] = pd.to_numeric(dfoffer_small['f375'], errors='coerce')
dfoffer_small['f376'] = pd.to_numeric(dfoffer_small['f376'], errors='coerce')
dfoffer_small['id8'] = pd.to_numeric(dfoffer_small['id8'], errors='coerce', downcast='integer')

# Compute additional features
dfoffer_small['id12'] = pd.to_datetime(dfoffer_small['id12'])
dfoffer_small['id13'] = pd.to_datetime(dfoffer_small['id13'])
dfoffer_small['offer_duration'] = (dfoffer_small['id13'] - dfoffer_small['id12']).dt.days
dfoffer_small['offer_value_score'] = dfoffer_small['f375'] * dfoffer_small['f376']

# Normalize score
min_score = dfoffer_small['offer_value_score'].min()
max_score = dfoffer_small['offer_value_score'].max()
dfoffer_small['offer_value_score_norm'] = (dfoffer_small['offer_value_score'] - min_score) / (max_score - min_score)

# Keep only useful columns
dfoffer_small = dfoffer_small[['id3', 'offer_duration', 'offer_value_score_norm', 'id8']]

# --------- Step 2: Merge in Batches ---------
dftest['id3'] = dftest['id3'].astype(str)

# Config
batch_size = 100_00  # adjust this depending on RAM
n = len(dftrain)
merged_batches = []

print(f"Total rows: {n} | Processing in batches of {batch_size}")

for i in range(0, n, batch_size):
    print(f"Merging batch {i} to {min(i + batch_size, n)}...")
    df_batch = dftest.iloc[i:i+batch_size].copy()
    df_merged = df_batch.merge(dfoffer_small, on='id3', how='left')
    merged_batches.append(df_merged)

# --------- Step 3: Concatenate Results ---------
dftest_merged = pd.concat(merged_batches, ignore_index=True)

print("✅ Merge complete. Final shape:", dftest_merged.shape)


Total rows: 770164 | Processing in batches of 10000
Merging batch 0 to 10000...
Merging batch 10000 to 20000...
Merging batch 20000 to 30000...
Merging batch 30000 to 40000...
Merging batch 40000 to 50000...
Merging batch 50000 to 60000...
Merging batch 60000 to 70000...
Merging batch 70000 to 80000...
Merging batch 80000 to 90000...
Merging batch 90000 to 100000...
Merging batch 100000 to 110000...
Merging batch 110000 to 120000...
Merging batch 120000 to 130000...
Merging batch 130000 to 140000...
Merging batch 140000 to 150000...
Merging batch 150000 to 160000...
Merging batch 160000 to 170000...
Merging batch 170000 to 180000...
Merging batch 180000 to 190000...
Merging batch 190000 to 200000...
Merging batch 200000 to 210000...
Merging batch 210000 to 220000...
Merging batch 220000 to 230000...
Merging batch 230000 to 240000...
Merging batch 240000 to 250000...
Merging batch 250000 to 260000...
Merging batch 260000 to 270000...
Merging batch 270000 to 280000...
Merging batch 28000

In [22]:
dftrain_merged.head()

Unnamed: 0,id1,id2,id3,id4,id5,y,f1,f2,f3,f4,...,f363,f364,f365,f366,click_rate,mean_placement_ctr,max_placement_ctr,offer_duration,offer_value_score_norm,id8
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,1.0,,,,...,0.0,337.0,0.0,0.0,0.059875,0.047329,0.087438,29.0,0.010101,57310000.0
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,1.0,,,,...,0.0,1010.0,2.0,0.0019801980198019,0.046487,0.036076,0.065646,181.0,,59210000.0
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,1.0,,,,...,0.0,1010.0,2.0,0.0019801980198019,0.041484,0.032672,0.060251,29.0,0.090909,72310000.0
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,1.0,,,,...,0.003610108303249,337.0,0.0,0.0,0.042805,0.033358,0.060118,29.0,0.090909,56510500.0
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,1.0,,,,...,0.0,337.0,0.0,0.0,0.042544,0.033604,0.058566,29.0,0.070707,59991300.0


In [23]:
# Ensure datetime parsing
dftrans['f370'] = pd.to_datetime(dftrans['f370'], errors='coerce')
dftrans['f371'] = pd.to_datetime(dftrans['f371'], format='%Y-%m-%d %H:%M:%S.%f', errors='coerce')


# Extract quarter from f370
dftrans['quarter'] = dftrans['f370'].dt.quarter

# Extract part of day from f371
dftrans['part_of_day'] = dftrans['f371'].dt.hour.apply(
    lambda x: 'Night' if x < 6 else 'Morning' if x < 12 else 'Afternoon' if x < 18 else 'Evening'
)

In [24]:
# Group by industry (id8) and compute:
# - Number of transactions per quarter
# - Part of day distribution
industry_stats = dftrans.groupby('id8').agg(
    total_txns=('f370', 'count'),
    q1_ratio=('quarter', lambda x: (x == 1).mean()),
    q2_ratio=('quarter', lambda x: (x == 2).mean()),
    q3_ratio=('quarter', lambda x: (x == 3).mean()),
    q4_ratio=('quarter', lambda x: (x == 4).mean()),
    night_ratio=('part_of_day', lambda x: (x == 'Night').mean()),
    morning_ratio=('part_of_day', lambda x: (x == 'Morning').mean()),
    afternoon_ratio=('part_of_day', lambda x: (x == 'Afternoon').mean()),
    evening_ratio=('part_of_day', lambda x: (x == 'Evening').mean())
).reset_index()


In [25]:
# Ensure id8 is same type
dftrain_merged['id8'] = pd.to_numeric(dftrain_merged['id8'], errors='coerce')
industry_stats['id8'] = pd.to_numeric(industry_stats['id8'], errors='coerce')

# Merge
dftrain_merged = dftrain_merged.merge(industry_stats, on='id8', how='left')


In [26]:
dftest_merged['id8'] = pd.to_numeric(dftest_merged['id8'], errors='coerce')
dftest_merged = dftest_merged.merge(industry_stats, on='id8', how='left')

In [27]:
dftrain_merged.head()

Unnamed: 0,id1,id2,id3,id4,id5,y,f1,f2,f3,f4,...,id8,total_txns,q1_ratio,q2_ratio,q3_ratio,q4_ratio,night_ratio,morning_ratio,afternoon_ratio,evening_ratio
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,1.0,,,,...,57310000.0,37472.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,1.0,,,,...,59210000.0,24597.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,1.0,,,,...,72310000.0,19409.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,1.0,,,,...,56510500.0,57064.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,1.0,,,,...,59991300.0,14005.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [28]:
dftrain_merged.isnull().sum()

id1                   0
id2                   0
id3                   0
id4                   0
id5                   0
                   ... 
q4_ratio           1352
night_ratio        1352
morning_ratio      1352
afternoon_ratio    1352
evening_ratio      1352
Length: 387, dtype: int64

In [29]:
dftest_merged.isnull().sum()

id1                  0
id2                  0
id3                  0
id4                  0
id5                  0
                  ... 
q4_ratio           717
night_ratio        717
morning_ratio      717
afternoon_ratio    717
evening_ratio      717
Length: 386, dtype: int64

In [30]:
# Identify categorical columns without triggering full copy
cat_cols = [col for col, dtype in dftrain_merged.dtypes.items() if dtype == 'object' or dtype.name == 'category']

print(f"🔍 Found {len(cat_cols)} categorical columns:")
print(cat_cols)


🔍 Found 372 categorical columns:
['id1', 'id2', 'id3', 'id4', 'id5', 'y', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80', 'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90', 'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f102', 'f103', 'f104', 'f105', 'f106', 'f107', 'f108', 'f109', 'f110', 'f111', 'f112', 'f113', 'f114', 'f115', 'f116', 'f117', 'f118', 'f119', 'f120', 'f121', 'f122', 'f123', 'f124', 'f125', 'f126', 'f127', 'f128', 'f129', 'f

In [31]:
import pandas as pd

def infer_and_convert_types(df):
    for col in df.columns:
        if df[col].dtype == 'object':
            # Try converting to datetime
            try:
                df[col] = pd.to_datetime(df[col], errors='raise')
                continue  # If successful, move to next column
            except (ValueError, TypeError):
                pass

            # Try converting to numeric
            try:
                df[col] = pd.to_numeric(df[col], errors='raise')
                continue
            except (ValueError, TypeError):
                pass

            # Otherwise, leave as object (likely categorical/text)
    return df

# Run this on your dftrain_merged
dftrain_merged = infer_and_convert_types(dftrain_merged)


  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = 

In [32]:
import pandas as pd

def infer_and_convert_types(df):
    for col in df.columns:
        if df[col].dtype == 'object':
            # Try converting to datetime
            try:
                df[col] = pd.to_datetime(df[col], errors='raise')
                continue  # If successful, move to next column
            except (ValueError, TypeError):
                pass

            # Try converting to numeric
            try:
                df[col] = pd.to_numeric(df[col], errors='raise')
                continue
            except (ValueError, TypeError):
                pass

            # Otherwise, leave as object (likely categorical/text)
    return df

# Run this on your dftrain_merged
dftest_merged = infer_and_convert_types(dftest_merged)


  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = pd.to_datetime(df[col], errors='raise')
  df[col] = 

In [33]:
cat_cols = dftrain_merged.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"🧠 Final Categorical Columns ({len(cat_cols)}): {cat_cols}")


🧠 Final Categorical Columns (10): ['id1', 'f42', 'f50', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f354']


In [34]:
cat_cols = dftest_merged.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"🧠 Final Categorical Columns ({len(cat_cols)}): {cat_cols}")


🧠 Final Categorical Columns (10): ['id1', 'f42', 'f50', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f354']


In [35]:
from sklearn.preprocessing import LabelEncoder

# Columns to encode (excluding 'id1')
to_encode = [col for col in cat_cols if col != 'id1']

# Apply Label Encoding
le = LabelEncoder()
for col in to_encode:
    try:
        dftrain_merged[col] = le.fit_transform(dftrain_merged[col].astype(str))
        print(f"✅ Encoded: {col}")
    except Exception as e:
        print(f"⚠️ Failed to encode {col}: {e}")


✅ Encoded: f42
✅ Encoded: f50
✅ Encoded: f52
✅ Encoded: f53
✅ Encoded: f54
✅ Encoded: f55
✅ Encoded: f56
✅ Encoded: f57
✅ Encoded: f354


In [36]:
from sklearn.preprocessing import LabelEncoder

# Columns to encode (excluding 'id1')
to_encode = [col for col in cat_cols if col != 'id1']

# Apply Label Encoding
le = LabelEncoder()
for col in to_encode:
    try:
        dftest_merged[col] = le.fit_transform(dftest_merged[col].astype(str))
        print(f"✅ Encoded: {col}")
    except Exception as e:
        print(f"⚠️ Failed to encode {col}: {e}")


✅ Encoded: f42
✅ Encoded: f50
✅ Encoded: f52
✅ Encoded: f53
✅ Encoded: f54
✅ Encoded: f55
✅ Encoded: f56
✅ Encoded: f57
✅ Encoded: f354


In [37]:
# Confirm updated types
dftrain_merged[to_encode].dtypes


f42     int32
f50     int32
f52     int32
f53     int32
f54     int32
f55     int32
f56     int32
f57     int32
f354    int32
dtype: object

In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


In [39]:
# Define your target
target = 'y'

# Keep all features except 'y' (target) and 'id1'
features = [col for col in dftrain_merged.columns if col not in [target, 'id1']]

# Define input and target
X = dftrain_merged[features]
y = dftrain_merged[target]


In [40]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [41]:

# Step 3: Train Random Forest Classifier
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1  # utilize all CPU cores
)
clf.fit(X_train, y_train)

# Step 4: Make predictions
y_pred = clf.predict(X_test)

# Step 5: Evaluate
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n📝 Classification Report:\n", classification_report(y_test, y_pred))

DTypePromotionError: The DType <class 'numpy.dtypes.DateTime64DType'> could not be promoted by <class 'numpy.dtypes.Float64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>)

In [42]:
# Step 1: Remove datetime columns from features
datetime_cols = X.select_dtypes(include=['datetime64[ns]', 'datetime64']).columns.tolist()
print(f"🕒 Dropping datetime columns: {datetime_cols}")
X = X.drop(columns=datetime_cols)

# Step 2: Split dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 3: Train Random Forest
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)

# Step 4: Evaluate
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = clf.predict(X_test)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("📝 Classification Report:\n", classification_report(y_test, y_pred))


🕒 Dropping datetime columns: ['id4', 'id5', 'f112', 'f122', 'f135', 'f136']
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\fahee\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 2168, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fahee\anaconda3\Lib\site-packages\IPython\core\ultratb.py", line 1457, in structured_traceback
    return FormattedTB.structured_traceback(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fahee\anaconda3\Lib\site-packages\IPython\core\ultratb.py", line 1348, in structured_traceback
    return VerboseTB.structured_traceback(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fahee\anaconda3\Lib\site-packages\IPython\core\ultratb.py", line 1195, in structured_traceback
    formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fahee\

In [58]:
# Drop datetime columns (same as training)
datetime_cols = dftest_merged.select_dtypes(include=['datetime64[ns]', 'datetime64']).columns.tolist()
X_test_final = dftest_merged.drop(columns=datetime_cols)

# Drop 'id1' as per earlier instruction
if 'id1' in X_test_final.columns:
    X_test_final = X_test_final.drop(columns=['id1'])

# Ensure feature columns match training columns exactly
X_test_final = X_test_final[X.columns]


MemoryError: Unable to allocate 2.82 MiB for an array with shape (369301, 1) and data type float64

In [60]:
# Drop datetime columns
datetime_cols = dftest_merged.select_dtypes(include=['datetime64[ns]', 'datetime64']).columns.tolist()
X_test_final = dftest_merged.drop(columns=datetime_cols)

# Drop 'id1' as per your instruction
X_test_final = X_test_final.drop(columns=['id1'], errors='ignore')

# Ensure all training features exist in test set (add missing with default value)
for col in X.columns:
    if col not in X_test_final.columns:
        X_test_final[col] = 0  # or np.nan if appropriate

# Reorder test columns to match training
X_test_final = X_test_final[X.columns]


MemoryError: Unable to allocate 2.82 MiB for an array with shape (1, 369301) and data type datetime64[ns]

In [62]:
# Drop datetime columns
datetime_cols = dftest_merged.select_dtypes(include=['datetime64[ns]', 'datetime64']).columns.tolist()
X_test_final = dftest_merged.drop(columns=datetime_cols)

# Drop 'id1'
X_test_final = X_test_final.drop(columns=['id1'], errors='ignore')

# Identify missing columns
missing_cols = list(set(X.columns) - set(X_test_final.columns))

# Create a DataFrame of missing columns filled with 0
missing_df = pd.DataFrame(0, index=X_test_final.index, columns=missing_cols)

# Concatenate test data with missing columns
X_test_final = pd.concat([X_test_final, missing_df], axis=1)

# Reorder columns to match training
X_test_final = X_test_final[X.columns]


MemoryError: Unable to allocate 2.82 MiB for an array with shape (369301, 1) and data type float64

In [None]:
y_test_pred = clf.predict(X_test_final)


In [None]:
submission_df = pd.DataFrame({
    'id': dftest_merged['id1'],
    'y': y_test_pred
})

submission_df.to_csv("final_submission.csv", index=False)
print("✅ Submission file saved as 'final_submission.csv'")


In [None]:
# Make predictions on the test set
y_pred = clf.predict(X_test_final)


In [None]:
# Create submission DataFrame with required columns
submission_df = dftest_merged[['id1', 'id2', 'id3', 'id5']].copy()
submission_df['pred'] = y_pred


In [68]:
# Save to CSV
submission_df.to_csv('final_submission_amex.csv', index=False)


NameError: name 'submission_df' is not defined

In [70]:
print(submission_df.head())
print(submission_df.columns.tolist())


NameError: name 'submission_df' is not defined

In [72]:
print(dftrain_merged.isnull().sum().sort_values(ascending=False).head(20))


MemoryError: Unable to allocate 752. KiB for an array with shape (1, 770164) and data type bool

In [None]:
# Step 1: Calculate missing percentage per column
missing_percent = dftrain_merged.isnull().mean()

# Step 2: Filter columns with >60% missing
cols_to_drop = missing_percent[missing_percent > 0.6].index

# Step 3: Drop them from both train and test (to ensure same features)
dftrain_merged = dftrain_merged.drop(columns=cols_to_drop)
dftest_merged = dftest_merged.drop(columns=cols_to_drop, errors='ignore')  # in case some columns are missing in test

print(f"Dropped columns: {list(cols_to_drop)}")


In [None]:

# Fill NaNs in training data with column-wise mean
dftrain_merged = dftrain_merged.fillna(dftrain_merged.mean(numeric_only=True))

# Fill NaNs in test data with column-wise mean
dftest_merged = dftest_merged.fillna(dftest_merged.mean(numeric_only=True))


In [None]:
from sklearn.preprocessing import StandardScaler

num_cols = dftrain_merged.select_dtypes(include='number').columns.difference(['y'])

scaler = StandardScaler()
dftrain_merged[num_cols] = scaler.fit_transform(dftrain_merged[num_cols])
dftest_merged[num_cols] = scaler.transform(dftest_merged[num_cols])


In [None]:
X = dftrain_merged.drop(columns=['id1', 'y'])
y = dftrain_merged['y']


In [None]:
# Drop datetime columns
datetime_cols = X.select_dtypes(include=['datetime64[ns]', 'datetime64']).columns
X = X.drop(columns=datetime_cols)

# Drop object or categorical columns (if any)
X = X.select_dtypes(include=['float64', 'int64'])


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Option 1: Keep 95% variance
pca = PCA(n_components=0.95)

# Option 2: Use fixed number of components
# pca = PCA(n_components=50)

X_pca = pca.fit_transform(X)


In [None]:
print(X_pca.shape)  # Should show reduced dimensionality


In [None]:
from sklearn.model_selection import train_test_split

X_train_pca, X_val_pca, y_train, y_val = train_test_split(X_pca, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_pca, y_train)


In [None]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = rf.predict(X_val_pca)
print(classification_report(y_val, y_pred))
print("Accuracy:", accuracy_score(y_val, y_pred))


In [None]:
# Scale and apply PCA on test data
X_test_scaled = scaler.transform(X_test_final)      # same scaler used for training
X_test_pca = pca.transform(X_test_scaled)           # same PCA model used


In [None]:
y_test_pred = rf.predict(X_test_pca)


In [None]:
submission = dftest_merged[['id1', 'id2', 'id3', 'id5']].copy()
submission['pred'] = y_test_pred


In [None]:
submission_df.to_csv('final_submission_amex.csv', index=False)

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 1. Train-test split (for evaluation purposes)
X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# 2. Define and train the model
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

xgb_model.fit(X_train, y_train)

# 3. Validation accuracy (optional)
y_val_pred = xgb_model.predict(X_val)
val_acc = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_acc:.4f}")



In [None]:
!pip install xgboost


In [None]:

# 4. Final prediction on test set
y_test_pred = xgb_model.predict(X_test_pca)

# 5. Assign to final submission
final_submission['pred'] = y_test_pred

# 6. Save for download
final_submission.to_csv("/mnt/data/final_submission_xgboost.csv", index=False)
print("✅ XGBoost predictions saved as 'final_submission_xgboost.csv'")