In [112]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import os

In [113]:
# requests
file_path = r"C:\Users\ElifYilmaz\Downloads\project 2.0.csv"

df = pd.read_csv(file_path)
df.columns = df.columns.str.lower()

# qualitative 

file2 = r"C:\Users\ElifYilmaz\Downloads\info client.csv"

df2 = pd.read_csv(file2)
df2.columns = df2.columns.str.lower()

#### Create Derived Features

In [114]:
# fixing appointment types

type_mapping = {
    'FU': 'FU',
    'FUVD': 'FU', 
    'REMINDER': 'FU',
    'VDFIELD': 'SC',
    'VD': 'SC'
}

# Create grouped_type column
df['grouped_type'] = df['type'].map(type_mapping)

In [115]:
# Create condition for net events (not cancelled and past)
df['is_net_event'] = df['deletedat'].isna() & df['is_passed']

In [116]:
# Gross counts (all events)
gross_counts = df.groupby(['requestid', 'grouped_type']).size().unstack(fill_value=0)
gross_counts.columns = [f'gross_{col}' for col in gross_counts.columns]

# Net counts (only events that actually happened)
net_counts = df[df['is_net_event']].groupby(['requestid', 'grouped_type']).size().unstack(fill_value=0)
net_counts.columns = [f'net_{col}' for col in net_counts.columns]

In [117]:
# Combine gross and net counts
counts_df = gross_counts.join(net_counts, how='outer').fillna(0)

In [118]:
# turn self ipa event to bbinary 
df2['selfipa_done'] = df2['selfipaimportedat'].notnull().astype(int)
df2.drop('selfipaimportedat', axis=1, inplace=True) 

In [119]:
# Step 1: Convert createdat to datetime
df['createdat'] = pd.to_datetime(df['createdat'])

# Step 2: Get first gross SC and first net FU timestamps
first_gross_sc = df[df['grouped_type'] == 'SC'].groupby('requestid')['createdat'].min()
first_net_fu = df[(df['grouped_type'] == 'FU') & df['is_net_event']].groupby('requestid')['createdat'].min()

# Step 3: Calculate time difference in hours
counts_df['time_first_sc_to_first_net_fu'] = (first_net_fu - first_gross_sc).dt.total_seconds() / 3600

In [120]:
# drop older requests that havent had a sc since march
counts_df = counts_df[counts_df['gross_SC'] != 0]
counts_df['gross_SC'].value_counts() 

gross_SC
1     15250
2      3013
3       700
4       198
5        48
6        12
7         6
10        2
23        1
8         1
Name: count, dtype: int64

In [121]:
# handle missing value for customers who never made to fu
counts_df['time_first_sc_to_first_net_fu'] = counts_df['time_first_sc_to_first_net_fu'].fillna(-1)

# Check final dataset
print("Final dataset info:")
print(f"Shape: {counts_df.shape}")
print(f"No missing values: {counts_df.isnull().sum().sum() == 0}")

Final dataset info:
Shape: (19231, 5)
No missing values: True


In [122]:
# problematic entries with negative time difference: 
# Separate real negative times from NaN fill values
real_negative = counts_df[
    (counts_df['time_first_sc_to_first_net_fu'] < 0) & 
    (counts_df['time_first_sc_to_first_net_fu'] != -1)
]

actual_valid_times = counts_df[counts_df['time_first_sc_to_first_net_fu'] >= 0]

print(f"Actual negative time differences (not NaN fills): {len(real_negative)}")
print(f"Positive time differences: {len(actual_valid_times)}")
print(f"NaN fills (-1): {(counts_df['time_first_sc_to_first_net_fu'] == -1).sum()}")

if len(real_negative) > 0:
    print(f"\nReal negative values range: {real_negative['time_first_sc_to_first_net_fu'].min():.2f} to {real_negative['time_first_sc_to_first_net_fu'].max():.2f} hours")


Actual negative time differences (not NaN fills): 22
Positive time differences: 7351
NaN fills (-1): 11858

Real negative values range: -4146.53 to -22.24 hours


In [123]:
#handle negative time difference
# Remove outliers entirely (set to NaN, then fill with -1)
counts_df.loc[counts_df['time_first_sc_to_first_net_fu'] < -1, 'time_first_sc_to_first_net_fu'] = -1

In [124]:
# Verify the cleanup
print("After removing outliers:")
print(f"Negative values (excluding -1 fills): {((counts_df['time_first_sc_to_first_net_fu'] < 0) & (counts_df['time_first_sc_to_first_net_fu'] != -1)).sum()}")
print(f"Valid positive time differences: {(counts_df['time_first_sc_to_first_net_fu'] > 0).sum()}")
print(f"NaN fills (-1): {(counts_df['time_first_sc_to_first_net_fu'] == -1).sum()}")

# Final dataset summary
print(f"\nFinal dataset shape: {counts_df.shape}")
print("Ready to join with target variable!")

After removing outliers:
Negative values (excluding -1 fills): 0
Valid positive time differences: 7351
NaN fills (-1): 11880

Final dataset shape: (19231, 5)
Ready to join with target variable!


In [125]:
# at this point requestid is the index. make it explicit


counts_df = counts_df.reset_index()
counts_df.head()


Unnamed: 0,requestid,gross_FU,gross_SC,net_FU,net_SC,time_first_sc_to_first_net_fu
0,2039,0,2,0.0,0.0,-1.0
1,3279,0,1,0.0,1.0,-1.0
2,3583,1,1,1.0,1.0,120.043056
3,3729,8,1,8.0,1.0,73.835556
4,3852,2,1,2.0,1.0,42.193889


In [126]:
# Left join to add target variable
#final_df1 = counts_df.merge(contract, on='requestid', how='left')
# unnecessairy join because df2 has netcontractsigned info 

final_df = counts_df.merge(df2, on='requestid', how='left')


# Fill non-matching requestids with 0 for the target
final_df['netcontractsigned'] = final_df['netcontractsigned'].fillna(0)

# Check the result
print(f"Final dataset shape: {final_df.shape}")
print(f"Target variable distribution:")
print(final_df['netcontractsigned'].value_counts())
print(f"\nColumns: {list(final_df.columns)}")

Final dataset shape: (19245, 14)
Target variable distribution:
netcontractsigned
0.0    18809
1.0      436
Name: count, dtype: int64

Columns: ['requestid', 'gross_FU', 'gross_SC', 'net_FU', 'net_SC', 'time_first_sc_to_first_net_fu', 'zipregion', 'evaluationtime', 'desiredinstallationend', 'electricitybill', 'heatingbill', 'mktgparamscore', 'netcontractsigned', 'selfipa_done']


#### Encoding and Null handling

In [127]:
# Majority of bills have -1, 1 and 0

# set wrong bill values to nan
# Replace specific values with NaN
final_df['electricitybill'] = final_df['electricitybill'].replace([0, 1, -1], pd.NA)
final_df['heatingbill'] = final_df['heatingbill'].replace([0, 1, -1], pd.NA)

Optional for bill handling 
def clean_billing_column(series, column_name):
    """Clean billing column: set negative values and extreme outliers to replacement value"""
    
    # Convert to numeric, coerce errors to NaN
    series_clean = pd.to_numeric(series, errors='coerce')
    
    # Define reasonable bounds (adjust these based on your domain knowledge)
    min_reasonable = 0  # Bills shouldn't be negative
    max_reasonable = 2000  # Adjust based on your market (e.g., €1000/month seems high)
    
    # Count issues for reporting
    negative_count = (series_clean < min_reasonable).sum()
    outlier_count = (series_clean > max_reasonable).sum()
    
    print(f"{column_name}:")
    print(f"  - Negative values: {negative_count}")
    print(f"  - Values > {max_reasonable}: {outlier_count}")
    print(f"  - Original NaN: {series.isna().sum()}")
    
    # Replace problematic values
    series_clean.loc[series_clean < min_reasonable] = np.nan  # or -1
    series_clean.loc[series_clean > max_reasonable] = np.nan  # or -1
    
    return series_clean

# Apply cleaning
final_df['heatingbill'] = clean_billing_column(final_df['heatingbill'], 'heatingbill')
final_df['electricitybill'] = clean_billing_column(final_df['electricitybill'], 'electricitybill')

In [128]:
# before encoding, mark the missingness inside the row;
missing_cols = ['zipregion', 'evaluationtime', 'desiredinstallationend', 'electricitybill', 'heatingbill', 'mktgparamscore']

for col in missing_cols:
    final_df[f'{col}_missing'] = final_df[col].isnull().astype(int)

# Check the predictive power of missing indicators
print("Missing indicator correlations with target:")
missing_indicators = [f'{col}_missing' for col in missing_cols]
missing_corrs = final_df[missing_indicators + ['netcontractsigned']].corr()['netcontractsigned'].drop('netcontractsigned')
print(missing_corrs.sort_values())

Missing indicator correlations with target:
electricitybill_missing          -0.047029
desiredinstallationend_missing   -0.010914
evaluationtime_missing           -0.010865
mktgparamscore_missing           -0.003229
zipregion_missing                 0.010433
heatingbill_missing               0.018988
Name: netcontractsigned, dtype: float64


#### Encoding

In [129]:
# encoding 
# desiredinstallationend
type_mapping = {
    # Original Italian values
    '3-4mesi': 'three_to_four_months', 
    '5+mesi': 'more_than_5_months',
    '1-2mesi': 'one_to_two_months',
    'Non lo so': 'dont_know',
    'short': np.nan,
    # Already mapped values (keep as-is)
    'dont_know': 'dont_know',
    'three_to_four_months': 'three_to_four_months',
    'one_to_two_months': 'one_to_two_months', 
    'more_than_5_months': 'more_than_5_months',
    # Handle string 'nan'
    'nan': np.nan
}

final_df['desiredinstallationend1'] = final_df['desiredinstallationend'].map(type_mapping)

In [130]:
# evaluationtime
type_mapping = {
    # Original Italian values
    '3-6 mesi': np.nan,
    '<3 mesi': np.nan,
    '>6 mesi': np.nan,
    # Already mapped English values
    'less_than_three_months': np.nan,
    'more_than_six_months': np.nan,
    # Other values that appear in your data
    'understand_need': 'understand_need',  # or map to np.nan if you don't want these
    'understand_purchase': 'understand_purchase',  # or map to np.nan if you don't want these
    'evaluation': 'evaluation',  # or map to np.nan if you don't want these
    'curious': 'curious',  # or map to np.nan if you don't want these
    # Handle string 'nan'
    'nan': np.nan
}

# Create grouped_type column
final_df['evaluationtime1'] = final_df['evaluationtime'].map(type_mapping)


In [131]:
# Replace the old columns directly
final_df['desiredinstallationend'] = final_df['desiredinstallationend1']
final_df['evaluationtime'] = final_df['evaluationtime1']

# Drop the temporary columns
final_df = final_df.drop(['desiredinstallationend1', 'evaluationtime1'], axis=1)

In [132]:
# handle nan before encoding
final_df['desiredinstallationend'] = final_df['desiredinstallationend'].fillna('missing')
final_df['evaluationtime'] = final_df['evaluationtime'].fillna('missing')


In [133]:
# Start with ordinal encoding for time-based features

# Ordinal encoding for desiredinstallationend (time-based order)
time_order = [ 'one_to_two_months', 'three_to_four_months', 'more_than_5_months', 'dont_know', 'missing']
final_df['desiredinstallationend_encoded'] = final_df['desiredinstallationend'].map({val: i for i, val in enumerate(time_order)})

##### Analysis to see how to handle marketing and region

In [134]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19245 entries, 0 to 19244
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   requestid                       19245 non-null  int64  
 1   gross_FU                        19245 non-null  int64  
 2   gross_SC                        19245 non-null  int64  
 3   net_FU                          19245 non-null  float64
 4   net_SC                          19245 non-null  float64
 5   time_first_sc_to_first_net_fu   19245 non-null  float64
 6   zipregion                       18637 non-null  object 
 7   evaluationtime                  19245 non-null  object 
 8   desiredinstallationend          19245 non-null  object 
 9   electricitybill                 12527 non-null  object 
 10  heatingbill                     4742 non-null   object 
 11  mktgparamscore                  18319 non-null  object 
 12  netcontractsigned               

In [135]:
# For each categorical column, see conversion rates
print("Conversion rates by category:\n")

for col in ['zipregion', 'mktgparamscore']:  # replace with your actual column names
    conversion_by_cat = final_df.groupby(col)['netcontractsigned'].agg(['count', 'sum', 'mean'])
    conversion_by_cat.columns = ['total_samples', 'conversions', 'conversion_rate']
    conversion_by_cat = conversion_by_cat.sort_values('conversion_rate', ascending=False)
    
    print(f"\n{col.upper()}:")
    print(conversion_by_cat)
    print(f"Overall variation: {conversion_by_cat['conversion_rate'].std():.4f}")

Conversion rates by category:


ZIPREGION:
                       total_samples  conversions  conversion_rate
zipregion                                                         
Friuli-Venezia Giulia           1068         64.0         0.059925
Piemonte                        1717         58.0         0.033780
Valle D'Aosta                     72          2.0         0.027778
Liguria                          469         13.0         0.027719
Emilia-Romagna                  1669         44.0         0.026363
Lombardia                       3083         81.0         0.026273
Veneto                          1466         37.0         0.025239
Toscana                         1293         29.0         0.022428
Basilicata                       148          3.0         0.020270
Marche                           392          7.0         0.017857
Umbria                           280          5.0         0.017857
Lazio                           1908         34.0         0.017820
Trentino-Alto Adige

Marketing gave good variance, would like to maintain that information. Region is not that significant but still good. 
However given the * of unique values, in both columns I will opt for grouping instead of each value having its column. 

Instead of 30+ categorical features, you get ~6-8, keeping the predictive power but losing the noise.

Downside; this grouping should occasionally double checked to see if it still makes sense

##### one hot encoding for marketing and region

In [136]:
# Define performance groups
marketing_groups = {
    'High': ['Referral', 'form_classico', 'Organic', 'Affiliation'],
    'Medium': ['Other', 'Google', 'Youtube', 'Mediago', 'TikTok'], 
    'Low': ['Outbrain', 'Meta', 'Taboola', 'd2d']
}

# Create mapping function
def group_marketing(value):
    for group, channels in marketing_groups.items():
        if value in channels:
            return group
    return 'Low'  # fallback

# Apply grouping
final_df['mktg_grouped'] = final_df['mktgparamscore'].apply(group_marketing)

In [137]:
# Group regions by performance
def group_regions(region):
    if region == 'Friuli-Venezia Giulia':
        return 'High_Performer'
    elif region in ['Piemonte', 'Lombardia', 'Emilia-Romagna', 'Veneto']:
        return 'Large_Good'
    elif region in ['Liguria', 'Toscana', 'Valle D\'Aosta']:
        return 'Medium'
    else:
        return 'Other'

final_df['region_grouped'] = final_df['zipregion'].apply(group_regions)

In [138]:
# One-hot encode the grouped categories
final_df = pd.get_dummies(final_df, columns=['mktg_grouped', 'region_grouped'], prefix=['mktg', 'region'])

In [139]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19245 entries, 0 to 19244
Data columns (total 28 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   requestid                       19245 non-null  int64  
 1   gross_FU                        19245 non-null  int64  
 2   gross_SC                        19245 non-null  int64  
 3   net_FU                          19245 non-null  float64
 4   net_SC                          19245 non-null  float64
 5   time_first_sc_to_first_net_fu   19245 non-null  float64
 6   zipregion                       18637 non-null  object 
 7   evaluationtime                  19245 non-null  object 
 8   desiredinstallationend          19245 non-null  object 
 9   electricitybill                 12527 non-null  object 
 10  heatingbill                     4742 non-null   object 
 11  mktgparamscore                  18319 non-null  object 
 12  netcontractsigned               

In [140]:
# Drop original categorical columns and create final feature matrix
columns_to_drop = [ 'zipregion', 'evaluationtime', 'desiredinstallationend', 'mktgparamscore']
final_df = final_df.drop(columns=columns_to_drop)


print(f"\nNo missing values: {final_df.isnull().sum().sum() == 0}")


No missing values: False


#### Imputing

In [141]:
# Fix electricitybill missing values
final_df['electricitybill'] = final_df['electricitybill'].fillna(final_df['electricitybill'].median())
final_df['heatingbill'] = final_df['heatingbill'].fillna(final_df['heatingbill'].median())

# Fix desiredinstallationend_encoded - NaN means 'Unknown' which should be 0 --> check if needed
#X_final['desiredinstallationend_encoded'] = X_final['desiredinstallationend_encoded'].fillna(0)

# Verify all missing values are gone
print(f"Remaining missing values: {final_df.isnull().sum().sum()}")
print(f"Dataset ready: {final_df.isnull().sum().sum() == 0}")



Remaining missing values: 0
Dataset ready: True


  final_df['electricitybill'] = final_df['electricitybill'].fillna(final_df['electricitybill'].median())
  final_df['heatingbill'] = final_df['heatingbill'].fillna(final_df['heatingbill'].median())


##### Scaling

In [142]:
# scaling when needed
continuous_cols= [
    'time_first_sc_to_first_net_fu',
    'electricitybill', 
    'heatingbill'
]

# Scale only the continuous features
scaler = StandardScaler()
final_df[continuous_cols] = scaler.fit_transform(final_df[continuous_cols])

Adding behavioral data from Booking Calls


In [143]:
behaviour = pd.read_csv(r"processed_data/df_model.csv")
behaviour.columns = behaviour.columns.str.lower()
merged_df = final_df.merge(behaviour, 
                          left_on='requestid', 
                          right_on='id', 
                          how='left')

In [144]:
# drop requestid
#merged_df = merged_df.drop(columns=['requestid', 'id', 'converted'])

#### Duplicate Check

In [145]:
# Check your actual dataset for duplicates
duplicate_cols = []
cols = merged_df.columns.tolist()

for i in range(len(cols)):
    for j in range(i+1, len(cols)):
        if merged_df.equals(merged_df[cols[j]]):
            duplicate_cols.append((cols[i], cols[j]))

print("Duplicate column pairs:", duplicate_cols)
print(f"Original shape: {merged_df.shape}")

# Remove duplicates if any found
X_final_clean = merged_df.loc[:, ~merged_df.columns.duplicated()]
print(f"After removing duplicates: {X_final_clean.shape}")

Duplicate column pairs: []
Original shape: (19245, 47)
After removing duplicates: (19245, 47)


In [146]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19245 entries, 0 to 19244
Data columns (total 47 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   requestid                         19245 non-null  int64  
 1   gross_FU                          19245 non-null  int64  
 2   gross_SC                          19245 non-null  int64  
 3   net_FU                            19245 non-null  float64
 4   net_SC                            19245 non-null  float64
 5   time_first_sc_to_first_net_fu     19245 non-null  float64
 6   electricitybill                   19245 non-null  float64
 7   heatingbill                       19245 non-null  float64
 8   netcontractsigned                 19245 non-null  float64
 9   selfipa_done                      19245 non-null  int64  
 10  zipregion_missing                 19245 non-null  int64  
 11  evaluationtime_missing            19245 non-null  int64  
 12  desi

In [147]:
#investigating missing data request ids
# Find rows with missing call data
missing_call_data = merged_df[merged_df['total_bc_attempts'].isnull()]

print(f"Number of rows with missing call data: {len(missing_call_data)}")

# Show the request IDs (assuming you have a request ID column)
if 'requestid' in merged_df.columns:
    print("\nRequest IDs with missing call data:")
    print(missing_call_data['requestid'].tolist())
else:
    print("\nColumns available to identify these rows:")
    print(merged_df.columns.tolist())
    
    # Show index positions as fallback
    print(f"\nIndex positions of missing rows: {missing_call_data.index.tolist()}")

Number of rows with missing call data: 1105

Request IDs with missing call data:
[2039, 3279, 3583, 3729, 3852, 4359, 4689, 5180, 5216, 5837, 6442, 6521, 6749, 6882, 7183, 7653, 7899, 8057, 9106, 9971, 10157, 10622, 10630, 11034, 11446, 12271, 12548, 12741, 13171, 13514, 13648, 13973, 14578, 14807, 14966, 15106, 15626, 15874, 16288, 16594, 16677, 17124, 17870, 17960, 18429, 18517, 18874, 19527, 19753, 20114, 20193, 20680, 20810, 21062, 21188, 21236, 21468, 21708, 22080, 22690, 22879, 23699, 24093, 24359, 25652, 26202, 26230, 27380, 27584, 27621, 27693, 27713, 27787, 27808, 27830, 28210, 30068, 30881, 30941, 31609, 31791, 31932, 32097, 32714, 32990, 33039, 33130, 33550, 33583, 33793, 34334, 34500, 35640, 35995, 37294, 37319, 38479, 39326, 39500, 40136, 40444, 40758, 40958, 41170, 41292, 41364, 41669, 41729, 42087, 42235, 42346, 42708, 42844, 42989, 43202, 43621, 44179, 44795, 45017, 45483, 45612, 46194, 47009, 47240, 47945, 48369, 48407, 48463, 48607, 49417, 49792, 49837, 49844, 50451, 

In [149]:
# Check conversions in missing call data rows
missing_mask = merged_df['total_bc_attempts'].isnull()
print(f"Conversions in missing rows: {merged_df[missing_mask]['netcontractsigned'].sum()}")
print(f"Conversion rate in missing rows: {merged_df[missing_mask]['netcontractsigned'].mean():.4f}")
print(f"Conversion rate in complete rows: {merged_df[~missing_mask]['netcontractsigned'].mean():.4f}")

Conversions in missing rows: 10.0
Conversion rate in missing rows: 0.0090
Conversion rate in complete rows: 0.0235


In [None]:
# some of these requests are very old and dont have propre registstration, some have some registration but essentially there are only 10 contract signed insode t1100, so essentially dropping as a first step of undersampling,

In [150]:
merged_df = merged_df.dropna().copy()
print(f"New dataset shape: {merged_df.shape}")
print(f"New conversion rate: {merged_df['netcontractsigned'].mean():.4f}")
print(f"Total conversions kept: {merged_df['netcontractsigned'].sum()}")

New dataset shape: (18140, 47)
New conversion rate: 0.0235
Total conversions kept: 426.0


In [151]:
# Create subfolder if it doesn't exist
os.makedirs('processed_data', exist_ok=True)

# Save your merged dataframe
#merged_df.to_csv('processed_data/merged_df.csv', index=False)

print("Dataset saved successfully!")

Dataset saved successfully!
