In [1]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# requests
file_path = r"C:\Users\ElifYilmaz\Downloads\project 2.0.csv"

df = pd.read_csv(file_path)
df.columns = df.columns.str.lower()

# qualitative 

file2 = r"C:\Users\ElifYilmaz\Downloads\info client.csv"

df2 = pd.read_csv(file2)
df2.columns = df2.columns.str.lower()

#### Create Derived Features

In [3]:
# fixing appointment types

type_mapping = {
    'FU': 'FU',
    'FUVD': 'FU', 
    'REMINDER': 'FU',
    'VDFIELD': 'SC',
    'VD': 'SC'
}

# Create grouped_type column
df['grouped_type'] = df['type'].map(type_mapping)

In [4]:
# Create condition for net events (not cancelled and past)
df['is_net_event'] = df['deletedat'].isna() & df['is_passed']

In [5]:
# Gross counts (all events)
gross_counts = df.groupby(['requestid', 'grouped_type']).size().unstack(fill_value=0)
gross_counts.columns = [f'gross_{col}' for col in gross_counts.columns]

# Net counts (only events that actually happened)
net_counts = df[df['is_net_event']].groupby(['requestid', 'grouped_type']).size().unstack(fill_value=0)
net_counts.columns = [f'net_{col}' for col in net_counts.columns]

In [6]:
# Combine gross and net counts
counts_df = gross_counts.join(net_counts, how='outer').fillna(0)

In [7]:
# turn self ipa event to bbinary 
df2['selfipa_done'] = df2['selfipaimportedat'].notnull().astype(int)
df2.drop('selfipaimportedat', axis=1, inplace=True) 

In [8]:
# Step 1: Convert createdat to datetime
df['createdat'] = pd.to_datetime(df['createdat'])

# Step 2: Get first gross SC and first net FU timestamps
first_gross_sc = df[df['grouped_type'] == 'SC'].groupby('requestid')['createdat'].min()
first_net_fu = df[(df['grouped_type'] == 'FU') & df['is_net_event']].groupby('requestid')['createdat'].min()

# Step 3: Calculate time difference in hours
counts_df['time_first_sc_to_first_net_fu'] = (first_net_fu - first_gross_sc).dt.total_seconds() / 3600

In [9]:
# drop older requests that havent had a sc since march
counts_df = counts_df[counts_df['gross_SC'] != 0]
counts_df['gross_SC'].value_counts() 

gross_SC
1     15250
2      3013
3       700
4       198
5        48
6        12
7         6
10        2
23        1
8         1
Name: count, dtype: int64

In [10]:
# handle missing value for customers who never made to fu
counts_df['time_first_sc_to_first_net_fu'] = counts_df['time_first_sc_to_first_net_fu'].fillna(-1)

# Check final dataset
print("Final dataset info:")
print(f"Shape: {counts_df.shape}")
print(f"No missing values: {counts_df.isnull().sum().sum() == 0}")

Final dataset info:
Shape: (19231, 5)
No missing values: True


In [11]:
# problematic entries with negative time difference: 
# Separate real negative times from NaN fill values
real_negative = counts_df[
    (counts_df['time_first_sc_to_first_net_fu'] < 0) & 
    (counts_df['time_first_sc_to_first_net_fu'] != -1)
]

actual_valid_times = counts_df[counts_df['time_first_sc_to_first_net_fu'] >= 0]

print(f"Actual negative time differences (not NaN fills): {len(real_negative)}")
print(f"Positive time differences: {len(actual_valid_times)}")
print(f"NaN fills (-1): {(counts_df['time_first_sc_to_first_net_fu'] == -1).sum()}")

if len(real_negative) > 0:
    print(f"\nReal negative values range: {real_negative['time_first_sc_to_first_net_fu'].min():.2f} to {real_negative['time_first_sc_to_first_net_fu'].max():.2f} hours")


Actual negative time differences (not NaN fills): 22
Positive time differences: 7351
NaN fills (-1): 11858

Real negative values range: -4146.53 to -22.24 hours


In [12]:
#handle negative time difference
# Remove outliers entirely (set to NaN, then fill with -1)
counts_df.loc[counts_df['time_first_sc_to_first_net_fu'] < -1, 'time_first_sc_to_first_net_fu'] = -1

In [13]:
# Verify the cleanup
print("After removing outliers:")
print(f"Negative values (excluding -1 fills): {((counts_df['time_first_sc_to_first_net_fu'] < 0) & (counts_df['time_first_sc_to_first_net_fu'] != -1)).sum()}")
print(f"Valid positive time differences: {(counts_df['time_first_sc_to_first_net_fu'] > 0).sum()}")
print(f"NaN fills (-1): {(counts_df['time_first_sc_to_first_net_fu'] == -1).sum()}")

# Final dataset summary
print(f"\nFinal dataset shape: {counts_df.shape}")
print("Ready to join with target variable!")

After removing outliers:
Negative values (excluding -1 fills): 0
Valid positive time differences: 7351
NaN fills (-1): 11880

Final dataset shape: (19231, 5)
Ready to join with target variable!


In [14]:
# at this point requestid is the index. make it explicit


counts_df = counts_df.reset_index()
counts_df.head()


Unnamed: 0,requestid,gross_FU,gross_SC,net_FU,net_SC,time_first_sc_to_first_net_fu
0,2039,0,2,0.0,0.0,-1.0
1,3279,0,1,0.0,1.0,-1.0
2,3583,1,1,1.0,1.0,120.043056
3,3729,8,1,8.0,1.0,73.835556
4,3852,2,1,2.0,1.0,42.193889


In [15]:
# Left join to add target variable
#final_df1 = counts_df.merge(contract, on='requestid', how='left')
# unnecessairy join because df2 has netcontractsigned info 

final_df = counts_df.merge(df2, on='requestid', how='left')


# Fill non-matching requestids with 0 for the target
final_df['netcontractsigned'] = final_df['netcontractsigned'].fillna(0)

# Check the result
print(f"Final dataset shape: {final_df.shape}")
print(f"Target variable distribution:")
print(final_df['netcontractsigned'].value_counts())
print(f"\nColumns: {list(final_df.columns)}")

Final dataset shape: (19245, 14)
Target variable distribution:
netcontractsigned
0.0    18809
1.0      436
Name: count, dtype: int64

Columns: ['requestid', 'gross_FU', 'gross_SC', 'net_FU', 'net_SC', 'time_first_sc_to_first_net_fu', 'zipregion', 'evaluationtime', 'desiredinstallationend', 'electricitybill', 'heatingbill', 'mktgparamscore', 'netcontractsigned', 'selfipa_done']


#### Encoding and Null handling

In [16]:
# Majority of bills have -1, 1 and 0

# set wrong bill values to nan
# Replace specific values with NaN
final_df['electricitybill'] = final_df['electricitybill'].replace([0, 1, -1], pd.NA)
final_df['heatingbill'] = final_df['heatingbill'].replace([0, 1, -1], pd.NA)

Optional for bill handling 
def clean_billing_column(series, column_name):
    """Clean billing column: set negative values and extreme outliers to replacement value"""
    
    # Convert to numeric, coerce errors to NaN
    series_clean = pd.to_numeric(series, errors='coerce')
    
    # Define reasonable bounds (adjust these based on your domain knowledge)
    min_reasonable = 0  # Bills shouldn't be negative
    max_reasonable = 2000  # Adjust based on your market (e.g., €1000/month seems high)
    
    # Count issues for reporting
    negative_count = (series_clean < min_reasonable).sum()
    outlier_count = (series_clean > max_reasonable).sum()
    
    print(f"{column_name}:")
    print(f"  - Negative values: {negative_count}")
    print(f"  - Values > {max_reasonable}: {outlier_count}")
    print(f"  - Original NaN: {series.isna().sum()}")
    
    # Replace problematic values
    series_clean.loc[series_clean < min_reasonable] = np.nan  # or -1
    series_clean.loc[series_clean > max_reasonable] = np.nan  # or -1
    
    return series_clean

# Apply cleaning
final_df['heatingbill'] = clean_billing_column(final_df['heatingbill'], 'heatingbill')
final_df['electricitybill'] = clean_billing_column(final_df['electricitybill'], 'electricitybill')

In [17]:
# before encoding, mark the missingness inside the row;
missing_cols = ['zipregion', 'evaluationtime', 'desiredinstallationend', 'electricitybill', 'heatingbill', 'mktgparamscore']

for col in missing_cols:
    final_df[f'{col}_missing'] = final_df[col].isnull().astype(int)

# Check the predictive power of missing indicators
print("Missing indicator correlations with target:")
missing_indicators = [f'{col}_missing' for col in missing_cols]
missing_corrs = final_df[missing_indicators + ['netcontractsigned']].corr()['netcontractsigned'].drop('netcontractsigned')
print(missing_corrs.sort_values())

Missing indicator correlations with target:
electricitybill_missing          -0.047029
desiredinstallationend_missing   -0.010914
evaluationtime_missing           -0.010865
mktgparamscore_missing           -0.003229
zipregion_missing                 0.010433
heatingbill_missing               0.018988
Name: netcontractsigned, dtype: float64


In [18]:
# encoding 
# desiredinstallationend
type_mapping = {
    # Original Italian values
    '3-4mesi': 'three_to_four_months', 
    '5+mesi': 'more_than_5_months',
    '1-2mesi': 'one_to_two_months',
    'Non lo so': 'dont_know',
    'short': np.nan,
    # Already mapped values (keep as-is)
    'dont_know': 'dont_know',
    'three_to_four_months': 'three_to_four_months',
    'one_to_two_months': 'one_to_two_months', 
    'more_than_5_months': 'more_than_5_months',
    # Handle string 'nan'
    'nan': np.nan
}

final_df['desiredinstallationend1'] = final_df['desiredinstallationend'].map(type_mapping)

In [19]:
# evaluationtime
type_mapping = {
    # Original Italian values
    '3-6 mesi': np.nan,
    '<3 mesi': np.nan,
    '>6 mesi': np.nan,
    # Already mapped English values
    'less_than_three_months': np.nan,
    'more_than_six_months': np.nan,
    # Other values that appear in your data
    'understand_need': 'understand_need',  # or map to np.nan if you don't want these
    'understand_purchase': 'understand_purchase',  # or map to np.nan if you don't want these
    'evaluation': 'evaluation',  # or map to np.nan if you don't want these
    'curious': 'curious',  # or map to np.nan if you don't want these
    # Handle string 'nan'
    'nan': np.nan
}

# Create grouped_type column
final_df['evaluationtime1'] = final_df['evaluationtime'].map(type_mapping)


In [20]:
# Replace the old columns directly
final_df['desiredinstallationend'] = final_df['desiredinstallationend1']
final_df['evaluationtime'] = final_df['evaluationtime1']

# Drop the temporary columns
final_df = final_df.drop(['desiredinstallationend1', 'evaluationtime1'], axis=1)

In [21]:
# handle nan before encoding
final_df['desiredinstallationend'] = final_df['desiredinstallationend'].fillna('missing')
final_df['evaluationtime'] = final_df['evaluationtime'].fillna('missing')


In [22]:
# Start with ordinal encoding for time-based features

# Ordinal encoding for desiredinstallationend (time-based order)
time_order = [ 'one_to_two_months', 'three_to_four_months', 'more_than_5_months', 'dont_know', 'missing']
final_df['desiredinstallationend_encoded'] = final_df['desiredinstallationend'].map({val: i for i, val in enumerate(time_order)})

In [23]:
# One-hot encoding for marketing channels (mktgparamscore)
marketing_dummies = pd.get_dummies(final_df['mktgparamscore'], prefix='mktg')
final_df = pd.concat([final_df, marketing_dummies], axis=1)

# One-hot encoding for regions (zipregion) 
region_dummies = pd.get_dummies(final_df['zipregion'], prefix='region')
final_df = pd.concat([final_df, region_dummies], axis=1)

# Label encoding for evaluationtime (mixed ordinal/categorical)
le = LabelEncoder()
final_df['evaluationtime_encoded'] = le.fit_transform(final_df['evaluationtime'])

print(f"Dataset shape after encoding: {final_df.shape}")
print(f"New dummy columns created: {len(marketing_dummies.columns) + len(region_dummies.columns)}")

Dataset shape after encoding: (19245, 55)
New dummy columns created: 33


In [24]:
# Drop original categorical columns and create final feature matrix
columns_to_drop = [ 'zipregion', 'evaluationtime', 'desiredinstallationend', 'mktgparamscore', 'netcontractsigned']
final_df = final_df.drop(columns=columns_to_drop)


print(f"\nNo missing values: {final_df.isnull().sum().sum() == 0}")


No missing values: False


#### Imputing

In [25]:
# Fix electricitybill missing values
final_df['electricitybill'] = final_df['electricitybill'].fillna(final_df['electricitybill'].median())
final_df['heatingbill'] = final_df['heatingbill'].fillna(final_df['heatingbill'].median())

# Fix desiredinstallationend_encoded - NaN means 'Unknown' which should be 0 --> check if needed
#X_final['desiredinstallationend_encoded'] = X_final['desiredinstallationend_encoded'].fillna(0)

# Verify all missing values are gone
print(f"Remaining missing values: {final_df.isnull().sum().sum()}")
print(f"Dataset ready: {final_df.isnull().sum().sum() == 0}")



Remaining missing values: 0
Dataset ready: True


  final_df['electricitybill'] = final_df['electricitybill'].fillna(final_df['electricitybill'].median())
  final_df['heatingbill'] = final_df['heatingbill'].fillna(final_df['heatingbill'].median())


##### Scaling

In [26]:
# scaling when needed
continuous_cols= [
    'time_first_sc_to_first_net_fu',
    'electricitybill', 
    'heatingbill'
]

# Scale only the continuous features
scaler = StandardScaler()
final_df[continuous_cols] = scaler.fit_transform(final_df[continuous_cols])

Adding behavioral data from Booking Calls


In [27]:
behaviour = pd.read_csv(r"df_model.csv")
behaviour.columns = behaviour.columns.str.lower()
merged_df = final_df.merge(behaviour, 
                          left_on='requestid', 
                          right_on='id', 
                          how='left')

In [30]:
# drop requestid
merged_df = merged_df.drop(columns=['requestid', 'id'])

#### Duplicate Check

In [29]:
# Check your actual dataset for duplicates
duplicate_cols = []
cols = merged_df.columns.tolist()

for i in range(len(cols)):
    for j in range(i+1, len(cols)):
        if merged_df.equals(merged_df[cols[j]]):
            duplicate_cols.append((cols[i], cols[j]))

print("Duplicate column pairs:", duplicate_cols)
print(f"Original shape: {merged_df.shape}")

# Remove duplicates if any found
X_final_clean = merged_df.loc[:, ~merged_df.columns.duplicated()]
print(f"After removing duplicates: {X_final_clean.shape}")

Duplicate column pairs: []
Original shape: (19245, 73)
After removing duplicates: (19245, 73)
