# Load

In [1]:
import pandas as pd

# File path to your dataset
file_path = '/Users/manuel/Documents/GitHub/JeanPierreWeill/Data /MetaAdsOld/Adset_Data_Expanded_and_Cleaned.csv'

# Attempt to load the file with flexible parsing
try:
    data = pd.read_csv(file_path, encoding='latin1', on_bad_lines='skip')
except UnicodeDecodeError:
    data = pd.read_csv(file_path, encoding='utf-8', on_bad_lines='skip')

# Inspect the first few rows
print("Dataset loaded with flexible parsing. Preview:")
print(data.head())





Dataset loaded with flexible parsing. Preview:
   Unnamed: 0                 id                               name  \
0           0  23849302854290177       Auto - TOF - US + UK  #Stack   
1           0  23849302854290177       Auto - TOF - US + UK  #Stack   
2           1  23849302854200177  Auto - TOF - US + UK  #Stack - ON   
3           1  23849302854200177  Auto - TOF - US + UK  #Stack - ON   
4           2  23849302775990177          Feeds - TOF - US - #Stack   

         campaign_id  status    optimization_goal billing_event  bid_amount  \
0  23849302854210177  ACTIVE  OFFSITE_CONVERSIONS   IMPRESSIONS     10000.0   
1  23849302854210177  ACTIVE  OFFSITE_CONVERSIONS   IMPRESSIONS     10000.0   
2  23849302854210177  ACTIVE  OFFSITE_CONVERSIONS   IMPRESSIONS     10000.0   
3  23849302854210177  ACTIVE  OFFSITE_CONVERSIONS   IMPRESSIONS     10000.0   
4  23849302775960177  ACTIVE  OFFSITE_CONVERSIONS   IMPRESSIONS         NaN   

   budget_remaining                start_time  ... 

In [2]:
# Display basic info about the dataset
print("\nDataset information:")
data.info()


Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 40 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Unnamed: 0                          215 non-null    int64  
 1   id                                  215 non-null    int64  
 2   name                                215 non-null    object 
 3   campaign_id                         215 non-null    int64  
 4   status                              215 non-null    object 
 5   optimization_goal                   215 non-null    object 
 6   billing_event                       215 non-null    object 
 7   bid_amount                          12 non-null     float64
 8   budget_remaining                    215 non-null    int64  
 9   start_time                          215 non-null    object 
 10  countries                           215 non-null    object 
 11  age_min                

In [3]:
# Check for missing values
print("\nMissing values:")
print(data.isnull().sum())


Missing values:
Unnamed: 0                              0
id                                      0
name                                    0
campaign_id                             0
status                                  0
optimization_goal                       0
billing_event                           0
bid_amount                            203
budget_remaining                        0
start_time                              0
countries                               0
age_min                                 0
age_max                                 0
genders                                 0
custom_audiences                        0
excluded_custom_audiences               0
daily_budget                           53
lifetime_budget                        53
targeting_clean                         0
attribution_spec_clean                  0
age_max.1                               2
age_min.1                               2
excluded_product_audience_specs       179
flexible_spec    

In [4]:
# Drop columns with excessive missing values (e.g., more than 50%)
threshold = 0.5 * len(data)
data = data.dropna(axis=1, thresh=threshold)
print("\nRemaining columns after dropping those with excessive missing values:")
print(data.columns)


Remaining columns after dropping those with excessive missing values:
Index(['Unnamed: 0', 'id', 'name', 'campaign_id', 'status',
       'optimization_goal', 'billing_event', 'budget_remaining', 'start_time',
       'countries', 'age_min', 'age_max', 'genders', 'custom_audiences',
       'excluded_custom_audiences', 'daily_budget', 'lifetime_budget',
       'targeting_clean', 'attribution_spec_clean', 'age_max.1', 'age_min.1',
       'geo_locations', 'targeting_optimization', 'publisher_platforms',
       'facebook_positions', 'instagram_positions', 'locales',
       'targeting_relaxation_types', 'event_type', 'window_days'],
      dtype='object')


In [5]:
# Remove non-printable characters from column names and data
data.columns = [col.encode('ascii', 'ignore').decode('ascii').strip() for col in data.columns]
data = data.applymap(lambda x: x.encode('ascii', 'ignore').decode('ascii').strip() if isinstance(x, str) else x)


  data = data.applymap(lambda x: x.encode('ascii', 'ignore').decode('ascii').strip() if isinstance(x, str) else x)


In [5]:
# Display summary statistics for numeric columns
print("\nSummary statistics:")
print(data.describe())


Summary statistics:
       Unnamed: 0            id   campaign_id  budget_remaining     age_min  \
count  215.000000  2.150000e+02  2.150000e+02        215.000000  215.000000   
mean    58.190698  2.384853e+16  2.384848e+16        702.679070   32.925581   
std     32.621246  7.363295e+11  7.483391e+11       1145.191281    8.426374   
min      0.000000  2.384702e+16  2.384702e+16          0.000000   18.000000   
25%     31.000000  2.384824e+16  2.384824e+16        110.000000   25.000000   
50%     59.000000  2.384887e+16  2.384887e+16        110.000000   30.000000   
75%     86.000000  2.384911e+16  2.384910e+16        599.500000   40.000000   
max    113.000000  2.384930e+16  2.384930e+16       4500.000000   45.000000   

       age_max  daily_budget  lifetime_budget  age_max.1   age_min.1  \
count    215.0    162.000000            162.0      213.0  213.000000   
mean      65.0    932.567901              0.0       65.0   32.859155   
std        0.0   1235.907924              0.0      

In [6]:
# Identify and remove duplicate rows
print("\nNumber of duplicate rows before removal:", data.duplicated().sum())
data = data.drop_duplicates()
print("Number of duplicate rows after removal:", data.duplicated().sum())



Number of duplicate rows before removal: 0
Number of duplicate rows after removal: 0


In [12]:
import os

# Correct the directory path (with space)
output_dir = '/Users/manuel/Documents/GitHub/JeanPierreWeill/Data /MetaAdsOld'

# Ensure the directory exists
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Define the file path with the file name
cleaned_file_path = os.path.join(output_dir, 'cleaned_data.csv')

# Save the cleaned dataset
data.to_csv(cleaned_file_path, index=False)
print(f"\nCleaned dataset saved to: {cleaned_file_path}")

# Display the first few rows of the cleaned dataset
print("\nCleaned dataset preview:")
print(data.head())



Cleaned dataset saved to: /Users/manuel/Documents/GitHub/JeanPierreWeill/Data /MetaAdsOld/cleaned_data.csv

Cleaned dataset preview:
   Unnamed: 0                 id                               name  \
0           0  23849302854290177       Auto - TOF - US + UK  #Stack   
1           0  23849302854290177       Auto - TOF - US + UK  #Stack   
2           1  23849302854200177  Auto - TOF - US + UK  #Stack - ON   
3           1  23849302854200177  Auto - TOF - US + UK  #Stack - ON   
4           2  23849302775990177          Feeds - TOF - US - #Stack   

         campaign_id  status    optimization_goal billing_event  \
0  23849302854210177  ACTIVE  OFFSITE_CONVERSIONS   IMPRESSIONS   
1  23849302854210177  ACTIVE  OFFSITE_CONVERSIONS   IMPRESSIONS   
2  23849302854210177  ACTIVE  OFFSITE_CONVERSIONS   IMPRESSIONS   
3  23849302854210177  ACTIVE  OFFSITE_CONVERSIONS   IMPRESSIONS   
4  23849302775960177  ACTIVE  OFFSITE_CONVERSIONS   IMPRESSIONS   

   budget_remaining                s