# Data Cleaning and Campaign Performance Preparation

This notebook processes Facebook and Instagram campaign datasets to prepare them for analytics tasks such as CTR, ROI, and engagement insights. It includes:
- Structural validation
- Data quality checks
- Correction of misaligned rows
- Forward-filling of missing campaign identifiers
- Export of a cleaned dataset ready for Power BI 



  Data Cleaning Notebook
This notebook loads, inspects, and cleans the uploaded `Raw_Data.csv`of Facebook Ad Campaign

In [1]:
# import library 
import pandas as pd



In [2]:
file_path = 'Data/Raw_Data.csv'

In [5]:
# load the raw_data for checking purpos
df= pd.read_csv('Data/Raw_Data.csv')

look at the data

In [6]:
df.shape


(1143, 15)

In [7]:

df.head()

Unnamed: 0,ad_id,reporting_start,reporting_end,campaign_id,fb_campaign_id,age,gender,interest1,interest2,interest3,impressions,clicks,spent,total_conversion,approved_conversion
0,708746,17/08/2017,17/08/2017,916,103916,30-34,M,15,17,17,7350.0,1,1.43,2.0,1.0
1,708749,17/08/2017,17/08/2017,916,103917,30-34,M,16,19,21,17861.0,2,1.82,2.0,0.0
2,708771,17/08/2017,17/08/2017,916,103920,30-34,M,20,25,22,693.0,0,0.0,1.0,0.0
3,708815,30/08/2017,30/08/2017,916,103928,30-34,M,28,32,32,4259.0,1,1.25,1.0,0.0
4,708818,17/08/2017,17/08/2017,916,103928,30-34,M,28,33,32,4133.0,1,1.29,1.0,1.0


In [8]:
df.dtypes

ad_id                    int64
reporting_start         object
reporting_end           object
campaign_id             object
fb_campaign_id          object
age                     object
gender                  object
interest1                int64
interest2                int64
interest3                int64
impressions            float64
clicks                   int64
spent                  float64
total_conversion       float64
approved_conversion    float64
dtype: object

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ad_id                1143 non-null   int64  
 1   reporting_start      1143 non-null   object 
 2   reporting_end        1143 non-null   object 
 3   campaign_id          1143 non-null   object 
 4   fb_campaign_id       1143 non-null   object 
 5   age                  1143 non-null   object 
 6   gender               1143 non-null   object 
 7   interest1            1143 non-null   int64  
 8   interest2            1143 non-null   int64  
 9   interest3            1143 non-null   int64  
 10  impressions          1143 non-null   float64
 11  clicks               1143 non-null   int64  
 12  spent                1143 non-null   float64
 13  total_conversion     761 non-null    float64
 14  approved_conversion  761 non-null    float64
dtypes: float64(4), int64(5), object(6)
mem

In [10]:
# Count missing values per column

df.isna().sum()

ad_id                    0
reporting_start          0
reporting_end            0
campaign_id              0
fb_campaign_id           0
age                      0
gender                   0
interest1                0
interest2                0
interest3                0
impressions              0
clicks                   0
spent                    0
total_conversion       382
approved_conversion    382
dtype: int64

In [11]:
# Fill missing conversion values with 0
df["total_conversion"] = df["total_conversion"].fillna(0)
df["approved_conversion"] = df["approved_conversion"].fillna(0)

In [12]:
# convert date columns to datetime format

df['reporting_start'] = pd.to_datetime(df['reporting_start'], dayfirst=True, errors='coerce')
df['reporting_end']   = pd.to_datetime(df['reporting_end'], dayfirst=True, errors='coerce')

In [13]:
df.dtypes

ad_id                           int64
reporting_start        datetime64[ns]
reporting_end          datetime64[ns]
campaign_id                    object
fb_campaign_id                 object
age                            object
gender                         object
interest1                       int64
interest2                       int64
interest3                       int64
impressions                   float64
clicks                          int64
spent                         float64
total_conversion              float64
approved_conversion           float64
dtype: object

In [14]:
df.sample(5)


Unnamed: 0,ad_id,reporting_start,reporting_end,campaign_id,fb_campaign_id,age,gender,interest1,interest2,interest3,impressions,clicks,spent,total_conversion,approved_conversion
1047,1122316,2017-08-26,2017-08-26,45-49,F,65,67,67,346688,88,114.86,2,0.0,0.0,0.0
156,747223,2017-08-18,2017-08-18,936,110838,30-34,M,16,18,18,8032.0,1,0.6,2.0,0.0
262,776840,2017-08-22,2017-08-22,936,115571,35-39,M,36,38,40,2797.0,1,1.29,1.0,0.0
285,777627,2017-08-23,2017-08-23,936,115715,45-49,M,16,17,19,157534.0,33,56.190001,2.0,0.0
969,1122112,2017-08-17,2017-08-17,40-44,F,27,31,33,1083259,276,390.259999,11,0.0,0.0,0.0


In [15]:
df['campaign_id'].unique()

array(['916', '936', '1178', '45-49', '30-34', '35-39', '40-44'],
      dtype=object)

In [16]:
df['age'].unique()

array(['30-34', '35-39', '40-44', '45-49', '10', '15', '16', '18', '19',
       '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '31',
       '32', '36', '63', '64', '65', '2', '66', '30', '7', '100', '101',
       '102', '103', '105', '107', '110', '111', '112', '113', '108',
       '109', '114', '104', '106'], dtype=object)

In [17]:
df['gender'].unique()

array(['M', 'F', '14', '21', '19', '17', '20', '22', '24', '25', '23',
       '26', '27', '28', '29', '30', '33', '31', '32', '34', '35', '36',
       '37', '38', '68', '64', '65', '69', '67', '5', '71', '13', '18',
       '66', '8', '6', '10', '72', '15', '16', '70', '4', '9', '12', '41',
       '11', '106', '104', '107', '108', '112', '117', '116', '105',
       '110', '113', '114', '109', '115', '102', '103', '111', '118'],
      dtype=object)

 Detect Corrupted Rows

In [18]:
# to find the exact row
df.loc[750:770]

Unnamed: 0,ad_id,reporting_start,reporting_end,campaign_id,fb_campaign_id,age,gender,interest1,interest2,interest3,impressions,clicks,spent,total_conversion,approved_conversion
750,1121568,2017-08-20,2017-08-20,1178,144618,40-44,M,65,70,70,188440.0,40,60.73,2.0,1.0
751,1121571,2017-08-20,2017-08-20,1178,144619,40-44,M,2,3,5,212496.0,44,74.830001,2.0,1.0
752,1121572,2017-08-20,2017-08-20,1178,144619,40-44,M,2,6,3,32574.0,5,7.48,1.0,0.0
753,1121575,2017-08-20,2017-08-20,1178,144619,40-44,M,2,4,5,128595.0,23,36.480001,1.0,1.0
754,1121577,2017-08-20,2017-08-20,1178,144620,40-44,M,7,9,11,242234.0,48,68.060001,2.0,0.0
755,1121584,2017-08-20,2017-08-20,1178,144621,40-44,M,66,67,67,33154.0,5,7.88,1.0,1.0
756,1121585,2017-08-20,2017-08-20,1178,144621,40-44,M,66,72,68,9773.0,1,1.46,1.0,0.0
757,1121589,2017-08-20,2017-08-20,1178,144622,45-49,M,10,16,11,464036.0,77,123.55,3.0,1.0
758,1121590,2017-08-20,2017-08-20,1178,144622,45-49,M,10,16,15,478480.0,75,135.750001,3.0,1.0
759,1121592,2017-08-20,2017-08-20,1178,144622,45-49,M,10,14,11,428812.0,66,116.88,4.0,2.0


In [19]:
# Fix rows from 761 to end
rows_to_fix = df.iloc[761:].copy()

fixed = rows_to_fix.copy()

# SHIFT RIGHT BY 2 COLUMNS â€” starting at column index 3
fixed.iloc[:, 5:] = rows_to_fix.iloc[:, 3:-2].values   # correct shift
fixed.iloc[:, 3] = None
fixed.iloc[:, 4] = None

# Rebuild final dataframe
df_fixed = pd.concat([df.iloc[:761], fixed], ignore_index=True)


  fixed.iloc[:, 5:] = rows_to_fix.iloc[:, 3:-2].values   # correct shift
  fixed.iloc[:, 5:] = rows_to_fix.iloc[:, 3:-2].values   # correct shift


In [20]:
df_fixed.loc[755:770]

Unnamed: 0,ad_id,reporting_start,reporting_end,campaign_id,fb_campaign_id,age,gender,interest1,interest2,interest3,impressions,clicks,spent,total_conversion,approved_conversion
755,1121584,2017-08-20,2017-08-20,1178.0,144621.0,40-44,M,66,67,67,33154.0,5,7.88,1.0,1.0
756,1121585,2017-08-20,2017-08-20,1178.0,144621.0,40-44,M,66,72,68,9773.0,1,1.46,1.0,0.0
757,1121589,2017-08-20,2017-08-20,1178.0,144622.0,45-49,M,10,16,11,464036.0,77,123.55,3.0,1.0
758,1121590,2017-08-20,2017-08-20,1178.0,144622.0,45-49,M,10,16,15,478480.0,75,135.750001,3.0,1.0
759,1121592,2017-08-20,2017-08-20,1178.0,144622.0,45-49,M,10,14,11,428812.0,66,116.88,4.0,2.0
760,1121593,2017-08-26,2017-08-26,1178.0,144622.0,45-49,M,10,16,16,1177535.0,221,365.660001,15.0,3.0
761,1121594,2017-08-26,2017-08-26,,,45-49,M,10,14,14,426500.0,72,128.279999,4.0,1.0
762,1121597,2017-08-30,2017-08-30,,,45-49,M,15,21,19,54237.0,7,10.78,2.0,1.0
763,1121598,2017-08-30,2017-08-30,,,45-49,M,15,19,18,506916.0,89,133.699999,2.0,2.0
764,1121599,2017-08-30,2017-08-30,,,45-49,M,15,17,18,250960.0,42,64.88,2.0,0.0


In [21]:
df_fixed['campaign_id'] = (
    df_fixed['campaign_id']
    .ffill()
    .fillna("MISSING")
)

df_fixed['fb_campaign_id'] = (
    df_fixed['fb_campaign_id']
    .ffill()
    .fillna("MISSING")
)


In [None]:
# to verify the fix
df_fixed.loc[755:770]

Unnamed: 0,ad_id,reporting_start,reporting_end,campaign_id,fb_campaign_id,age,gender,interest1,interest2,interest3,impressions,clicks,spent,total_conversion,approved_conversion
755,1121584,2017-08-20,2017-08-20,1178,144621,40-44,M,66,67,67,33154.0,5,7.88,1.0,1.0
756,1121585,2017-08-20,2017-08-20,1178,144621,40-44,M,66,72,68,9773.0,1,1.46,1.0,0.0
757,1121589,2017-08-20,2017-08-20,1178,144622,45-49,M,10,16,11,464036.0,77,123.55,3.0,1.0
758,1121590,2017-08-20,2017-08-20,1178,144622,45-49,M,10,16,15,478480.0,75,135.750001,3.0,1.0
759,1121592,2017-08-20,2017-08-20,1178,144622,45-49,M,10,14,11,428812.0,66,116.88,4.0,2.0
760,1121593,2017-08-26,2017-08-26,1178,144622,45-49,M,10,16,16,1177535.0,221,365.660001,15.0,3.0
761,1121594,2017-08-26,2017-08-26,1178,144622,45-49,M,10,14,14,426500.0,72,128.279999,4.0,1.0
762,1121597,2017-08-30,2017-08-30,1178,144622,45-49,M,15,21,19,54237.0,7,10.78,2.0,1.0
763,1121598,2017-08-30,2017-08-30,1178,144622,45-49,M,15,19,18,506916.0,89,133.699999,2.0,2.0
764,1121599,2017-08-30,2017-08-30,1178,144622,45-49,M,15,17,18,250960.0,42,64.88,2.0,0.0


In [23]:
# Export cleaned file
df_fixed.to_csv("facebook_ads_cleaned.csv", index=False)