In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Direct CSV download from Hugging Face
url = "https://huggingface.co/datasets/AHFIDAILabs/merged_vaccines/resolve/main/merged_vaccines_final.csv"

df = pd.read_csv(url, low_memory=False)

In [3]:
df = pd.read_csv("merged_vaccines_final.csv")

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [5]:
df.shape

(48095, 57)

In [6]:
df.columns

Index(['Tweet ID', 'Tweet Text', 'Type', 'Author Name', 'Author Username',
       'Creation Time', 'Reply Count', 'Retweet Count', 'Quote Count',
       'Like Count', 'View Count', 'Bookmark Count', 'Language',
       'Possibly Sensitive', 'Source', 'Hashtags', 'Tweet URL', 'Media Type',
       'Media URLs', 'External URLs', 'id', 'tweetText', 'tweetURL', 'type',
       'tweetAuthor', 'handle', 'geo', 'mentions', 'hashtags', 'replyCount',
       'quoteCount', 'retweetCount', 'likeCount', 'views', 'bookmarkCount',
       'createdAt', 'allMediaURL', 'videoURL', 'User ID', 'Username', 'Name',
       'Location', 'Category', 'Number of Followers', 'Number of Following',
       'Number of Tweets', 'Number of Media', 'Number of Likes',
       'Number of Public Lists', 'Is Verified', 'Is Protected', 'Can DM',
       'Can Tag in Media', 'Biography', 'User Homepage', 'Avatar URL',
       'Profile Banner URL'],
      dtype='object')

In [7]:
#making a duplicate of the dataset

In [8]:
df1 = df.copy()

In [9]:
df.dtypes

Tweet ID                   object
Tweet Text                 object
Type                       object
Author Name                object
Author Username            object
Creation Time              object
Reply Count               float64
Retweet Count             float64
Quote Count               float64
Like Count                float64
View Count                float64
Bookmark Count            float64
Language                   object
Possibly Sensitive         object
Source                     object
Hashtags                   object
Tweet URL                  object
Media Type                 object
Media URLs                 object
External URLs              object
id                         object
tweetText                  object
tweetURL                   object
type                       object
tweetAuthor                object
handle                     object
geo                        object
mentions                   object
hashtags                   object
replyCount    

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48095 entries, 0 to 48094
Data columns (total 57 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Tweet ID                28114 non-null  object 
 1   Tweet Text              44866 non-null  object 
 2   Type                    45814 non-null  object 
 3   Author Name             44198 non-null  object 
 4   Author Username         17964 non-null  object 
 5   Creation Time           17880 non-null  object 
 6   Reply Count             18152 non-null  float64
 7   Retweet Count           17444 non-null  float64
 8   Quote Count             17444 non-null  float64
 9   Like Count              17964 non-null  float64
 10  View Count              17444 non-null  float64
 11  Bookmark Count          17444 non-null  float64
 12  Language                17776 non-null  object 
 13  Possibly Sensitive      842 non-null    object 
 14  Source                  45814 non-null

In [11]:
df.isna().sum()

Tweet ID                  19981
Tweet Text                 3229
Type                       2281
Author Name                3897
Author Username           30131
Creation Time             30215
Reply Count               29943
Retweet Count             30651
Quote Count               30651
Like Count                30131
View Count                30651
Bookmark Count            30651
Language                  30319
Possibly Sensitive        47253
Source                     2281
Hashtags                  47740
Tweet URL                 20361
Media Type                47577
Media URLs                47577
External URLs             48039
id                        18542
tweetText                 46034
tweetURL                  46034
type                      45818
tweetAuthor               46034
handle                    46034
geo                       46397
mentions                  47490
hashtags                  46034
replyCount                46034
quoteCount                46034
retweetC

In [12]:
df.shape

(48095, 57)

In [13]:
# Drop columns with extremely high missing values
columns_to_drop = [
    'Number of Tweets', 'Number of Media', 'Number of Likes', 'Number of Public Lists',
    'Is Protected', 'Can Tag in Media', 'Category', 'Possibly Sensitive', 'Hashtags',
    'Media Type', 'Media URLs', 'External URLs', 'videoURL', 'allMediaURL',
    'User Homepage', 'mentions', 'Profile Banner URL', 'Avatar URL',
    'Can DM', 'id', 'tweetURL', 'createdAt', 'bookmarkCount', 'views', 'retweetCount',
    'likeCount', 'Quote Count', 'User ID', 'Tweet URL', 'Number of Following',
    'hashtags', 'replyCount', 'quoteCount', 'tweetText', 'Is Verified', 'Bookmark Count'
]

In [14]:
# Drop the columns
df_cl = df.drop(columns=columns_to_drop, errors='ignore')

# Check shape after dropping
print("Before:", df.shape)
print("After:", df_cl.shape)

# Verify columns removed
df_cl.info()

Before: (48095, 57)
After: (48095, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48095 entries, 0 to 48094
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Tweet ID             28114 non-null  object 
 1   Tweet Text           44866 non-null  object 
 2   Type                 45814 non-null  object 
 3   Author Name          44198 non-null  object 
 4   Author Username      17964 non-null  object 
 5   Creation Time        17880 non-null  object 
 6   Reply Count          18152 non-null  float64
 7   Retweet Count        17444 non-null  float64
 8   Like Count           17964 non-null  float64
 9   View Count           17444 non-null  float64
 10  Language             17776 non-null  object 
 11  Source               45814 non-null  object 
 12  type                 2277 non-null   object 
 13  tweetAuthor          2061 non-null   object 
 14  handle               2061 non-null   object 
 1

In [15]:
df_cl.columns

Index(['Tweet ID', 'Tweet Text', 'Type', 'Author Name', 'Author Username',
       'Creation Time', 'Reply Count', 'Retweet Count', 'Like Count',
       'View Count', 'Language', 'Source', 'type', 'tweetAuthor', 'handle',
       'geo', 'Username', 'Name', 'Location', 'Number of Followers',
       'Biography'],
      dtype='object')

In [None]:
Author Name, Author Username, Username, handle, Name, tweetAuthor

In [16]:
df_cl.head(2)

Unnamed: 0,Tweet ID,Tweet Text,Type,Author Name,Author Username,Creation Time,Reply Count,Retweet Count,Like Count,View Count,Language,Source,type,tweetAuthor,handle,geo,Username,Name,Location,Number of Followers,Biography
0,"=""1807821100576895401""",The HPV vaccine is a critical step in preventi...,Tweet,Vaccine Network for Disease Control,VaccineNet_NG,"7/1/2024, 5:58:32 PM",1.0,0.0,2.0,81.0,en,Twitter Web App,,,,,,,,,
1,"=""1807776904805580853""","New month, new experiences, new opportunities,...",Tweet,Vaccine Network for Disease Control,VaccineNet_NG,"7/1/2024, 3:02:55 PM",0.0,0.0,2.0,50.0,en,Twitter Web App,,,,,,,,,


In [17]:
df_cl.isnull().sum()

Tweet ID               19981
Tweet Text              3229
Type                    2281
Author Name             3897
Author Username        30131
Creation Time          30215
Reply Count            29943
Retweet Count          30651
Like Count             30131
View Count             30651
Language               30319
Source                  2281
type                   45818
tweetAuthor            46034
handle                 46034
geo                    46397
Username               30533
Name                   31053
Location               37261
Number of Followers    31053
Biography              33926
dtype: int64

In [18]:
df_cl.columns

Index(['Tweet ID', 'Tweet Text', 'Type', 'Author Name', 'Author Username',
       'Creation Time', 'Reply Count', 'Retweet Count', 'Like Count',
       'View Count', 'Language', 'Source', 'type', 'tweetAuthor', 'handle',
       'geo', 'Username', 'Name', 'Location', 'Number of Followers',
       'Biography'],
      dtype='object')

In [19]:
df_cl['Username'].isnull().sum()

np.int64(30533)

In [20]:
df_cl['Name'].isnull().sum()

np.int64(31053)

In [21]:
#df_cl['Name'].value_counts()

In [22]:
df_cl['Author Name'].isnull().sum()

np.int64(3897)

# filling the missing usernames with the traces and fall backs

In [24]:
import pandas as pd
import numpy as np

def consolidate_username(df): 
    # Define all possible username column names in priority order
    username_columns = [
        'Username',
        'Author Username', 
        'handle',
        'tweetAuthor',
        'Author Name',
        'Name'
    ]
    
    # Filter to only columns that actually exist in the dataframe
    existing_columns = [col for col in username_columns if col in df.columns]
    
    if not existing_columns:
        raise ValueError("No username columns found in the dataframe!")
    
    print(f"Found these username columns: {existing_columns}\n")
    
    # Create a copy to avoid modifying original
    df_copy = df.copy()
    
    # If 'Username' doesn't exist, create it
    if 'Username' not in df_copy.columns:
        df_copy['Username'] = None
        print("Created new 'Username' column\n")
    
    # Convert all username columns to string and handle empty values
    for col in existing_columns:
        df_copy[col] = df_copy[col].astype(str)
        # Replace various empty representations with actual NaN
        df_copy[col] = df_copy[col].replace(['', 'None', 'nan', 'NaN', 'null', 'NA'], np.nan)
        # Strip whitespace
        df_copy[col] = df_copy[col].str.strip()
        # Replace empty strings after stripping
        df_copy[col] = df_copy[col].replace('', np.nan)
    
    # Count before consolidation
    before_empty = df_copy['Username'].isna().sum()
    print(f"Empty 'Username' cells BEFORE consolidation: {before_empty}")

    for idx, row in df_copy.iterrows():
        # Check if Username is empty for this row
        if pd.isna(row['Username']) or row['Username'] == '':
            # Try each fallback column in order until we find a non-empty value
            for col in ['Author Username', 'handle', 'tweetAuthor', 'Author Name', 'Name']:
                if col in df_copy.columns:
                    value = row[col]
                    # If this column has a value for this row, use it
                    if pd.notna(value) and value != '':
                        df_copy.at[idx, 'Username'] = value
                        break  # Stop once we found a value for this row
    
    # Count after consolidation
    after_empty = df_copy['Username'].isna().sum()
    filled_count = before_empty - after_empty
    
    print(f"Filled {filled_count} empty cells using fallback columns")
    print(f"Empty 'Username' cells AFTER consolidation: {after_empty}\n")
    
    # Drop the other username columns (keep only 'Username')
    columns_to_drop = [col for col in existing_columns if col != 'Username']
    df_copy = df_copy.drop(columns=columns_to_drop, errors='ignore')
    
    # Final report
    total_rows = len(df_copy)
    filled_rows = df_copy['Username'].notna().sum()
    empty_rows = df_copy['Username'].isna().sum()
    
    print("="*60)
    print("USERNAME CONSOLIDATION COMPLETE!")
    print("="*60)
    print(f"Total rows: {total_rows}")
    print(f"Filled usernames: {filled_rows} ({filled_rows/total_rows*100:.2f}%)")
    print(f"Empty usernames: {empty_rows} ({empty_rows/total_rows*100:.2f}%)")
    
    if empty_rows > 0:
        print(f"\n  Warning: {empty_rows} rows still have empty usernames.")
        print("These rows had no username data in ANY of the source columns.")
    else:
        print("\n SUCCESS! No empty usernames - all rows have been filled!")
    
    # Show sample to verify single username per cell
    print("\n" + "="*60)
    print("SAMPLE OF CONSOLIDATED USERNAMES:")
    print("="*60)
    print(df_copy['Username'].head(15))
    
    return df_copy


In [25]:
df_cl = consolidate_username(df_cl)


Found these username columns: ['Username', 'Author Username', 'handle', 'tweetAuthor', 'Author Name', 'Name']

Empty 'Username' cells BEFORE consolidation: 30533
Filled 29437 empty cells using fallback columns
Empty 'Username' cells AFTER consolidation: 1096

USERNAME CONSOLIDATION COMPLETE!
Total rows: 48095
Filled usernames: 46999 (97.72%)
Empty usernames: 1096 (2.28%)

These rows had no username data in ANY of the source columns.

SAMPLE OF CONSOLIDATED USERNAMES:
0     VaccineNet_NG
1     VaccineNet_NG
2     VaccineNet_NG
3     VaccineNet_NG
4     VaccineNet_NG
5     VaccineNet_NG
6     VaccineNet_NG
7     VaccineNet_NG
8     VaccineNet_NG
9     VaccineNet_NG
10    VaccineNet_NG
11    VaccineNet_NG
12    VaccineNet_NG
13    VaccineNet_NG
14    VaccineNet_NG
Name: Username, dtype: object


In [32]:
df_cl['Username'].isnull().sum()

np.int64(1096)

In [33]:
df_cl['Username'] = df_cl['Username'].fillna('Unknown_User')

In [34]:
df_cl['Username'].isnull().sum()

np.int64(0)

In [35]:
df_cl.head(2)

Unnamed: 0,Tweet ID,Tweet Text,Type,Creation Time,Reply Count,Retweet Count,Like Count,View Count,Language,Source,type,geo,Username,Location,Number of Followers,Biography
0,"=""1807821100576895401""",The HPV vaccine is a critical step in preventi...,Tweet,"7/1/2024, 5:58:32 PM",1.0,0.0,2.0,81.0,en,Twitter Web App,,,VaccineNet_NG,,,
1,"=""1807776904805580853""","New month, new experiences, new opportunities,...",Tweet,"7/1/2024, 3:02:55 PM",0.0,0.0,2.0,50.0,en,Twitter Web App,,,VaccineNet_NG,,,


In [36]:
df_cl['Tweet Text'].isnull().sum()

np.int64(3229)

In [37]:
df_cl['Biography'].isnull().sum()

np.int64(33926)

In [38]:
#filling the tweet text

In [39]:

# Count before
before = df_cl['Tweet Text'].isnull().sum()
print(f"Empty 'Tweet Text' BEFORE: {before}")

# Fill empty Tweet Text with Biography from same row
df_cl['Tweet Text'] = df_cl['Tweet Text'].fillna(df_cl['Biography'])

Empty 'Tweet Text' BEFORE: 3229


In [40]:

# Count after
after = df_cl['Tweet Text'].isnull().sum()
filled = before - after

print(f"Filled: {filled}")
print(f"Empty 'Tweet Text' AFTER: {after}")
print(f"‚úÖ Done!")

Filled: 153
Empty 'Tweet Text' AFTER: 3076
‚úÖ Done!


In [None]:
Empty 'Tweet Text' BEFORE: 3229

Filled: 153
Empty 'Tweet Text' AFTER: 3076
‚úÖ Done!

In [41]:
df_cl.shape

(48095, 16)

In [42]:
df_cl.head(2)

Unnamed: 0,Tweet ID,Tweet Text,Type,Creation Time,Reply Count,Retweet Count,Like Count,View Count,Language,Source,type,geo,Username,Location,Number of Followers,Biography
0,"=""1807821100576895401""",The HPV vaccine is a critical step in preventi...,Tweet,"7/1/2024, 5:58:32 PM",1.0,0.0,2.0,81.0,en,Twitter Web App,,,VaccineNet_NG,,,
1,"=""1807776904805580853""","New month, new experiences, new opportunities,...",Tweet,"7/1/2024, 3:02:55 PM",0.0,0.0,2.0,50.0,en,Twitter Web App,,,VaccineNet_NG,,,


In [44]:
df_cl['Creation Time'].isnull().sum()

np.int64(30215)

In [55]:
#df_cl['Creation Time'].value_counts()

In [46]:
#splitting the data and time to have differnet columns

In [52]:
def split_creation_time(df_cl):
    # If not datetime, convert it
    if not np.issubdtype(df_cl['Creation Time'].dtype, np.datetime64):
        df_cl['Creation Time'] = pd.to_datetime(df_cl['Creation Time'], errors='coerce')

    # Remove timezone if present
    df_cl['Creation Time'] = df_cl['Creation Time'].dt.tz_localize(None)
    
    # Extract components
    df_cl['date'] = df_cl['Creation Time'].dt.date
    df_cl['time'] = df_cl['Creation Time'].dt.time
    df_cl['year'] = df_cl['Creation Time'].dt.year
    df_cl['month'] = df_cl['Creation Time'].dt.month
    df_cl['day'] = df_cl['Creation Time'].dt.day
    df_cl['hour'] = df_cl['Creation Time'].dt.hour
    df_cl['minute'] = df_cl['Creation Time'].dt.minute
    df_cl['weekday'] = df_cl['Creation Time'].dt.day_name()

    return df_cl


In [53]:
df_cl = split_creation_time(df_cl)

In [58]:
df_cl.head(2)


Unnamed: 0,Tweet ID,Tweet Text,Type,Creation Time,Reply Count,Retweet Count,Like Count,View Count,Language,Source,type,geo,Username,Location,Number of Followers,Biography,date,time,year,month,day,hour,minute,weekday
0,"=""1807821100576895401""",The HPV vaccine is a critical step in preventi...,Tweet,2024-07-01 17:58:32,1.0,0.0,2.0,81.0,en,Twitter Web App,,,VaccineNet_NG,,,,2024-07-01,17:58:32,2024.0,7.0,1.0,17.0,58.0,Monday
1,"=""1807776904805580853""","New month, new experiences, new opportunities,...",Tweet,2024-07-01 15:02:55,0.0,0.0,2.0,50.0,en,Twitter Web App,,,VaccineNet_NG,,,,2024-07-01,15:02:55,2024.0,7.0,1.0,15.0,2.0,Monday


In [67]:
#

In [63]:
df_cl['month'].isnull().sum()

np.int64(30431)

In [64]:
df_cl['month'].value_counts()

month
11.0    4468
4.0     2115
10.0    1640
6.0     1525
12.0    1412
8.0     1222
3.0     1217
9.0     1211
2.0      903
7.0      884
5.0      576
1.0      491
Name: count, dtype: int64

In [61]:
df_cl['year'].isnull().sum()

np.int64(30431)

In [62]:
df_cl['year'].value_counts()

year
2024.0    7193
2025.0    6516
2021.0    2079
2023.0    1472
2022.0     209
2019.0      90
2015.0      14
2013.0      13
2016.0      12
2020.0      12
2011.0      10
2012.0       9
2014.0       9
2017.0       7
2018.0       7
2010.0       6
2009.0       5
2008.0       1
Name: count, dtype: int64

In [57]:
df_cl['Language'].isnull().sum()

np.int64(30319)

In [68]:
#filling teh dates ad months intelligentlyt 

In [69]:
def fill_datetime(df):
    # Step 1: Ensure Creation Time is datetime
    df['Creation Time'] = pd.to_datetime(df['Creation Time'], errors='coerce')

    # Step 2: Separate missing and non-missing rows
    missing_mask = df['Creation Time'].isna()
    df_valid = df.loc[~missing_mask].copy()
    df_missing = df.loc[missing_mask].copy()

    # If all timestamps are missing, exit
    if df_valid.empty:
        raise ValueError("No valid timestamps to learn from")

    # Step 3: Sample replacement timestamps from the valid ones
    sampled_times = np.random.choice(df_valid['Creation Time'], size=df_missing.shape[0], replace=True)
    df_missing['Creation Time'] = sampled_times

    # Step 4: Combine back
    df_filled = pd.concat([df_valid, df_missing]).sort_index()

    # Step 5: Extract datetime features
    df_filled['date'] = df_filled['Creation Time'].dt.date
    df_filled['time'] = df_filled['Creation Time'].dt.time
    df_filled['year'] = df_filled['Creation Time'].dt.year
    df_filled['month'] = df_filled['Creation Time'].dt.month
    df_filled['day'] = df_filled['Creation Time'].dt.day
    df_filled['hour'] = df_filled['Creation Time'].dt.hour
    df_filled['minute'] = df_filled['Creation Time'].dt.minute
    df_filled['weekday'] = df_filled['Creation Time'].dt.day_name()

    return df_filled


In [70]:
df_cl = fill_datetime(df_cl)

In [72]:
df_cl.head(2)

Unnamed: 0,Tweet ID,Tweet Text,Type,Creation Time,Reply Count,Retweet Count,Like Count,View Count,Language,Source,type,geo,Username,Location,Number of Followers,Biography,date,time,year,month,day,hour,minute,weekday
0,"=""1807821100576895401""",The HPV vaccine is a critical step in preventi...,Tweet,2024-07-01 17:58:32,1.0,0.0,2.0,81.0,en,Twitter Web App,,,VaccineNet_NG,,,,2024-07-01,17:58:32,2024,7,1,17,58,Monday
1,"=""1807776904805580853""","New month, new experiences, new opportunities,...",Tweet,2024-07-01 15:02:55,0.0,0.0,2.0,50.0,en,Twitter Web App,,,VaccineNet_NG,,,,2024-07-01,15:02:55,2024,7,1,15,2,Monday


# filling the numeric counts

In [101]:
def fill_numeric_counts(df):
    num_cols = ['Reply Count', 'Retweet Count', 'Like Count', 'View Count', 'Number of Followers']
    
    for col in num_cols:
        if col in df.columns:
            median_val = df[col].median()
            df[col] = df[col].fillna(median_val)
    
    return df

In [102]:
df_cl = fill_numeric_counts(df_cl)

In [109]:
df_cl['Source'].isnull().sum()

np.int64(2281)

In [106]:
df_cl['Source'].value_counts()

Source
Facebook               26754
Twitter                16862
TikTok                  1252
Instagram                318
Twitter Web App          311
Twitter for Android      311
Blogs                      6
Name: count, dtype: int64

# standardize Source column

In [110]:
def standardize_and_fill_source(df, source_col='Source', random_state=42):
    np.random.seed(random_state)
    
    # Step 1: map known variants to canonical labels
    mapping = {
        'Twitter Web App': 'Twitter',
        'Twitter for Android': 'Twitter'
    }
    
    df[source_col] = df[source_col].replace(mapping)
    
    # Step 2: ensure only main categories remain
    main_categories = ['Facebook', 'Twitter', 'TikTok', 'Instagram', 'Blogs']
    
    # Any entry not in main_categories becomes NaN
    df[source_col] = df[source_col].where(df[source_col].isin(main_categories), np.nan)
    
    # Step 3: fill missing values by sampling from the observed distribution
    missing_mask = df[source_col].isna()
    if missing_mask.any():
        observed = df.loc[~missing_mask, source_col]
        if not observed.empty:
            probs = observed.value_counts(normalize=True)
            choices = probs.index.tolist()
            probs_list = probs.values.tolist()
            sampled = np.random.choice(choices, size=missing_mask.sum(), p=probs_list, replace=True)
        else:
            # fallback if no observed data
            sampled = np.random.choice(main_categories, size=missing_mask.sum(), replace=True)
        df.loc[missing_mask, source_col] = sampled
    
    # Optional: convert to categorical with ordered categories
    df[source_col] = pd.Categorical(df[source_col], categories=main_categories)
    
    return df

In [111]:
df_cl = standardize_and_fill_source(df_cl)

In [112]:
print(df_cl['Source'].value_counts())


Source
Facebook     28061
Twitter      18375
TikTok        1320
Instagram      333
Blogs            6
Name: count, dtype: int64


In [113]:
print('Missing after fill:', df_cl['Source'].isnull().sum())


Missing after fill: 0


In [114]:
df_cl.head(2)

Unnamed: 0,Tweet ID,Tweet Text,Type,Creation Time,Reply Count,Retweet Count,Like Count,View Count,Language,Source,type,geo,Username,Location,Number of Followers,Biography,date,time,year,month,day,hour,minute,weekday
0,"=""1807821100576895401""",The HPV vaccine is a critical step in preventi...,Tweet,2024-07-01 17:58:32,1.0,0.0,2.0,81.0,en,Twitter,,,VaccineNet_NG,,707.0,,2024-07-01,17:58:32,2024,7,1,17,58,Monday
1,"=""1807776904805580853""","New month, new experiences, new opportunities,...",Tweet,2024-07-01 15:02:55,0.0,0.0,2.0,50.0,en,Twitter,,,VaccineNet_NG,,707.0,,2024-07-01,15:02:55,2024,7,1,15,2,Monday


In [128]:
df_cl.shape

(48095, 24)

In [129]:
df_cl['Location'].isnull().sum()

np.int64(37261)

In [130]:
df_cl['geo'].isnull().sum()

np.int64(46397)

In [None]:
#dealing with teh location and geo columns

In [131]:
def fill_location_from_geo(df, location_col='Location', geo_col='geo'):
    """
    Fill missing values in 'Location' column by tracing from 'geo' column.
    Only fills where there is corresponding geo data; leaves remaining missing as NaN.
    """
    missing_mask = df[location_col].isnull()
    
    # Fill from geo where available
    df.loc[missing_mask & df[geo_col].notnull(), location_col] = df.loc[missing_mask & df[geo_col].notnull(), geo_col]
    
    # Report how many are still missing
    remaining_missing = df[location_col].isnull().sum()
    print(f"Remaining missing in '{location_col}': {remaining_missing}")
    
    return df

In [132]:
df_cl = fill_location_from_geo(df_cl)

Remaining missing in 'Location': 35563


In [133]:
df_cl['Location'].isnull().sum()  # Check remaining missing

np.int64(35563)

In [138]:
def fill_location(df, location_col='Location', country_focus='Nigeria', random_state=42):
    """
    Fill missing Location values intelligently for sentiment analysis.
    
    - Prioritizes Nigerian locations.
    - Samples remaining missing values based on observed distribution.
    - Any remaining missing values are set to 'Unknown'.
    """
    np.random.seed(random_state)
    
    # Identify missing locations
    missing_mask = df[location_col].isnull()
    remaining_missing = missing_mask.sum()
    
    if remaining_missing == 0:
        print("No missing locations to fill.")
        return df
    
    # Get all available locations
    available_locs = df[location_col].dropna()
    
    if not available_locs.empty:
        # Split Nigerian and other locations
        nigeria_mask = available_locs.str.contains(country_focus, case=False, na=False)
        nigeria_locs = available_locs[nigeria_mask]
        other_locs = available_locs[~nigeria_mask]
        
        # If Nigerian locations exist, sample mostly from Nigeria
        if not nigeria_locs.empty:
            sample_pool = pd.concat([nigeria_locs]*7 + [other_locs]*3)
        else:
            sample_pool = available_locs
        
        # Randomly assign to missing values
        sampled_values = np.random.choice(sample_pool, size=remaining_missing, replace=True)
        df.loc[missing_mask, location_col] = sampled_values
    
    # Replace any remaining missing locations with 'Unknown'
    still_missing = df[location_col].isnull().sum()
    if still_missing > 0:
        df[location_col] = df[location_col].fillna('Unknown')
        print(f"Filled {still_missing} remaining missing locations with 'Unknown'.")
    
    print(f"All missing values in '{location_col}' have been filled.")
    return df



In [139]:
# Apply the function
df_cl = fill_location(df_cl)



All missing values in 'Location' have been filled.


In [145]:
# Check result
df_cl['Location'].value_counts()

Location
Nigeria                                                                  5116
Lagos, Nigeria                                                           3943
Abuja, Nigeria                                                           2264
United States                                                             801
Canada                                                                    449
USA                                                                       429
Florida, USA                                                              412
Lagos                                                                     366
Earth                                                                     363
Enugu, Nigeria                                                            293
Lagos Nigeria                                                             248
India                                                                     239
Federal Capital Territory, Nig                         

In [150]:
df_cl.head(15)

Unnamed: 0,Tweet ID,Tweet Text,Type,Creation Time,Reply Count,Retweet Count,Like Count,View Count,Language,Source,type,geo,Username,Location,Number of Followers,Biography,date,time,year,month,day,hour,minute,weekday
0,"=""1807821100576895401""",The HPV vaccine is a critical step in preventi...,Tweet,2024-07-01 17:58:32,1.0,0.0,2.0,81.0,en,Twitter,,,VaccineNet_NG,"Lagos, Nigeria",707.0,,2024-07-01,17:58:32,2024,7,1,17,58,Monday
1,"=""1807776904805580853""","New month, new experiences, new opportunities,...",Tweet,2024-07-01 15:02:55,0.0,0.0,2.0,50.0,en,Twitter,,,VaccineNet_NG,Abuja. Nigeria.,707.0,,2024-07-01,15:02:55,2024,7,1,15,2,Monday
2,"=""1806318372742635560""",Blessing is protected against cervical cancer....,Tweet,2024-06-27 14:27:14,0.0,1.0,6.0,137.0,en,Twitter,,,VaccineNet_NG,Lagos,707.0,,2024-06-27,14:27:14,2024,6,27,14,27,Thursday
3,"=""1806318348721954865""",https://t.co/wirqlPgG1q,Tweet,2024-06-27 14:27:08,0.0,0.0,4.0,46.0,zxx,Twitter,,,VaccineNet_NG,Well informed,707.0,,2024-06-27,14:27:08,2024,6,27,14,27,Thursday
4,"=""1805543612580606082""",@science_nigeria @nighealthwatch @Fmohnigeria ...,Reply,2024-06-25 11:08:37,0.0,0.0,0.0,5.0,en,Twitter,,,VaccineNet_NG,Nigeria,707.0,,2024-06-25,11:08:37,2024,6,25,11,8,Tuesday
5,"=""1805540309104771446""","Our CEO, Chika Offor, is joining other health ...",Tweet,2024-06-25 10:55:29,0.0,0.0,4.0,60.0,en,Twitter,,,VaccineNet_NG,Nigeria,707.0,,2024-06-25,10:55:29,2024,6,25,10,55,Tuesday
6,"=""1804492834000531887""",@VaccineNet_NG Engages Gombe Stakeholders On ...,Tweet,2024-06-22 13:33:12,0.0,0.0,2.0,40.0,en,Twitter,,,VaccineNet_NG,"Lagos, Nigeria.",707.0,,2024-06-22,13:33:12,2024,6,22,13,33,Saturday
7,"=""1804491510454301127""",7m Nigerian girls vaccinated against cervical ...,Tweet,2024-06-22 13:27:56,0.0,0.0,3.0,46.0,en,Twitter,,,VaccineNet_NG,Global,707.0,,2024-06-22,13:27:56,2024,6,22,13,27,Saturday
8,"=""1803394080082190414""",Today is the International Day for the Elimina...,Tweet,2024-06-19 12:47:08,0.0,0.0,4.0,64.0,en,Twitter,,,VaccineNet_NG,Inside your house,707.0,,2024-06-19,12:47:08,2024,6,19,12,47,Wednesday
9,"=""1802357799202029705""",Happy Father's Day to all the amazing dads out...,Tweet,2024-06-16 16:09:20,0.0,0.0,2.0,49.0,en,Twitter,,,VaccineNet_NG,odisha,707.0,,2024-06-16,16:09:20,2024,6,16,16,9,Sunday


In [153]:
df_cl['Language'].isnull().sum()

np.int64(30319)

# Handling the Language column

In [148]:
df_cl['Language'].unique()

array(['en', 'zxx', 'qme', nan, 'da', 'in', 'und', 'qam', 'fi', 'hu',
       'tl', 'tr', 'es', 'ro', 'et', 'fr', 'hi', 'vi', 'de', 'ru', 'pt',
       'ta', 'no', 'ht', 'ca', 'ja', 'sv', 'cy', 'it', 'nl', 'pl', 'lv',
       'is', 'un'], dtype=object)

In [159]:
from langdetect import detect, DetectorFactory
import langid
import re

In [160]:
DetectorFactory.seed = 0 

def detect_language(text):
    try:
        text = str(text).strip()

        # empty text
        if text == "" or text.lower() == "nan":
            return "unk"

        # custom checks for Nigerian languages
        yoruba_words = ['·π£', 'gb', '·π£√©', '·∫π', '≈Ñ', '·ªç', '√†', '√¨']
        igbo_words = ['·ªã', '·ª•', '·ªç', '≈Ñ', 'ga-', 'na-', 'mm', 'kwu']
        hausa_words = ['na', 'kai', 'wannan', 'ina', 'sannu', 'yaya']

        # Yoruba detection
        if any(w in text.lower() for w in yoruba_words):
            return "yo"

        # Igbo detection
        if any(w in text.lower() for w in igbo_words):
            return "ig"

        # Hausa detection
        if any(w in text.lower().split() for w in hausa_words):
            return "ha"

        # Try langdetect
        try:
            ld = detect(text)
        except:
            ld = None

        # Try langid
        li, _ = langid.classify(text)

        # Decision logic
        if ld == li:
            return ld
        
        # If langdetect returns English confidently
        if ld == "en":
            return "en"

        # If langid returns English for short text
        if li == "en":
            return "en"

        # fallback
        return li if li is not None else "unk"

    except:
        return "unk"

In [161]:
# treat invalid codes as missing
invalid_codes = ['und', 'un', 'zxx', 'qme', 'qam']

df_cl['Language'] = df_cl['Language'].replace(invalid_codes, None)

# detect language only for missing ones
df_cl['Language'] = df_cl.apply(
    lambda row: detect_language(row['Tweet Text']) if pd.isna(row['Language']) else row['Language'],
    axis=1
)

In [162]:
lang_map = {
    'en':'English','fr':'French','es':'Spanish','de':'German','pt':'Portuguese',
    'it':'Italian','ru':'Russian','ja':'Japanese','hi':'Hindi','ta':'Tamil','vi':'Vietnamese','tr':'Turkish','pl':'Polish','nl':'Dutch',
    'sv':'Swedish','no':'Norwegian','da':'Danish','fi':'Finnish','ro':'Romanian','hu':'Hungarian','ca':'Catalan','cy':'Welsh',
    'et':'Estonian','lv':'Latvian','is':'Icelandic','ht':'Haitian Creole','tl':'Tagalog','in':'Indonesian',

    # Nigerian languages
    'yo':'Yoruba',
    'ig':'Igbo',
    'ha':'Hausa',
    'pcm':'Nigerian Pidgin',

    # unknown
    'unk':'Unknown'
}

df_cl['Language_Full'] = df_cl['Language'].map(lang_map).fillna("Unknown")


In [163]:
df_cl['Language'].isnull().sum(), df_cl['Language_Full'].isnull().sum()


(np.int64(0), np.int64(0))

In [170]:
df_cl['Language'].unique()

array(['English', 'German', 'Igbo', 'Unknown', 'Spanish', 'Yoruba',
       'Italian', 'French', 'Danish', 'Indonesian', 'Amharic', 'Finnish',
       'Hungarian', 'Tagalog', 'Turkish', 'Romanian', 'Georgian',
       'Croatian', 'Maltese', 'Xhosa', 'Estonian', 'Basque', 'Chinese',
       'Norwegian', 'Kinyarwanda', 'Khmer', 'Hindi', 'Vietnamese',
       'Portuguese', 'Japanese', 'Russian', 'Norwegian Nynorsk', 'Irish',
       'Tamil', 'Latvian', 'Marathi', 'Dutch', 'Haitian Creole',
       'Catalan', 'Hebrew', 'Swedish', 'Welsh', 'Slovenian', 'Afrikaans',
       'Quechua', 'Polish', 'Kannada', 'Albanian', 'Icelandic', 'Sinhala',
       'Javanese', 'Malagasy', 'Malay', 'Esperanto', 'Slovak', 'Hausa',
       'Zulu', 'Nepali', 'Persian', 'Gujarati', 'Ukrainian', 'Swahili',
       'Breton', 'Korean', 'Dzongkha', 'Punjabi', 'Volapuk', 'Lithuanian',
       'Azerbaijani', 'Greek', 'Northern Sami', 'Arabic', 'Latin',
       'Czech', 'Luxembourgish', 'Faroese', 'Norwegian Bokmal',
       'Belarus

In [168]:
language_mapping_full = {
    'en': 'English',
    'de': 'German',
    'ig': 'Igbo',
    'unk': 'Unknown',
    'es': 'Spanish',
    'yo': 'Yoruba',
    'it': 'Italian',
    'fr': 'French',
    'da': 'Danish',
    'in': 'Indonesian',
    'am': 'Amharic',
    'fi': 'Finnish',
    'hu': 'Hungarian',
    'tl': 'Tagalog',
    'tr': 'Turkish',
    'ro': 'Romanian',
    'ka': 'Georgian',
    'hr': 'Croatian',
    'mt': 'Maltese',
    'xh': 'Xhosa',
    'et': 'Estonian',
    'eu': 'Basque',
    'zh': 'Chinese',
    'no': 'Norwegian',
    'rw': 'Kinyarwanda',
    'km': 'Khmer',
    'hi': 'Hindi',
    'vi': 'Vietnamese',
    'pt': 'Portuguese',
    'ja': 'Japanese',
    'ru': 'Russian',
    'nn': 'Norwegian Nynorsk',
    'ga': 'Irish',
    'ta': 'Tamil',
    'lv': 'Latvian',
    'mr': 'Marathi',
    'nl': 'Dutch',
    'ht': 'Haitian Creole',
    'ca': 'Catalan',
    'he': 'Hebrew',
    'sv': 'Swedish',
    'cy': 'Welsh',
    'sl': 'Slovenian',
    'af': 'Afrikaans',
    'qu': 'Quechua',
    'pl': 'Polish',
    'kn': 'Kannada',
    'sq': 'Albanian',
    'is': 'Icelandic',
    'si': 'Sinhala',
    'jv': 'Javanese',
    'id': 'Indonesian',
    'mg': 'Malagasy',
    'ms': 'Malay',
    'eo': 'Esperanto',
    'sk': 'Slovak',
    'ha': 'Hausa',
    'zu': 'Zulu',
    'ne': 'Nepali',
    'fa': 'Persian',
    'gu': 'Gujarati',
    'uk': 'Ukrainian',
    'sw': 'Swahili',
    'br': 'Breton',
    'ko': 'Korean',
    'dz': 'Dzongkha',
    'pa': 'Punjabi',
    'vo': 'Volapuk',
    'lt': 'Lithuanian',
    'az': 'Azerbaijani',
    'el': 'Greek',
    'se': 'Northern Sami',
    'ar': 'Arabic',
    'la': 'Latin',
    'cs': 'Czech',
    'lb': 'Luxembourgish',
    'fo': 'Faroese',
    'nb': 'Norwegian Bokmal',
    'be': 'Belarusian',
    'or': 'Odia',
    'bs': 'Bosnian',
    'gl': 'Galician',
    'ur': 'Urdu',
    'th': 'Thai',
    'an': 'Aragonese',
    'ku': 'Kurdish',
    'hy': 'Armenian',
    'oc': 'Occitan',
    'ml': 'Malayalam',
    'bn': 'Bengali',
    'as': 'Assamese'
}


In [169]:
df_cl['Language'] = df_cl['Language'].apply(
    lambda x: language_mapping_full.get(x, x)
)

In [175]:
df_cl.isnull().sum()

Tweet ID               19981
Tweet Text              3076
Type                    2281
Creation Time              0
Reply Count                0
Retweet Count              0
Like Count                 0
View Count                 0
Language                   0
Source                     0
type                   45818
geo                    46397
Username                   0
Location                   0
Number of Followers        0
Biography              33926
date                       0
time                       0
year                       0
month                      0
day                        0
hour                       0
minute                     0
weekday                    0
Language_Full              0
dtype: int64

In [178]:
df_cl['Tweet ID'].value_counts()

Tweet ID
Y29tbWVudDoxMTkxOTg4OTczMDI5MjI5XzE1OTQzODU1MzQ4ODA2Njc=        4
Y29tbWVudDo5MjM4MzMwMzk5MTkzMTJfNTE5NTk1MDM3Mjc3MzA2            4
Y29tbWVudDoxMTkxOTg4OTczMDI5MjI5XzE1Mzc4MDkzMTA4OTQ0MjE=        4
Y29tbWVudDoxMTkxOTg4OTczMDI5MjI5XzUwNjc1MjUwOTAxMzg1OTc=        4
Y29tbWVudDo5MjM4MzMwMzk5MTkzMTJfNjY5ODQyODg1NzA2ODM4            4
Y29tbWVudDoxMTkxOTg4OTczMDI5MjI5XzE0ODA2OTMyMzk2ODIzOTc=        4
Y29tbWVudDo5MjM4MzMwMzk5MTkzMTJfNDYwMjc0OTc3MDE5NTQ2            4
Y29tbWVudDo5MjM4MzMwMzk5MTkzMTJfMzA4MzgwOTI2ODQ0MjEwMw==        4
Y29tbWVudDo5MjM4MzMwMzk5MTkzMTJfMTE1NzQyNjcxMjU4OTUwOA==        4
Y29tbWVudDo5MjM4MzMwMzk5MTkzMTJfMTY4MjU3ODcyOTI4MzgxNw==        4
Y29tbWVudDo5MjM4MzMwMzk5MTkzMTJfNDA0Mzg2NDk3NTgzNDcxNA==        4
Y29tbWVudDo5MjM4MzMwMzk5MTkzMTJfMTYyMTM3NTAyODQ5NDA0Ng==        4
Y29tbWVudDo5MjM4MzMwMzk5MTkzMTJfMTQwMjcyNjU0MTEzNzI1Mg==        4
Y29tbWVudDo5MjM4MzMwMzk5MTkzMTJfMTc3OTIxMzMwMjkyNjAwMA==        4
Y29tbWVudDo5MjM4MzMwMzk5MTkzMTJfMTU5NTU1MzU2NDY1NzEwOA==        4
Y

In [171]:
df_cl.head()

Unnamed: 0,Tweet ID,Tweet Text,Type,Creation Time,Reply Count,Retweet Count,Like Count,View Count,Language,Source,type,geo,Username,Location,Number of Followers,Biography,date,time,year,month,day,hour,minute,weekday,Language_Full
0,"=""1807821100576895401""",The HPV vaccine is a critical step in preventi...,Tweet,2024-07-01 17:58:32,1.0,0.0,2.0,81.0,English,Twitter,,,VaccineNet_NG,"Lagos, Nigeria",707.0,,2024-07-01,17:58:32,2024,7,1,17,58,Monday,English
1,"=""1807776904805580853""","New month, new experiences, new opportunities,...",Tweet,2024-07-01 15:02:55,0.0,0.0,2.0,50.0,English,Twitter,,,VaccineNet_NG,Abuja. Nigeria.,707.0,,2024-07-01,15:02:55,2024,7,1,15,2,Monday,English
2,"=""1806318372742635560""",Blessing is protected against cervical cancer....,Tweet,2024-06-27 14:27:14,0.0,1.0,6.0,137.0,English,Twitter,,,VaccineNet_NG,Lagos,707.0,,2024-06-27,14:27:14,2024,6,27,14,27,Thursday,English
3,"=""1806318348721954865""",https://t.co/wirqlPgG1q,Tweet,2024-06-27 14:27:08,0.0,0.0,4.0,46.0,German,Twitter,,,VaccineNet_NG,Well informed,707.0,,2024-06-27,14:27:08,2024,6,27,14,27,Thursday,German
4,"=""1805543612580606082""",@science_nigeria @nighealthwatch @Fmohnigeria ...,Reply,2024-06-25 11:08:37,0.0,0.0,0.0,5.0,English,Twitter,,,VaccineNet_NG,Nigeria,707.0,,2024-06-25,11:08:37,2024,6,25,11,8,Tuesday,English


# Handline the Tweet ID

In [181]:
import hashlib

In [182]:
def add_master_id_to_df(df):
    """
    Adds Master_ID and ID_Source columns directly to the given dataframe.
    - Master_ID: Unique identifier for each row (uses Tweet ID if present, else synthetic hash)
    - ID_Source: Describes the source of the ID (Platform_ID or Synthetic_ID)
    The new columns are placed at the front of the dataframe.
    """
    
    def create_master_id(row, index):
        """Generate unique Master ID"""
        if pd.notna(row.get('Tweet ID')):
            return str(row['Tweet ID'])
        # Synthetic ID using key row content + index
        content = (
            str(row.get('Tweet Text', '')) +
            str(row.get('Username', '')) +
            str(row.get('time', '')) +
            str(row.get('year', '')) +
            str(index)
        )
        hash_value = hashlib.sha256(content.encode()).hexdigest()[:16]
        return f"SYN_{hash_value}"

    # Create Master_ID
    df['Master_ID'] = [create_master_id(row, idx) for idx, row in df.iterrows()]

    # Create ID_Source
    df['ID_Source'] = df.apply(
        lambda row: 'Platform_ID' if pd.notna(row.get('Tweet ID')) else 'Synthetic_ID',
        axis=1
    )

    # Move new columns to the front
    cols = ['Master_ID', 'ID_Source'] + [c for c in df.columns if c not in ['Master_ID', 'ID_Source']]
    df = df[cols]

    # Update df in-place
    for col in df.columns:
        df_cl[col] = df[col]

    # Verify uniqueness
    total_records = len(df)
    unique_ids = df['Master_ID'].nunique()
    all_unique = unique_ids == total_records
    print(f"Total records: {total_records}")
    print(f"Unique Master IDs: {unique_ids}")
    print(f"All IDs unique: {all_unique}")

    # Show distribution of ID source
    print("\nID Source Distribution:")
    print(df['ID_Source'].value_counts())

# Apply to df_cl directly
add_master_id_to_df(df_cl)


Total records: 48095
Unique Master IDs: 32715
All IDs unique: False

ID Source Distribution:
ID_Source
Platform_ID     28114
Synthetic_ID    19981
Name: count, dtype: int64


In [184]:
df_cl['ID_Source'].value_counts()

ID_Source
Platform_ID     28114
Synthetic_ID    19981
Name: count, dtype: int64

In [183]:
df_cl.head()

Unnamed: 0,Tweet ID,Tweet Text,Type,Creation Time,Reply Count,Retweet Count,Like Count,View Count,Language,Source,type,geo,Username,Location,Number of Followers,Biography,date,time,year,month,day,hour,minute,weekday,Language_Full,Master_ID,ID_Source
0,"=""1807821100576895401""",The HPV vaccine is a critical step in preventi...,Tweet,2024-07-01 17:58:32,1.0,0.0,2.0,81.0,English,Twitter,,,VaccineNet_NG,"Lagos, Nigeria",707.0,,2024-07-01,17:58:32,2024,7,1,17,58,Monday,English,"=""1807821100576895401""",Platform_ID
1,"=""1807776904805580853""","New month, new experiences, new opportunities,...",Tweet,2024-07-01 15:02:55,0.0,0.0,2.0,50.0,English,Twitter,,,VaccineNet_NG,Abuja. Nigeria.,707.0,,2024-07-01,15:02:55,2024,7,1,15,2,Monday,English,"=""1807776904805580853""",Platform_ID
2,"=""1806318372742635560""",Blessing is protected against cervical cancer....,Tweet,2024-06-27 14:27:14,0.0,1.0,6.0,137.0,English,Twitter,,,VaccineNet_NG,Lagos,707.0,,2024-06-27,14:27:14,2024,6,27,14,27,Thursday,English,"=""1806318372742635560""",Platform_ID
3,"=""1806318348721954865""",https://t.co/wirqlPgG1q,Tweet,2024-06-27 14:27:08,0.0,0.0,4.0,46.0,German,Twitter,,,VaccineNet_NG,Well informed,707.0,,2024-06-27,14:27:08,2024,6,27,14,27,Thursday,German,"=""1806318348721954865""",Platform_ID
4,"=""1805543612580606082""",@science_nigeria @nighealthwatch @Fmohnigeria ...,Reply,2024-06-25 11:08:37,0.0,0.0,0.0,5.0,English,Twitter,,,VaccineNet_NG,Nigeria,707.0,,2024-06-25,11:08:37,2024,6,25,11,8,Tuesday,English,"=""1805543612580606082""",Platform_ID


In [187]:
import pandas as pd

def normalize_tweet_id(df, col_name='Tweet ID'):
    """
    Normalize the Tweet ID column to consistent string format.
    Handles:
    - Excel-style quoted IDs: ="12345" -> "12345"
    - Scientific notation floats: 2.77e+18 -> full integer string
    - Already string IDs remain unchanged
    """
    def normalize(value):
        if pd.isna(value):
            return None
        # Convert numeric scientific notation to int string
        if isinstance(value, float):
            return str(int(value))
        # Remove Excel-style ='...' quotes
        val_str = str(value)
        if val_str.startswith('="') and val_str.endswith('"'):
            return val_str[2:-1]
        return val_str
    
    df[col_name] = df[col_name].apply(normalize)
    return df

# Apply to your df_cl
df_cl = normalize_tweet_id(df_cl)

# Quick check
print(df_cl['Tweet ID'].head(10))


0    1807821100576895401
1    1807776904805580853
2    1806318372742635560
3    1806318348721954865
4    1805543612580606082
5    1805540309104771446
6    1804492834000531887
7    1804491510454301127
8    1803394080082190414
9    1802357799202029705
Name: Tweet ID, dtype: object


In [None]:
columns to delete 

	Type, Creation Time, type, geo, Number of Followers, Biography, date, Tweet ID

In [195]:
df_cl.isnull().sum()

Tweet ID               19981
Tweet Text              3076
Type                    2281
Creation Time              0
Reply Count                0
Retweet Count              0
Like Count                 0
View Count                 0
Language                   0
Source                     0
type                   45818
geo                    46397
Username                   0
Location                   0
Number of Followers        0
Biography              33926
date                       0
time                       0
year                       0
month                      0
day                        0
hour                       0
minute                     0
weekday                    0
Language_Full              0
Master_ID                  0
ID_Source                  0
dtype: int64

In [197]:
# List of columns to drop
drop_cols = ['Type', 'Creation Time', 'type', 'geo', 'Number of Followers', 'Biography', 'date', 'Tweet ID']

# Drop columns in-place
df_cl.drop(columns=[col for col in drop_cols if col in df_cl.columns], inplace=True)


In [201]:
df_cl.isnull().sum()

Tweet Text       0
Reply Count      0
Retweet Count    0
Like Count       0
View Count       0
Language         0
Source           0
Username         0
Location         0
time             0
year             0
month            0
day              0
hour             0
minute           0
weekday          0
Language_Full    0
Master_ID        0
ID_Source        0
dtype: int64

In [200]:
#dropping the missing fo the tweet text
df_cl = df_cl.dropna(subset=['Tweet Text'])

In [202]:
#cecking the overall shape of the data
df_cl.shape

(45019, 19)

In [203]:
df_cl.head(2)

Unnamed: 0,Tweet Text,Reply Count,Retweet Count,Like Count,View Count,Language,Source,Username,Location,time,year,month,day,hour,minute,weekday,Language_Full,Master_ID,ID_Source
0,The HPV vaccine is a critical step in preventi...,1.0,0.0,2.0,81.0,English,Twitter,VaccineNet_NG,"Lagos, Nigeria",17:58:32,2024,7,1,17,58,Monday,English,"=""1807821100576895401""",Platform_ID
1,"New month, new experiences, new opportunities,...",0.0,0.0,2.0,50.0,English,Twitter,VaccineNet_NG,Abuja. Nigeria.,15:02:55,2024,7,1,15,2,Monday,English,"=""1807776904805580853""",Platform_ID


In [211]:
# Rename columns with underscores and 'Master_ID' to 'Post_ID'
df_cl.rename(columns={
    'Tweet Text': 'Tweet_Text',
    'Reply Count': 'Reply_Count',
    'Retweet Count': 'Retweet_Count',
    'Like Count': 'Like_Count',
    'View Count': 'View_Count',
    'Master_ID': 'Post_ID'
}, inplace=True)

# Drop unwanted columns
df_cl.drop(columns=['Language_Full', 'time'], inplace=True)


In [212]:
df_cl.head(2)

Unnamed: 0,Tweet_Text,Reply_Count,Retweet_Count,Like_Count,View_Count,Language,Source,Username,Location,year,month,day,hour,minute,weekday,Post_ID,ID_Source
0,The HPV vaccine is a critical step in preventi...,1.0,0.0,2.0,81.0,English,Twitter,VaccineNet_NG,"Lagos, Nigeria",2024,7,1,17,58,Monday,"=""1807821100576895401""",Platform_ID
1,"New month, new experiences, new opportunities,...",0.0,0.0,2.0,50.0,English,Twitter,VaccineNet_NG,Abuja. Nigeria.,2024,7,1,15,2,Monday,"=""1807776904805580853""",Platform_ID


In [217]:
df_cl.duplicated().sum()

np.int64(14)

In [218]:
df_cl = df_cl.drop_duplicates()

In [219]:
df_cl.duplicated().sum()

np.int64(0)

In [220]:
df_cl.shape

(45005, 17)

In [221]:
# Save the cleaned dataframe to a CSV file
df_cl.to_csv("clean_scraped_posts.csv", index=False)

print("Cleaned data saved as 'df_cl_clean.csv'")


Cleaned data saved as 'df_cl_clean.csv'


In [222]:
df_cl.columns

Index(['Tweet_Text', 'Reply_Count', 'Retweet_Count', 'Like_Count',
       'View_Count', 'Language', 'Source', 'Username', 'Location', 'year',
       'month', 'day', 'hour', 'minute', 'weekday', 'Post_ID', 'ID_Source'],
      dtype='object')