#### Data Quality Checks and Cleaning

Steps taken to clean the data:

1. Replace NaN values in the categorical columns of the News.csv dataframe with "unknown".
2. Convert dtypes of columns into their correct dtypes.
3. Remove punctuation, URLs, emojis, and unnecessary words from the headline and title.
4. Check for missing values in all columns, especially the platform columns.
5. Flag any negative value in Popularity.
6. Replace empty strings with "unknown".
7. Save the cleaned file to be used in the analysis and ML models.

In [21]:
import pandas as pd
import re
import string
import numpy as np

def clean_dataframe(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Steps, in order:

    1. NaNs in the ``Source`` and ``Headline`` columns (when present) are
       filled with the placeholder ``"unknown"``.
    2. ``PublishDate`` (when present) is parsed with ``pd.to_datetime``;
       unparseable values become ``NaT``.
    3. Every column that is still object-dtype has URLs, emoji and ASCII
       punctuation removed, runs of whitespace collapsed to one space and
       leading/trailing whitespace stripped. Values that end up empty are
       replaced with the same ``"unknown"`` placeholder.

    NaNs in object columns other than ``Source``/``Headline`` are not filled.

    Args:
        df: Pandas DataFrame to clean.

    Returns:
        A cleaned Pandas DataFrame.
    """
    # One placeholder for both "missing" and "empty after cleaning", so the two
    # cases do not show up as separate categories ('unknown' vs 'Unknown').
    placeholder = 'unknown'

    df_cleaned = df.copy()

    # Fill NaNs only in the columns that actually exist, so a frame lacking
    # one of them is cleaned instead of raising KeyError.
    cols_to_fill_na = [col for col in ('Source', 'Headline') if col in df_cleaned.columns]
    if cols_to_fill_na:
        df_cleaned[cols_to_fill_na] = df_cleaned[cols_to_fill_na].fillna(placeholder)

    # Convert PublishDate to datetime; errors='coerce' maps bad values to NaT.
    # Done before the text pass so the column is no longer object-dtype there.
    if 'PublishDate' in df_cleaned.columns:
        df_cleaned['PublishDate'] = pd.to_datetime(df_cleaned['PublishDate'], errors='coerce')

    # Emoji character class. The ranges are deliberately narrow: a single span
    # from U+24C2 to U+1F251 would also cover CJK, Hangul and most other
    # non-Latin scripts and silently delete that text.
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # regional-indicator flags
        "\U0001F170-\U0001F251"  # enclosed alphanumeric / ideographic supplement
        "\u2600-\u26FF"          # miscellaneous symbols
        "\u2702-\u27B0"          # dingbats
        "\u2B00-\u2BFF"          # miscellaneous symbols and arrows
        "\u24C2"                 # circled M
        "\uFE0F"                 # emoji variation selector
        "]+",
        flags=re.UNICODE,
    )

    # Loop-invariant: build the punctuation-deleting table once.
    punctuation_table = str.maketrans('', '', string.punctuation)

    # Clean text columns using vectorized string methods
    object_cols = df_cleaned.select_dtypes(include=['object']).columns
    for col in object_cols:
        cleaned = (
            df_cleaned[col]
            .str.replace(r'http[s]?://\S+', '', regex=True)  # remove URLs
            .str.replace(emoji_pattern, '', regex=True)       # remove emojis
            .str.translate(punctuation_table)                 # remove ASCII punctuation
            .str.replace(r'\s+', ' ', regex=True)             # collapse whitespace
            .str.strip()
        )
        # Whole-value match: only cells that are exactly "" get the placeholder.
        df_cleaned[col] = cleaned.replace("", placeholder)

    return df_cleaned


def data_quality_check(df):
    """Run schema and value sanity checks on a news DataFrame.

    Args:
        df: The object to check. Anything that is not a ``pd.DataFrame`` is
            rejected with a printed message.

    Returns:
        ``None`` when ``df`` is not a DataFrame; otherwise a dict with:

        * ``"dtype_errors"``: list of messages, one per expected column that
          is present but whose dtype differs from the expected one.
        * ``"missing_columns"``: list of expected column names absent from ``df``.
        * ``"topic_errors"``: empty list when every ``Topic`` value is one of
          the allowed topics; otherwise a single message with the count and
          the distinct offending values (NaN counts as invalid), or a
          "missing column" message when ``Topic`` is absent.
        * ``"missing_values"``: ``{column: null count}`` for columns with at
          least one null.
        * ``"empty_strings"``: ``{column: count of "" values}`` for every
          object-dtype column.
        * ``"negative_social_media"``: ``{column: count of values < 0}`` for
          each of ``Facebook``, ``GooglePlus``, ``LinkedIn`` that is present.
    """
    if not isinstance(df, pd.DataFrame):
        print("Input is not a Pandas DataFrame")
        return None

    expected_dtypes = {
        "IDLink": np.int64,
        "Title": object,
        "Headline": object,
        "Source": object,
        "Topic": object,
        "PublishDate": np.dtype('datetime64[ns]'),
        "SentimentTitle": np.float64,
        "SentimentHeadline": np.float64,
        "Facebook": np.float64,
        "GooglePlus": np.float64,
        "LinkedIn": np.float64,
    }
    topic_values = ["obama", "palestine", "microsoft", "economy"]
    social_cols = ["Facebook", "GooglePlus", "LinkedIn"]

    results = {}

    results["dtype_errors"] = [
        f"Column '{col}' has dtype {df[col].dtype}, expected {expected_dtype}."
        for col, expected_dtype in expected_dtypes.items()
        if col in df.columns and df[col].dtype != expected_dtype
    ]

    results["missing_columns"] = [col for col in expected_dtypes if col not in df.columns]

    if "Topic" in df.columns:
        invalid_topics = df[~df["Topic"].isin(topic_values)]
        if invalid_topics.empty:
            results["topic_errors"] = []
        else:
            results["topic_errors"] = [
                f"Found {len(invalid_topics)} rows with invalid topics: {invalid_topics['Topic'].unique()}"
            ]
    else:
        results["topic_errors"] = ["Column 'Topic' is missing."]

    # Compute the per-column null counts once and reuse them for the filter.
    null_counts = df.isnull().sum()
    results["missing_values"] = null_counts[null_counts > 0].to_dict()

    results["empty_strings"] = {
        col: int((df[col] == "").sum())
        for col in df.select_dtypes(include="object")
    }

    # Coerce to numeric before comparing: a social-media column that arrived
    # as object/string dtype is already reported in "dtype_errors", and must
    # not make this check raise TypeError. Non-numeric values become NaN and
    # are therefore not counted as negative.
    results["negative_social_media"] = {
        col: int((pd.to_numeric(df[col], errors="coerce") < 0).sum())
        for col in social_cols
        if col in df
    }

    return results

# Example usage: load the raw CSV, clean it, run the quality checks on the
# cleaned frame, report the findings, then persist the cleaned data.
news = pd.read_csv("Main_News.csv")  # Replace "Main_News.csv" with your file path

cleaned_news = news.pipe(clean_dataframe)
quality_results = cleaned_news.pipe(data_quality_check)

# Header and results dict on separate lines.
print("Data Quality Check Results:", quality_results, sep="\n")

# index=False keeps the row index out of the saved file.
cleaned_news.to_csv("Cleaned_News.csv", index=False)
print("Cleaned data saved to Cleaned_News.csv")

Data Quality Check Results:
{'dtype_errors': [], 'missing_columns': [], 'topic_errors': [], 'missing_values': {}, 'empty_strings': {'Title': 0, 'Headline': 0, 'Source': 0, 'Topic': 0}, 'negative_social_media': {'Facebook': 0, 'GooglePlus': 0, 'LinkedIn': 0}}
Cleaned data saved to Cleaned_News.csv
