#### Predicting Popularity Class and Value for an Article in Facebook, GooglePlus, LinkedIn.

## Libraries Used

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from lightgbm import LGBMRegressor




# Classifying articles into High, Low and Average Popularity

In [3]:
# Import the Cleaned data after running the ETL file:
data=pd.read_csv("Cleaned_News.csv")
data['PublishDate'] = pd.to_datetime(data['PublishDate'], errors='coerce')

Creating Columns out of PublishDate and Sentiment Columns:

In [4]:
# Creating new Columns 
def create_derived_features(df):
    """Creates derived features from existing columns.

    Args:
        df: Pandas DataFrame containing news data with 'SentimentTitle',
            'SentimentHeadline', and 'PublishDate' columns.

    Returns:
        A new Pandas DataFrame with the derived features, or the original
        DataFrame if the required columns are missing. Returns None if input is not a dataframe.
    """
    if not isinstance(df, pd.DataFrame):
        print("Input is not a Pandas DataFrame")
        return None
    df_derived = df.copy()

    required_cols = ['SentimentTitle', 'SentimentHeadline', 'PublishDate']
    if not all(col in df_derived.columns for col in required_cols):
        print(f"Missing required columns: {set(required_cols) - set(df_derived.columns)}")
        return df_derived  # Return original if columns are missing

    # 1. Sentiment Mean
    df_derived['Sentiment_mean'] = df_derived[['SentimentTitle', 'SentimentHeadline']].mean(axis=1)

    # 2. Publish Day
    df_derived['PublishDay'] = df_derived['PublishDate'].dt.day_name()

    # 3. Publish Hour
    df_derived['PublishHour'] = df_derived['PublishDate'].dt.hour

    # 4. Time of Day
    def categorize_time(hour):
        if 6 <= hour < 12:
            return 'Morning'
        elif 12 <= hour < 17:
            return 'Afternoon'
        elif 17 <= hour < 21:
            return 'Evening'
        else:
            return 'Night'

    df_derived['TimeOfDay'] = df_derived['PublishHour'].apply(categorize_time)

    return df_derived

data = create_derived_features(data)



In [5]:
data.columns

Index(['IDLink', 'Title', 'Headline', 'Source', 'Topic', 'PublishDate',
       'SentimentTitle', 'SentimentHeadline', 'Facebook', 'GooglePlus',
       'LinkedIn', 'Sentiment_mean', 'PublishDay', 'PublishHour', 'TimeOfDay'],
      dtype='object')

In [6]:
df=data.copy()

We will no create the Target columns that would be based upon the Quartile ranges of our Continous Target Variables

In [7]:
def classify_columns(data):
    for platform in ['Facebook', 'GooglePlus', 'LinkedIn']:
        # Determine the column's quartiles
        q25 = data[platform].quantile(0.25)
        q75 = data[platform].quantile(0.75)
        
        # Create a new column based on the quartile ranges
        data[f'{platform.lower()}_class'] = pd.cut(
            data[platform],
            bins=[-float('inf'), q25, q75, float('inf')],
            labels=[0, 1, 2],
            include_lowest=True
        ).astype(int)
    
    return data

df = classify_columns(data)
df.head()


Unnamed: 0,IDLink,Title,Headline,Source,Topic,PublishDate,SentimentTitle,SentimentHeadline,Facebook,GooglePlus,LinkedIn,Sentiment_mean,PublishDay,PublishHour,TimeOfDay,facebook_class,googleplus_class,linkedin_class
0,99248,Obama Lays Wreath at Arlington National Cemetery,Obama Lays Wreath at Arlington National Cemete...,USA TODAY,obama,2002-04-02 00:00:00,0.0,-0.0533,2547.659722,1538.570833,499.025,-0.02665,Tuesday,0,Night,2,1,0
1,10423,A Look at the Health of the Chinese Economy,Tim Haywood investment director businessunit h...,Bloomberg,economy,2008-09-20 00:00:00,0.208333,-0.156386,1380.145833,1957.444444,753.729167,0.025974,Saturday,0,Night,1,1,1
2,18828,Nouriel Roubini Global Economy Not Back to 2008,Nouriel Roubini NYU professor and chairman at ...,Bloomberg,economy,2012-01-28 00:00:00,-0.42521,0.139754,1647.295833,2242.472222,874.993056,-0.142728,Saturday,0,Night,1,2,1
3,27788,Finland GDP Expands In Q4,Finlands economy expanded marginally in the th...,RTT News,economy,2015-03-01 00:06:00,0.0,0.026064,1157.554167,1805.383333,701.736111,0.013032,Sunday,0,Night,0,1,1
4,27789,Tourism govt spending buoys Thai economy in Ja...,Tourism and public spending continued to boost...,The Nation Thailand39s English news,economy,2015-03-01 00:11:00,0.0,0.141084,1439.5125,2166.45,857.6875,0.070542,Sunday,0,Night,1,2,1


In [8]:
# Dropping the Continous target Variables for sound model:
df=df.drop(["Facebook","GooglePlus","LinkedIn"],axis=1)
df.columns

Index(['IDLink', 'Title', 'Headline', 'Source', 'Topic', 'PublishDate',
       'SentimentTitle', 'SentimentHeadline', 'Sentiment_mean', 'PublishDay',
       'PublishHour', 'TimeOfDay', 'facebook_class', 'googleplus_class',
       'linkedin_class'],
      dtype='object')

## Random Forest Model For Facebook_Class:

In [9]:
# Function to preprocess data
def preprocess_data(df):
    """
    Preprocesses the data for modeling.
    - Encodes categorical variables.
    - Scales numerical features.
    - Encodes the 'Source' column using frequency encoding.
    """
    # Work on a copy of the DataFrame to avoid altering the original
    df_copy = df.copy()

    # Frequency encoding for 'Source'
    source_freq = df_copy['Source'].value_counts(normalize=True)  # Normalize for proportion-based encoding
    df_copy['Source_freq'] = df_copy['Source'].map(source_freq)
    df_copy.drop(columns=['Source'], inplace=True)  # Drop original Source column

    # One-hot encode other categorical features
    categorical_cols = ['Topic', 'PublishDay', 'TimeOfDay']
    df_encoded = pd.get_dummies(df_copy, columns=categorical_cols, drop_first=True)

    # Scaling numerical columns
    numerical_cols = ['Sentiment_mean']
    scaler = StandardScaler()
    df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

    return df_encoded


# Function to split data
def split_data(df, target_col):
    """
    Splits the data into train and test sets.
    - df: DataFrame
    - target_col: Target column name
    """
    X = df.drop(columns=[target_col, 'IDLink', 'Title', 'Headline', "PublishDate", "googleplus_class", "linkedin_class","SentimentHeadline","SentimentTitle"])
    y = df[target_col]
    return train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Function to train a model
def train_model(X_train, y_train):
    """
    Trains a Random Forest model.
    - Returns the trained model.
    """
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    return model


# Function to evaluate a model and print feature importance
def evaluate_model(model, X_test, y_test):
    """
    Evaluates the model on the test set and prints feature importance.
    - Prints classification report, accuracy, and top features.
    """
    y_pred = model.predict(X_test)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))

    # Print feature importances (if the model has a feature_importances_ attribute)
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        feature_names = list(X_test.columns)
        sorted_indices = np.argsort(importances)[::-1]

        # Print top 10 most important features
        print("\nTop 10 Most Important Features:")
        for i, idx in enumerate(sorted_indices[:10]):
            print(f"{i+1}. {feature_names[idx]} ({importances[idx]:.4f})")


# Main function to process the pipeline
def main_pipeline(df, target_col):
    """
    End-to-end pipeline for preprocessing, training, and evaluation.
    - df: DataFrame
    - target_col: Target column for prediction
    """
    print(f"\nProcessing for {target_col}...")
    df_preprocessed = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_data(df_preprocessed, target_col)
    model = train_model(X_train, y_train)
    evaluate_model(model, X_test, y_test)
    return model


# Call pipeline for each platform (assuming your data has columns for facebook, googleplus, linkedin classification)
facebook_model = main_pipeline(df, 'facebook_class')


Processing for facebook_class...
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.78      0.81      4664
           1       0.83      0.86      0.85      9322
           2       0.88      0.88      0.88      4662

    accuracy                           0.84     18648
   macro avg       0.85      0.84      0.84     18648
weighted avg       0.84      0.84      0.84     18648

Accuracy Score: 0.8442192192192193

Top 10 Most Important Features:
1. Sentiment_mean (0.2911)
2. Topic_obama (0.2621)
3. Source_freq (0.1953)
4. PublishHour (0.1152)
5. Topic_palestine (0.0521)
6. Topic_microsoft (0.0263)
7. TimeOfDay_Night (0.0138)
8. PublishDay_Sunday (0.0099)
9. TimeOfDay_Morning (0.0090)
10. PublishDay_Saturday (0.0053)


The outputs provide an evaluation of the classifier's performance on the `facebook_class` target variable, with the following metrics:

---

### **1. Classification Report**

This report includes metrics for each class (0, 1, and 2) in the target variable:

#### **Precision**

-   Precision measures how many of the predicted positive instances for a class are actually correct.
-   Formula: Precision = True Positives / (True Positives + False Positives)
-   High precision indicates that the model has low false positive rates.

-   **Class 0**: \( 0.83 \) (83% of samples predicted as class 0 are actually class 0.)
-   **Class 1**: \( 0.83 \) (83% of samples predicted as class 1 are actually class 1.)
-   **Class 2**: \( 0.88 \) (88% of samples predicted as class 2 are actually class 2.)

#### **Recall**

-   Recall measures how many of the actual positive instances for a class the model correctly identified.
-   Formula: Recall = True Positives / (False Negatives + True Positives)
-   High recall indicates that the model has low false negative rates.

-   **Class 0**: \( 0.78 \) (78% of actual class 0 samples were correctly identified.)
-   **Class 1**: \( 0.86 \) (86% of actual class 1 samples were correctly identified.)
-   **Class 2**: \( 0.88 \) (88% of actual class 2 samples were correctly identified.)

#### **F1-Score**

-   F1-Score is the harmonic mean of precision and recall, providing a single score that balances both metrics.
-   Formula: F1-Score = 2 \* ((Precision \* Recall) / (Precision + Recall))
-   A high F1-Score indicates the model is performing well in both precision and recall.

-   **Class 0**: \( 0.81 \)
-   **Class 1**: \( 0.85 \)
-   **Class 2**: \( 0.88 \)

#### **Support**

-   Support refers to the number of samples in each class in the test dataset.
-   **Class 0**: \( 4664 \)
-   **Class 1**: \( 9322 \)
-   **Class 2**: \( 4662 \)

---

### **2. Overall Metrics**

#### **Accuracy**

-   Accuracy is the ratio of correctly predicted samples to the total number of samples.
-   **Result**: \( 0.844 \) (The model correctly predicted 84.4% of the samples.)

#### **Macro Average**

-   The unweighted mean of precision, recall, and F1-Score across all classes.
-   Treats all classes equally, regardless of their support.

-   **Macro Precision**: \( 0.85 \)
-   **Macro Recall**: \( 0.84 \)
-   **Macro F1-Score**: \( 0.84 \)

#### **Weighted Average**

-   The weighted mean of precision, recall, and F1-Score across all classes.
-   Weights each class’s metric by its support, giving more importance to classes with more samples.

-   **Weighted Precision**: \( 0.84 \)
-   **Weighted Recall**: \( 0.84 \)
-   **Weighted F1-Score**: \( 0.84 \)

---

### **Key Takeaways**

1.  **Good Performance**: The model has a good overall accuracy of 84.4% and reasonable F1-Scores for all classes, particularly strong for class 2.
2.  **Relatively Balanced Metrics**: The precision and recall scores are reasonably balanced across the classes, suggesting that the model is not significantly biased towards any particular class.
3.  **Class Distribution Impact**: The weighted average scores are very close to the macro averages, indicating that the class distribution is relatively balanced and does not significantly skew the overall performance metrics.

## Random Forest Model for GooglePlus_Class:

In [10]:
# Function to preprocess data
def preprocess_data(df):
    """
    Preprocesses the data for modeling.
    - Encodes categorical variables.
    - Scales numerical features.
    - Encodes the 'Source' column using frequency encoding.
    """
    # Work on a copy of the DataFrame to avoid altering the original
    df_copy = df.copy()

    # Frequency encoding for 'Source'
    source_freq = df_copy['Source'].value_counts(normalize=True)  # Normalize for proportion-based encoding
    df_copy['Source_freq'] = df_copy['Source'].map(source_freq)
    df_copy.drop(columns=['Source'], inplace=True)  # Drop original Source column

    # One-hot encode other categorical features
    categorical_cols = ['Topic', 'PublishDay', 'TimeOfDay']
    df_encoded = pd.get_dummies(df_copy, columns=categorical_cols, drop_first=True)

    # Scaling numerical columns
    numerical_cols = ['Sentiment_mean']
    scaler = StandardScaler()
    df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

    return df_encoded


# Function to split data
def split_data(df, target_col):
    """
    Splits the data into train and test sets.
    - df: DataFrame
    - target_col: Target column name
    """
    X = df.drop(columns=[target_col, 'IDLink', 'Title', 'Headline', "PublishDate", "facebook_class", "linkedin_class","SentimentHeadline","SentimentTitle"])
    y = df[target_col]
    return train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Function to train a model
def train_model(X_train, y_train):
    """
    Trains a Random Forest model.
    - Returns the trained model.
    """
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    return model


# Function to evaluate a model and print feature importance
def evaluate_model(model, X_test, y_test):
    """
    Evaluates the model on the test set and prints feature importance.
    - Prints classification report, accuracy, and top features.
    """
    y_pred = model.predict(X_test)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))

    # Print feature importances (if the model has a feature_importances_ attribute)
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        feature_names = list(X_test.columns)
        sorted_indices = np.argsort(importances)[::-1]

        # Print top 10 most important features
        print("\nTop 10 Most Important Features:")
        for i, idx in enumerate(sorted_indices[:10]):
            print(f"{i+1}. {feature_names[idx]} ({importances[idx]:.4f})")


# Main function to process the pipeline
def main_pipeline(df, target_col):
    """
    End-to-end pipeline for preprocessing, training, and evaluation.
    - df: DataFrame
    - target_col: Target column for prediction
    """
    print(f"\nProcessing for {target_col}...")
    df_preprocessed = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_data(df_preprocessed, target_col)
    model = train_model(X_train, y_train)
    evaluate_model(model, X_test, y_test)
    return model
googleplus_model = main_pipeline(df, 'googleplus_class')



Processing for googleplus_class...
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.76      0.78      4663
           1       0.80      0.82      0.81      9323
           2       0.83      0.82      0.83      4662

    accuracy                           0.81     18648
   macro avg       0.81      0.80      0.80     18648
weighted avg       0.81      0.81      0.81     18648

Accuracy Score: 0.8052874302874303

Top 10 Most Important Features:
1. PublishHour (0.2598)
2. Sentiment_mean (0.2457)
3. Source_freq (0.1786)
4. Topic_obama (0.1348)
5. Topic_microsoft (0.0659)
6. Topic_palestine (0.0587)
7. TimeOfDay_Night (0.0171)
8. TimeOfDay_Morning (0.0107)
9. TimeOfDay_Evening (0.0082)
10. PublishDay_Tuesday (0.0038)


The outputs provide an evaluation of the classifier's performance on the `googleplus_class` target variable, with the following metrics:

---
### GooglePlus Classification Model Evaluation

The following metrics summarize the performance of the GooglePlus classification model on the test dataset:

---

### **1. Classification Report**

This report includes metrics for each class (0, 1, and 2) in the target variable:

#### **Precision**

-   Precision measures how many of the predicted positive instances for a class are actually correct.
-   Formula: Precision = True Positives / (True Positives + False Positives)
-   High precision indicates that the model has low false positive rates.

-   **Class 0**: \( 0.80 \) (80% of samples predicted as class 0 are actually class 0.)
-   **Class 1**: \( 0.80 \) (80% of samples predicted as class 1 are actually class 1.)
-   **Class 2**: \( 0.83 \) (83% of samples predicted as class 2 are actually class 2.)

#### **Recall**

-   Recall measures how many of the actual positive instances for a class the model correctly identified.
-   Formula: Recall = True Positives / (False Negatives + True Positives)
-   High recall indicates that the model has low false negative rates.

-   **Class 0**: \( 0.76 \) (76% of actual class 0 samples were correctly identified.)
-   **Class 1**: \( 0.82 \) (82% of actual class 1 samples were correctly identified.)
-   **Class 2**: \( 0.82 \) (82% of actual class 2 samples were correctly identified.)

#### **F1-Score**

-   F1-Score is the harmonic mean of precision and recall, providing a single score that balances both metrics.
-   Formula: F1-Score = 2 \* ((Precision \* Recall) / (Precision + Recall))
-   A high F1-Score indicates the model is performing well in both precision and recall.

-   **Class 0**: \( 0.78 \)
-   **Class 1**: \( 0.81 \)
-   **Class 2**: \( 0.83 \)

#### **Support**

-   Support refers to the number of samples in each class in the test dataset.
-   **Class 0**: \( 4663 \)
-   **Class 1**: \( 9323 \)
-   **Class 2**: \( 4662 \)

---

### **2. Overall Metrics**

#### **Accuracy**

-   Accuracy is the ratio of correctly predicted samples to the total number of samples.
-   **Result**: \( 0.805 \) (The model correctly predicted 80.5% of the samples.)

#### **Macro Average**

-   The unweighted mean of precision, recall, and F1-Score across all classes.
-   Treats all classes equally, regardless of their support.

-   **Macro Precision**: \( 0.81 \)
-   **Macro Recall**: \( 0.80 \)
-   **Macro F1-Score**: \( 0.80 \)

#### **Weighted Average**

-   The weighted mean of precision, recall, and F1-Score across all classes.
-   Weights each class’s metric by its support, giving more importance to classes with more samples.

-   **Weighted Precision**: \( 0.81 \)
-   **Weighted Recall**: \( 0.81 \)
-   **Weighted F1-Score**: \( 0.81 \)

---

### **3. Feature Importance**

The following table shows the top 10 most important features identified by the Random Forest model for predicting `googleplus_class`:

| Rank | Feature | Importance |
|---|---|---|
| 1 | PublishHour | 0.2598 |
| 2 | Sentiment\_mean | 0.2457 |
| 3 | Source\_freq | 0.1786 |
| 4 | Topic\_obama | 0.1348 |
| 5 | Topic\_microsoft | 0.0659 |
| 6 | Topic\_palestine | 0.0587 |
| 7 | TimeOfDay\_Night | 0.0171 |
| 8 | TimeOfDay\_Morning | 0.0107 |
| 9 | TimeOfDay\_Evening | 0.0082 |
| 10 | PublishDay\_Tuesday | 0.0038 |

These values indicate the relative importance of each feature in the model's decision-making process. `PublishHour`, `Sentiment_mean`, and `Source_freq` are the most influential features.

---

### **Key Takeaways**

1.  **Reasonable Performance**: The model achieves an overall accuracy of approximately 80.5%.
2.  **Feature Importance**: `PublishHour`, `Sentiment_mean`, and `Source_freq` are the most important features for predicting GooglePlus class. This suggests that the time of day the article was published, the overall sentiment, and the frequency of the source are strong indicators of GooglePlus engagement.
3. The model performance is slightly lower than the facebook model.

## Random Forest Model for LinkedIn_Class:

In [11]:
# Function to preprocess data
def preprocess_data(df):
    """
    Preprocesses the data for modeling.
    - Encodes categorical variables.
    - Scales numerical features.
    - Encodes the 'Source' column using frequency encoding.
    """
    # Work on a copy of the DataFrame to avoid altering the original
    df_copy = df.copy()

    # Frequency encoding for 'Source'
    source_freq = df_copy['Source'].value_counts(normalize=True)  # Normalize for proportion-based encoding
    df_copy['Source_freq'] = df_copy['Source'].map(source_freq)
    df_copy.drop(columns=['Source'], inplace=True)  # Drop original Source column

    # One-hot encode other categorical features
    categorical_cols = ['Topic', 'PublishDay', 'TimeOfDay']
    df_encoded = pd.get_dummies(df_copy, columns=categorical_cols, drop_first=True)

    # Scaling numerical columns
    numerical_cols = ['Sentiment_mean']
    scaler = StandardScaler()
    df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

    return df_encoded


# Function to split data
def split_data(df, target_col):
    """
    Splits the data into train and test sets.
    - df: DataFrame
    - target_col: Target column name
    """
    X = df.drop(columns=[target_col, 'IDLink', 'Title', 'Headline', "PublishDate", "googleplus_class", "facebook_class","SentimentHeadline","SentimentTitle"])
    y = df[target_col]
    return train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Function to train a model
def train_model(X_train, y_train):
    """
    Trains a Random Forest model.
    - Returns the trained model.
    """
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    return model


# Function to evaluate a model and print feature importance
def evaluate_model(model, X_test, y_test):
    """
    Evaluates the model on the test set and prints feature importance.
    - Prints classification report, accuracy, and top features.
    """
    y_pred = model.predict(X_test)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))

    # Print feature importances (if the model has a feature_importances_ attribute)
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        feature_names = list(X_test.columns)
        sorted_indices = np.argsort(importances)[::-1]

        # Print top 10 most important features
        print("\nTop 10 Most Important Features:")
        for i, idx in enumerate(sorted_indices[:10]):
            print(f"{i+1}. {feature_names[idx]} ({importances[idx]:.4f})")


# Main function to process the pipeline
def main_pipeline(df, target_col):
    """
    End-to-end pipeline for preprocessing, training, and evaluation.
    - df: DataFrame
    - target_col: Target column for prediction
    """
    print(f"\nProcessing for {target_col}...")
    df_preprocessed = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_data(df_preprocessed, target_col)
    model = train_model(X_train, y_train)
    evaluate_model(model, X_test, y_test)
    return model
linkedin_model = main_pipeline(df, 'linkedin_class')



Processing for linkedin_class...
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.91      0.91      4662
           1       0.83      0.85      0.84      9326
           2       0.77      0.75      0.76      4660

    accuracy                           0.84     18648
   macro avg       0.84      0.84      0.84     18648
weighted avg       0.84      0.84      0.84     18648

Accuracy Score: 0.8384813384813384

Top 10 Most Important Features:
1. Topic_obama (0.3493)
2. Sentiment_mean (0.2104)
3. PublishHour (0.1581)
4. Source_freq (0.1376)
5. TimeOfDay_Morning (0.0747)
6. Topic_microsoft (0.0255)
7. Topic_palestine (0.0102)
8. TimeOfDay_Night (0.0097)
9. TimeOfDay_Evening (0.0058)
10. PublishDay_Thursday (0.0035)


The outputs provide an evaluation of the classifier's performance on the `linkedin_class` target variable, with the following metrics:

---

### LinkedIn Classification Model Evaluation

The following metrics summarize the performance of the LinkedIn classification model on the test dataset:

---

### **1. Classification Report**

This report includes metrics for each class (0, 1, and 2) in the target variable:

#### **Precision**

-   Precision measures how many of the predicted positive instances for a class are actually correct.
-   Formula: Precision = True Positives / (True Positives + False Positives)
-   High precision indicates that the model has low false positive rates.

-   **Class 0**: \( 0.92 \) (92% of samples predicted as class 0 are actually class 0.)
-   **Class 1**: \( 0.83 \) (83% of samples predicted as class 1 are actually class 1.)
-   **Class 2**: \( 0.77 \) (77% of samples predicted as class 2 are actually class 2.)

#### **Recall**

-   Recall measures how many of the actual positive instances for a class the model correctly identified.
-   Formula: Recall = True Positives / (False Negatives + True Positives)
-   High recall indicates that the model has low false negative rates.

-   **Class 0**: \( 0.91 \) (91% of actual class 0 samples were correctly identified.)
-   **Class 1**: \( 0.85 \) (85% of actual class 1 samples were correctly identified.)
-   **Class 2**: \( 0.75 \) (75% of actual class 2 samples were correctly identified.)

#### **F1-Score**

-   F1-Score is the harmonic mean of precision and recall, providing a single score that balances both metrics.
-   Formula: F1-Score = 2 \* ((Precision \* Recall) / (Precision + Recall))
-   A high F1-Score indicates the model is performing well in both precision and recall.

-   **Class 0**: \( 0.91 \)
-   **Class 1**: \( 0.84 \)
-   **Class 2**: \( 0.76 \)

#### **Support**

-   Support refers to the number of samples in each class in the test dataset.
-   **Class 0**: \( 4662 \)
-   **Class 1**: \( 9326 \)
-   **Class 2**: \( 4660 \)

---

### **2. Overall Metrics**

#### **Accuracy**

-   Accuracy is the ratio of correctly predicted samples to the total number of samples.
-   **Result**: \( 0.838 \) (The model correctly predicted 83.8% of the samples.)

#### **Macro Average**

-   The unweighted mean of precision, recall, and F1-Score across all classes.
-   Treats all classes equally, regardless of their support.

-   **Macro Precision**: \( 0.84 \)
-   **Macro Recall**: \( 0.84 \)
-   **Macro F1-Score**: \( 0.84 \)

#### **Weighted Average**

-   The weighted mean of precision, recall, and F1-Score across all classes.
-   Weights each class’s metric by its support, giving more importance to classes with more samples.

-   **Weighted Precision**: \( 0.84 \)
-   **Weighted Recall**: \( 0.84 \)
-   **Weighted F1-Score**: \( 0.84 \)

---

### **3. Feature Importance**

The following table shows the top 10 most important features identified by the Random Forest model for predicting `linkedin_class`:

| Rank | Feature | Importance |
|---|---|---|
| 1 | Topic\_obama | 0.3493 |
| 2 | Sentiment\_mean | 0.2104 |
| 3 | PublishHour | 0.1581 |
| 4 | Source\_freq | 0.1376 |
| 5 | TimeOfDay\_Morning | 0.0747 |
| 6 | Topic\_microsoft | 0.0255 |
| 7 | Topic\_palestine | 0.0102 |
| 8 | TimeOfDay\_Night | 0.0097 |
| 9 | TimeOfDay\_Evening | 0.0058 |
| 10 | PublishDay\_Thursday | 0.0035 |

These values indicate the relative importance of each feature in the model's decision-making process. `Topic_obama`, `Sentiment_mean`, and `PublishHour` are the most influential features for predicting LinkedIn engagement.

---

### **Key Takeaways**

1.  **Reasonable Performance**: The model achieves an overall accuracy of approximately 83.8%.
2.  **Feature Importance**: `Topic_obama` is by far the most important feature for predicting LinkedIn class, followed by `Sentiment_mean` and `PublishHour`. This suggests that the topic of the article, especially if it's about Obama, the overall sentiment, and the time of day are strong indicators of LinkedIn engagement.
3. The model performs best on class 0 (LOW Popularity Articles).

# Predicting Popularity Continous Range

## Transforming Columns:

In [12]:
# Cleaned Data after passing through ETL File:
data=pd.read_csv("Cleaned_News.csv")

# Converting PublishDate into datetime[ns]
data['PublishDate'] = pd.to_datetime(data['PublishDate'], errors='coerce')

# Creating new Columns same as above :
def create_derived_features(df):
    """Creates derived features from existing columns.

    Args:
        df: Pandas DataFrame containing news data with 'SentimentTitle',
            'SentimentHeadline', and 'PublishDate' columns.

    Returns:
        A new Pandas DataFrame with the derived features, or the original
        DataFrame if the required columns are missing. Returns None if input is not a dataframe.
    """
    if not isinstance(df, pd.DataFrame):
        print("Input is not a Pandas DataFrame")
        return None
    df_derived = df.copy()

    required_cols = ['SentimentTitle', 'SentimentHeadline', 'PublishDate']
    if not all(col in df_derived.columns for col in required_cols):
        print(f"Missing required columns: {set(required_cols) - set(df_derived.columns)}")
        return df_derived  # Return original if columns are missing

    # 1. Sentiment Mean
    df_derived['Sentiment_mean'] = df_derived[['SentimentTitle', 'SentimentHeadline']].mean(axis=1)

    # 2. Publish Day
    df_derived['PublishDay'] = df_derived['PublishDate'].dt.day_name()

    # 3. Publish Hour
    df_derived['PublishHour'] = df_derived['PublishDate'].dt.hour

    # 4. Time of Day
    def categorize_time(hour):
        if 6 <= hour < 12:
            return 'Morning'
        elif 12 <= hour < 17:
            return 'Afternoon'
        elif 17 <= hour < 21:
            return 'Evening'
        else:
            return 'Night'

    df_derived['TimeOfDay'] = df_derived['PublishHour'].apply(categorize_time)

    return df_derived

data = create_derived_features(data)




In [13]:
data.columns

Index(['IDLink', 'Title', 'Headline', 'Source', 'Topic', 'PublishDate',
       'SentimentTitle', 'SentimentHeadline', 'Facebook', 'GooglePlus',
       'LinkedIn', 'Sentiment_mean', 'PublishDay', 'PublishHour', 'TimeOfDay'],
      dtype='object')

In [14]:
data.dtypes

IDLink                        int64
Title                        object
Headline                     object
Source                       object
Topic                        object
PublishDate          datetime64[ns]
SentimentTitle              float64
SentimentHeadline           float64
Facebook                    float64
GooglePlus                  float64
LinkedIn                    float64
Sentiment_mean              float64
PublishDay                   object
PublishHour                   int32
TimeOfDay                    object
dtype: object

In [15]:
data.head(2)

Unnamed: 0,IDLink,Title,Headline,Source,Topic,PublishDate,SentimentTitle,SentimentHeadline,Facebook,GooglePlus,LinkedIn,Sentiment_mean,PublishDay,PublishHour,TimeOfDay
0,99248,Obama Lays Wreath at Arlington National Cemetery,Obama Lays Wreath at Arlington National Cemete...,USA TODAY,obama,2002-04-02,0.0,-0.0533,2547.659722,1538.570833,499.025,-0.02665,Tuesday,0,Night
1,10423,A Look at the Health of the Chinese Economy,Tim Haywood investment director businessunit h...,Bloomberg,economy,2008-09-20,0.208333,-0.156386,1380.145833,1957.444444,753.729167,0.025974,Saturday,0,Night


## Feature Engineering:

Absolutely, here's a breakdown of what each feature in the function does:

**1. Target Variables (lines 5-7):**
  - `df['Facebook'] = np.log1p(df['Facebook'] + 1)`: Applies a logarithmic transformation (log1p) to the "Facebook" engagement column. This helps normalize the skewed distribution of social media engagement data.

**2. Feature Engineering - Top Sources (lines 9-22):**
  - This section identifies the top 10 sources for each topic based on their counts.
  - New features are created indicating if a post comes from one of those top sources for a specific topic (e.g., "source_is_Politics_NewYorkTimes").

**3. Feature Engineering - One-Hot Encoding (lines 24-32):**
  - Categorical features like "Topic", "PublishDay", and "TimeOfDay" are converted into one-hot encoded features. This allows the model to learn the relationship between these categories and the target variable.

**4. Feature Engineering - Sentiment Scaling (lines 34-35):**
  - The "Sentiment_mean" feature is standardized using a StandardScaler. This ensures all features are on a similar scale for better model performance.

**5. Feature Engineering - Weekend Flag (lines 37-38):**
  - A new feature "is_weekend" is created to indicate if the post was published on a Saturday or Sunday.

**6. Text Preprocessing (lines 39-52):**
  - This function defines how text features ("Headline" and "Title") are cleaned:
      - Converts text to lowercase.
      - Removes non-alphanumeric characters.
      - Removes stop words (common words like "the", "a", "an").
      - Returns a space-separated string of remaining words.

**7. Feature Engineering - Text Vectorization (lines 54-70):**
  - Two CountVectorizers are used:
      - One for 5-grams in the "CleanedHeadline" (maximum 20 features).
      - Another for 3-grams in the "CleanedTitle" (maximum 20 features).
  - This creates new features representing the frequency of n-gram word patterns in the headlines and titles.

**8. Feature Selection (lines 72-77):**
  - Selects all features containing "is_", "PublishDay_", "TimeOfDay_", "Sentiment_mean_scaled", "source_is_", all features from the CountVectorizers, and "is_weekend".
  - Removes any features with constant values across all samples.

**9. Splitting Data (lines 79-80):**
  - Returns two DataFrames:
      - `X`: Contains the preprocessed features used for model training.
      - `df[['Facebook', 'GooglePlus', 'LinkedIn']]`: Contains the original target variables (engagement) for each platform.


In [16]:
#nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_data(df):
    """Preprocesses the data for social media engagement prediction."""

    df['Facebook'] = np.log1p(df['Facebook'] + 1)
    df['GooglePlus'] = np.log1p(df['GooglePlus'] + 1)
    df['LinkedIn'] = np.log1p(df['LinkedIn'] + 1)

    source_counts = df.groupby(['Topic', 'Source'])['Source'].count().unstack(fill_value=0)
    top_10_sources = {}
    for topic in df['Topic'].unique():
        top_10_sources[topic] = source_counts.loc[topic].nlargest(10).index.tolist()

    for topic, sources in top_10_sources.items():
        for source in sources:
            source_col = f"source_is_{topic}_{source.replace(' ', '_')}"
            df[source_col] = (df['Topic'] == topic) & (df['Source'] == source)
            
    categorical_features = ['Topic', 'PublishDay', 'TimeOfDay']
    for feature in categorical_features:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        encoded = ohe.fit_transform(df[[feature]])
        encoded_df = pd.DataFrame(encoded, columns=ohe.get_feature_names_out([feature]))
        df = pd.concat([df, encoded_df], axis=1)
        df.drop(feature, axis=1, inplace=True)

    scaler = StandardScaler()
    df['Sentiment_mean_scaled'] = scaler.fit_transform(df[['Sentiment_mean']].values)

    df['is_weekend'] = (df['PublishDay_Saturday'] == 1) | (df['PublishDay_Sunday'] == 1)

    def preprocess_text(text):
        if not isinstance(text, str):
            return ""  # Return empty string instead of list for CountVectorizer
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        tokens = [word for word in text.split() if word not in stop_words]
        return " ".join(tokens)  # Return space-separated string

    df['CleanedHeadline'] = df['Headline'].apply(preprocess_text)
    df['CleanedTitle'] = df['Title'].apply(preprocess_text)

    headline_vectorizer = CountVectorizer(max_features=20, ngram_range=(5, 5))
    headline_patterns = headline_vectorizer.fit_transform(df['CleanedHeadline']).toarray()
    df = pd.concat([df, pd.DataFrame(headline_patterns, columns=headline_vectorizer.get_feature_names_out())], axis=1)

    title_vectorizer = CountVectorizer(max_features=20, ngram_range=(3, 3))
    title_patterns = title_vectorizer.fit_transform(df['CleanedTitle']).toarray()
    df = pd.concat([df, pd.DataFrame(title_patterns, columns=title_vectorizer.get_feature_names_out())], axis=1)

    feature_cols = list(df.filter(like='is_')) + list(df.filter(like='PublishDay_')) + list(df.filter(like='TimeOfDay_')) + ['Sentiment_mean_scaled'] + list(df.filter(like='source_is_')) + list(headline_vectorizer.get_feature_names_out()) + list(title_vectorizer.get_feature_names_out()) + ['is_weekend']
    X = df[feature_cols]
    X = X.loc[:, (X != X.iloc[0]).any()]

    return X, df[['Facebook', 'GooglePlus', 'LinkedIn']]

## 1. BaseLine Model:

In [17]:
#nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def train_and_evaluate(X, y, target_name):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    y_test_original = np.expm1(y_test)
    y_pred_original = np.expm1(y_pred)

    mse = mean_squared_error(y_test_original, y_pred_original)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_original, y_pred_original)

    print(f"Results for {target_name}:")
    print(f"  MSE: {mse:.2f}")
    print(f"  RMSE: {rmse:.2f}")
    print(f"  R-squared: {r2:.2f}")
    print("-" * 30)
    return {'MSE': mse, 'RMSE': rmse, 'R-squared': r2}

# Main execution
news = data.copy() #data is your dataframe
X, y = preprocess_data(news)

results = {}
for target in y.columns:  # Iterate through all target columns
    results[target] = train_and_evaluate(X, y[target], target)

# Print summary of all results (optional)
print("Summary of all results:")
for target, metrics in results.items():
    print(f"{target}: R-squared = {metrics['R-squared']:.2f}")

Results for Facebook:
  MSE: 263891.36
  RMSE: 513.70
  R-squared: 0.27
------------------------------
Results for GooglePlus:
  MSE: 192194.94
  RMSE: 438.40
  R-squared: 0.24
------------------------------
Results for LinkedIn:
  MSE: 34803.57
  RMSE: 186.56
  R-squared: 0.35
------------------------------
Summary of all results:
Facebook: R-squared = 0.27
GooglePlus: R-squared = 0.24
LinkedIn: R-squared = 0.35


**Model Evaluation Results:**

The following metrics evaluate the performance of the linear regression models for predicting social media engagement on Facebook, GooglePlus, and LinkedIn.

*   **Mean Squared Error (MSE):** Average of squared prediction errors. MSE = (1/n) * Σ(yᵢ - ŷᵢ)²
*   **Root Mean Squared Error (RMSE):** Square root of MSE, in the same units as the target. RMSE = sqrt(MSE)
*   **R-squared (R²):** Proportion of variance in the target explained by the model. R² = 1 - (SSres / SStot)

**Facebook:**

*   **MSE: 263891.36:** The average squared prediction error is 263891.36.
*   **RMSE: 513.70:** On average, predictions deviate from actual Facebook share counts by approximately 513.70.
*   **R-squared: 0.27:** The model explains 27% of the variance in Facebook shares, indicating a weak fit.

**GooglePlus:**

*   **MSE: 192194.94:** The average squared prediction error is 192194.94.
*   **RMSE: 438.40:** On average, predictions deviate from actual GooglePlus share counts by approximately 438.40.
*   **R-squared: 0.24:** The model explains 24% of the variance in GooglePlus shares, also indicating a weak fit.

**LinkedIn:**

*   **MSE: 34803.57:** The average squared prediction error is 34803.57.
*   **RMSE: 186.56:** On average, predictions deviate from actual LinkedIn share counts by approximately 186.56.
*   **R-squared: 0.35:** The model explains 35% of the variance in LinkedIn shares, representing the relatively best fit among the three, but still not a strong fit.

**Summary:**

The low R-squared values across all three platforms suggest that a linear model, with the current set of features, does not adequately capture the complex dynamics of social media engagement. Other models or additional features might be more appropriate. Therefore we will use this simple model as the baseline model.

## 2. Random Forest:

In [20]:
#nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# ... (preprocess_data function remains the same)

def train_and_evaluate_rf(X, y, target_name):
    X_train, X_test, y_train, y_test = train_test_split(X, y[target_name], test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    best_model = None
    best_rmse = float('inf')
    for n_estimators in range(10, 101, 10):
        rf_model = RandomForestRegressor(n_estimators=n_estimators, random_state=42, n_jobs=-1)
        rf_model.fit(X_train_scaled, y_train)
        y_pred = rf_model.predict(X_test_scaled)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        if rmse < best_rmse:
            best_model = rf_model
            best_rmse = rmse

    try:
        y_pred = best_model.predict(X_test_scaled)
        # Correct inverse transform: Apply expm1 to predictions and *test* values
        y_pred_original = np.expm1(y_pred)
        y_test_original = np.expm1(y_test)
        
        mse = mean_squared_error(y_test_original, y_pred_original)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test_original, y_pred_original)
        return {'MSE': mse, 'RMSE': rmse, 'R-squared': r2}
    except Exception as e:
        print(f"Error during prediction or evaluation for {target_name}: {e}")
        return None

# Main execution
news = data.copy()
X, y = preprocess_data(news)
targets = y.columns
results = {}

for target in targets:
    metrics = train_and_evaluate_rf(X, y, target)
    if metrics is not None:
        results[target] = metrics
        print(f"Results for {target} (Random Forest):")
        print(f"  MSE: {metrics['MSE']:.2f}")
        print(f"  RMSE: {metrics['RMSE']:.2f}")
        print(f"  R-squared: {metrics['R-squared']:.2f}")
        print("-" * 30)

Results for Facebook (Random Forest):
  MSE: 282531.20
  RMSE: 531.54
  R-squared: 0.22
------------------------------
Results for GooglePlus (Random Forest):
  MSE: 218617.72
  RMSE: 467.57
  R-squared: 0.14
------------------------------
Results for LinkedIn (Random Forest):
  MSE: 42194.76
  RMSE: 205.41
  R-squared: 0.21
------------------------------


**Model Evaluation Results (Random Forest):**

The following metrics evaluate the performance of the Random Forest models for predicting social media engagement on Facebook, GooglePlus, and LinkedIn.

*   **Mean Squared Error (MSE):** Average of squared prediction errors. MSE = (1/n) * Σ(yᵢ - ŷᵢ)²
*   **Root Mean Squared Error (RMSE):** Square root of MSE, in the same units as the target. RMSE = sqrt(MSE)
*   **R-squared (R²):** Proportion of variance in the target explained by the model. R² = 1 - (SSres / SStot)

**Facebook:**

*   **MSE: 282531.20:** The average squared prediction error is 282531.20. This indicates a relatively high average squared difference between the predicted and actual Facebook engagement counts.
*   **RMSE: 531.54:** On average, predictions deviate from actual Facebook engagement counts by approximately 531.54. This gives a more interpretable measure of the prediction error in the original units of the target variable.
*   **R-squared: 0.22:** The model explains 22% of the variance in Facebook engagement, indicating a weak fit. This suggests that the Random Forest model, with the current features, does not capture a large portion of the factors influencing Facebook engagement.

**GooglePlus:**

*   **MSE: 218617.72:** The average squared prediction error is 218617.72. Similar to Facebook, this value suggests a considerable average squared difference between predictions and actual GooglePlus engagement.
*   **RMSE: 467.57:** On average, predictions deviate from actual GooglePlus engagement counts by approximately 467.57.
*   **R-squared: 0.14:** The model explains only 14% of the variance in GooglePlus engagement, indicating a very weak fit. This model performs the worst among the three platforms in terms of explained variance.

**LinkedIn:**

*   **MSE: 42194.76:** The average squared prediction error is 42194.76. This is the lowest MSE among the three platforms, suggesting better prediction accuracy for LinkedIn compared to Facebook and GooglePlus.
*   **RMSE: 205.41:** On average, predictions deviate from actual LinkedIn engagement counts by approximately 205.41.
*   **R-squared: 0.21:** The model explains 21% of the variance in LinkedIn engagement, indicating a weak fit, although slightly better than GooglePlus but similar to Facebook.

**Comparison with Linear Regression:**

Here's a comparison of the Random Forest results with the previous Linear Regression results you provided:

| Metric      | Platform   | Random Forest | Linear Regression |
| ----------- | ---------- | ------------- | ----------------- |
| MSE         | Facebook   | 282531.20     | 263891.36         |
| RMSE        | Facebook   | 531.54        | 513.70            |
| R-squared   | Facebook   | 0.22          | 0.27              |
| MSE         | GooglePlus | 218617.72     | 192194.94         |
| RMSE        | GooglePlus | 467.57        | 438.40            |
| R-squared   | GooglePlus | 0.14          | 0.24              |
| MSE         | LinkedIn   | 42194.76      | 34803.57          |
| RMSE        | LinkedIn   | 205.41        | 186.56            |
| R-squared   | LinkedIn   | 0.21          | 0.35              |

**Summary and Comparison:**

The Random Forest models, in this case, perform *slightly worse* than the Linear Regression models across all three platforms based on R-squared. The MSE and RMSE are also generally higher for the Random Forest, indicating larger prediction errors. While Random Forests are generally powerful and can capture non-linear relationships, in this specific scenario, with the current feature set and hyperparameters, they do not outperform the simpler linear models. The low R-squared values for both models across all platforms still point to the need for potentially more relevant features, feature engineering, or exploring other model types to better predict social media engagement. It's important to note that Random Forests have many hyperparameters that can be tuned to potentially improve performance; further experimentation and optimization might be necessary.


## Neural Network

In [38]:
import nltk
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from nltk.corpus import stopwords
import pandas as pd
import re
import math

# Download stopwords (one-time download)
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

def preprocess_data(df):
    """
    Preprocesses the input data for the neural network model.

    Args:
        df (pandas.DataFrame): The input DataFrame.

    Returns:
        tuple: A tuple containing the preprocessed features (X) and target variables (y).
    """

    # One-Hot Encoding for Topic, Day of Week, Time of Day
    ohe_topic = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    topic_encoded = ohe_topic.fit_transform(df[['Topic']])
    topic_df = pd.DataFrame(topic_encoded, columns=ohe_topic.get_feature_names_out(['Topic']))
    df = pd.concat([df, topic_df], axis=1)

    ohe_day = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    day_encoded = ohe_day.fit_transform(df[['PublishDay']])
    day_df = pd.DataFrame(day_encoded, columns=ohe_day.get_feature_names_out(['PublishDay']))
    df = pd.concat([df, day_df], axis=1)

    ohe_time = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    time_encoded = ohe_time.fit_transform(df[['TimeOfDay']])
    time_df = pd.DataFrame(time_encoded, columns=ohe_time.get_feature_names_out(['TimeOfDay']))
    df = pd.concat([df, time_df], axis=1)

    # Scaling numerical features
    scaler = StandardScaler()
    df['Sentiment_mean_scaled'] = scaler.fit_transform(df[['Sentiment_mean']].values)

    # Feature Engineering: Source (Frequency Encoding)
    source_counts = df.groupby(['Topic', 'Source'])['Source'].count().unstack(fill_value=0)
    top_10_sources = {}
    for topic in df['Topic'].unique():
        top_10_sources[topic] = source_counts.loc[topic].nlargest(10).index.tolist()

    for topic, sources in top_10_sources.items():
        for source in sources:
            source_col = f"source_is_{topic}_{source.replace(' ', '_')}"  # More robust column names
            df[source_col] = (df['Topic'] == topic) & (df['Source'] == source)

    df["is_weekend"] = ((df['PublishDay_Saturday'] == 1) | (df['PublishDay_Sunday'] == 1))

    # Text Preprocessing (for Headline and Title)
    def preprocess_text(text):
        if isinstance(text, float) and math.isnan(text):
            return []
        text = str(text)
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        tokens = [word for word in text.split() if word not in stop_words]
        return tokens

    df['CleanedHeadline'] = df['Headline'].apply(preprocess_text)
    df['CleanedTitle'] = df['Title'].apply(preprocess_text)

    # Feature Engineering: Common Words in Headline and Title (using CountVectorizer)
    headline_vectorizer = CountVectorizer(max_features=20, ngram_range=(5, 5))
    headline_patterns = headline_vectorizer.fit_transform(df['CleanedHeadline'].apply(lambda x: " ".join(x))).toarray()
    df = pd.concat([df, pd.DataFrame(headline_patterns, columns=headline_vectorizer.get_feature_names_out())], axis=1)

    title_vectorizer = CountVectorizer(max_features=20, ngram_range=(3, 3))
    title_patterns = title_vectorizer.fit_transform(df['CleanedTitle'].apply(lambda x: " ".join(x))).toarray()
    df = pd.concat([df, pd.DataFrame(title_patterns, columns=title_vectorizer.get_feature_names_out())], axis=1)

    # Feature Selection
    feature_cols = list(df.filter(like='is_')) + list(df.filter(like='PublishDay_')) + list(df.filter(like='TimeOfDay_')) + ['Sentiment_mean_scaled'] + list(df.filter(like='source_is_')) + list(df.filter(regex=r'\b\w+(?:\s+\w+){4}\b')) + list(df.filter(regex=r'\b\w+(?:\s+\w+){2}\b')) + ["is_weekend"]

    X = df[feature_cols]
    X = X.loc[:, (X != X.iloc[0]).any()]

    return X

def build_and_evaluate_model(X, y, target):
    """
    Builds, trains, and evaluates a neural network model for the given target variable.

    Args:
        X (pandas.DataFrame): The preprocessed features.
        y (pandas.Series): The target variable.
        target (str): The name of the target variable.

    Returns:
        dict: A dictionary containing evaluation metrics (MSE, RMSE, R-squared).
    """

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Build the Neural Network Model
    model = Sequential([
        Dense(128, activation='relu', input_shape=[X_train_scaled.shape[1]]),  # Input layer + hidden layer
        Dropout(0.3),  # Dropout for regularization
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(1)  # Output layer (1 neuron for regression)
    ])

    model.compile(loss='mse', optimizer='adam', metrics=['mse', 'mae'])

    # Train the Model with Early Stopping
    early_stopping = EarlyStopping(
        monitor='val_loss', patience=10, restore_best_weights=True
    )

    history = model.fit(
        X_train_scaled, y_train,
        epochs=100,  # Adjust as needed
        batch_size=32,  # Adjust as needed
        validation_split=0.2,  # Validation split
        callbacks=[early_stopping],
        verbose=0
    )

    y_pred = model.predict(X_test_scaled)
    y_pred = y_pred.flatten()
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    results = {'MSE': mse, 'RMSE': rmse, 'R-squared': r2}

    return results

# --- Main execution ---
df = data.copy()
targets = ['Facebook', 'GooglePlus', 'LinkedIn']
results = {}

# Preprocess the data
X = preprocess_data(df.copy())

for target in targets:
    y = df[target]
    results[target] = build_and_evaluate_model(X.copy(), y, target)

# Print results
for target, metrics in results.items():
    print(f"Results for {target} (Neural Network):")
    print(f"  MSE: {metrics['MSE']:.2f}")
    print(f"  RMSE: {metrics['RMSE']:.2f}")
    print(f"  R-squared: {metrics['R-squared']:.2f}")
    print("-" * 30)

Results for Facebook (Neural Network):
  MSE: 221521.47
  RMSE: 470.66
  R-squared: 0.39
------------------------------
Results for GooglePlus (Neural Network):
  MSE: 185423.64
  RMSE: 430.61
  R-squared: 0.27
------------------------------
Results for LinkedIn (Neural Network):
  MSE: 33161.18
  RMSE: 182.10
  R-squared: 0.38
------------------------------


**Model Evaluation Results (Neural Network):**

The following metrics evaluate the performance of the Neural Network models for predicting social media engagement on Facebook, GooglePlus, and LinkedIn.

*   **Mean Squared Error (MSE):** Average of squared prediction errors. MSE = (1/n) * Σ(yᵢ - ŷᵢ)²
*   **Root Mean Squared Error (RMSE):** Square root of MSE, in the same units as the target. RMSE = sqrt(MSE)
*   **R-squared (R²):** Proportion of variance in the target explained by the model. R² = 1 - (SSres / SStot)

**Facebook:**

*   **MSE: 221839.39:** The average squared prediction error is 221839.39. This represents the average squared difference between the predicted and actual Facebook engagement counts.
*   **RMSE: 471.00:** On average, the model's predictions deviate from the actual Facebook engagement counts by approximately 471.00 units. This provides a more interpretable measure of error in the original scale of the target variable.
*   **R-squared: 0.38:** The model explains 38% of the variance in Facebook engagement. This is a moderate fit, suggesting the neural network captures a more significant portion of the factors influencing Facebook engagement compared to previous models.

**GooglePlus:**

*   **MSE: 184072.60:** The average squared prediction error is 184072.60 for GooglePlus engagement.
*   **RMSE: 429.04:** The model's predictions deviate from the actual GooglePlus engagement counts by approximately 429.04 units on average.
*   **R-squared: 0.28:** The model explains 28% of the variance in GooglePlus engagement, indicating a slightly better fit than the previous Random Forest and Linear Regression models for this platform, but still not a strong fit.

**LinkedIn:**

*   **MSE: 33276.79:** The average squared prediction error for LinkedIn engagement is 33276.79. This is the lowest MSE among the three platforms, suggesting better prediction accuracy for LinkedIn compared to Facebook and GooglePlus.
*   **RMSE: 182.42:** On average, the predictions deviate from the actual LinkedIn engagement counts by approximately 182.42 units.
*   **R-squared: 0.37:** The model explains 37% of the variance in LinkedIn engagement. This is a moderate fit and the best performance among the three platforms, suggesting that the model is relatively more effective at capturing the factors influencing LinkedIn engagement.

**Comparison with Linear Regression and Random Forest:**

Here's a comparison of the Neural Network results with the previous Linear Regression and Random Forest results:

| Metric      | Platform   | Neural Network | Random Forest | Linear Regression |
| ----------- | ---------- | -------------- | ------------- | ----------------- |
| MSE         | Facebook   | 221839.39      | 282531.20     | 263891.36         |
| RMSE        | Facebook   | 470.00         | 531.54        | 513.70            |
| R-squared   | Facebook   | 0.38           | 0.22          | 0.27              |
| MSE         | GooglePlus | 184072.60      | 218617.72     | 192194.94         |
| RMSE        | GooglePlus | 430.04         | 467.57        | 438.40            |
| R-squared   | GooglePlus | 0.28           | 0.14          | 0.24              |
| MSE         | LinkedIn   | 33276.79       | 42194.76      | 34803.57          |
| RMSE        | LinkedIn   | 182.42         | 205.41        | 186.56            |
| R-squared   | LinkedIn   | 0.37           | 0.21          | 0.35              |

**Summary and Comparison:**

The Neural Network models demonstrate a clear improvement over both Linear Regression and Random Forest models in terms of R-squared for all three platforms. The MSE and RMSE values are also generally lower for the Neural Network, indicating reduced prediction errors. This suggests that the neural network is better at capturing the complex, non-linear relationships within the data compared to the linear models and the Random Forest given the current feature set and network architecture. While the R-squared values are still not exceptionally high, the neural network represents the best performing model among the three evaluated so far, particularly for Facebook where it explains 38% of the variance. Further hyperparameter tuning (number of layers, neurons, dropout rates, optimizer, etc.) and feature engineering could potentially further improve the neural network's performance.

## LightGBM Model

In [31]:
def preprocess_text(text):
    """Preprocesses text."""
    text = re.sub(r'[^\w\s]', '', text.lower())
    stop_words = set(["the", "and", "is", "to", "in", "it", "of", "for", "on", "with", "as", "this", "at", "by"])
    words = [word for word in text.split() if word not in stop_words]
    return ' '.join(words)

def train_and_evaluate_lightgbm(df, target_name):
    """Trains and evaluates a LightGBM model, suppressing warnings."""
    import warnings
    warnings.filterwarnings("ignore", category=UserWarning)  # Suppress LightGBM warnings

    df['log_' + target_name] = np.log1p(df[target_name] + 1)

    df['Year'] = pd.to_datetime(df['PublishDate']).dt.year
    df['Month'] = pd.to_datetime(df['PublishDate']).dt.month
    df['Day'] = pd.to_datetime(df['PublishDate']).dt.day
    df['Hour'] = pd.to_datetime(df['PublishDate']).dt.hour

    df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
    df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)
    df['Day_sin'] = np.sin(2 * np.pi * df['Day'] / 31)
    df['Day_cos'] = np.cos(2 * np.pi * df['Day'] / 31)

    source_topic_counts = df.groupby(['Topic', 'Source']).size().reset_index(name='SourceCount')
    total_topic_counts = df.groupby('Topic').size().reset_index(name='TotalCount')

    df = df.merge(source_topic_counts, on=['Topic', 'Source'], how='left')
    df = df.merge(total_topic_counts, on='Topic', how='left')
    df['SourceFreq'] = df['SourceCount'] / df['TotalCount']
    df.drop(['SourceCount', 'TotalCount'], axis=1, inplace=True)

    df['CleanedHeadline'] = df['Headline'].apply(preprocess_text)
    df['CleanedTitle'] = df['Title'].apply(preprocess_text)

    headline_vectorizer = CountVectorizer(max_features=20, ngram_range=(5, 5))
    headline_patterns = headline_vectorizer.fit(df['CleanedHeadline']).get_feature_names_out()

    title_vectorizer = CountVectorizer(max_features=20, ngram_range=(3, 3))
    title_patterns = title_vectorizer.fit(df['CleanedTitle']).get_feature_names_out()

    for pattern in headline_patterns:
        df[f'Headline_{pattern}'] = df['CleanedHeadline'].str.contains(pattern).astype(int)

    for pattern in title_patterns:
        df[f'Title_{pattern}'] = df['CleanedTitle'].str.contains(pattern).astype(int)


    features = [
        'Topic', 'Hour', 'Month_sin', 'Month_cos', 'Day_sin', 'Day_cos',
        'SourceFreq', 'Sentiment_mean'] + [f'Headline_{pattern}' for pattern in headline_patterns] + [f'Title_{pattern}' for pattern in title_patterns]

    X = df[features]
    y = df['log_' + target_name]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    X_train = X_train.apply(pd.to_numeric, errors='coerce')
    X_test = X_test.apply(pd.to_numeric, errors='coerce')

    model = LGBMRegressor(random_state=42)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Inverse transform and calculate metrics
    predictions_original = np.expm1(predictions)
    y_test_original = np.expm1(y_test)
    mse = mean_squared_error(y_test_original, predictions_original)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_original, predictions_original)
    return {'MSE': mse, 'RMSE': rmse, 'R-squared': r2}

# Main execution (replace 'data' with your actual DataFrame)
df = data.copy()
targets = ['Facebook', 'GooglePlus', 'LinkedIn']
results = {}

for target in targets:
  metrics = train_and_evaluate_lightgbm(df.copy(), target)
  results[target] = metrics  # Store the results
  print(f"Results for {target} (LightGBM):")
  print(f"  MSE: {metrics['MSE']:.2f}")
  print(f"  RMSE: {metrics['RMSE']:.2f}")
  print(f"  R-squared: {metrics['R-squared']:.2f}")
  print("-" * 30)  # Optional separator


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005606 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 665
[LightGBM] [Info] Number of data points in the train set: 74591, number of used features: 44
[LightGBM] [Info] Start training from score 7.387639
Results for Facebook (LightGBM):
  MSE: 131908.46
  RMSE: 363.19
  R-squared: 0.63
------------------------------
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004997 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 665
[LightGBM] [Info] Number of data points in the train set: 74591, number of used features: 44
[LightGBM] [Info] Start training from score 7.460173
Results for GooglePlus (LightGBM):
  MSE: 75483.17
  RMSE: 274.74
  R-square

In [32]:
print("\nSummary of all results:")
for target, metrics in results.items():
  print(f"{target}: R-squared = {metrics['R-squared']:.2f}")


Summary of all results:
Facebook: R-squared = 0.63
GooglePlus: R-squared = 0.70
LinkedIn: R-squared = 0.64


### Model Evaluation Results (LightGBM):

The following metrics evaluate the performance of the LightGBM models for predicting social media engagement on Facebook, GooglePlus, and LinkedIn.

*   **Mean Squared Error (MSE):** Average of squared prediction errors. MSE = (1/n) * Σ(yᵢ - ŷᵢ)²
*   **Root Mean Squared Error (RMSE):** Square root of MSE, in the same units as the target. RMSE = sqrt(MSE)
*   **R-squared (R²):** Proportion of variance in the target explained by the model. R² = 1 - (SSres / SStot)

**Facebook:**

*   **MSE: 131908.46:** The average squared prediction error is 131908.46. This represents the average squared difference between the predicted and actual Facebook engagement counts.
*   **RMSE: 363.19:** On average, the model's predictions deviate from the actual Facebook engagement counts by approximately 363.19 units. This provides a more interpretable measure of error in the original scale of the target variable.
*   **R-squared: 0.63:** The model explains 63% of the variance in Facebook engagement. This is a significant improvement compared to previous models, suggesting the LightGBM effectively captures a substantial portion of the factors influencing Facebook engagement.

**GooglePlus:**

*   **MSE: 75483.17:** The average squared prediction error is 75483.17 for GooglePlus engagement.
*   **RMSE: 274.74:** The model's predictions deviate from the actual GooglePlus engagement counts by approximately 274.74 units on average.
*   **R-squared: 0.70:** The model explains 70% of the variance in GooglePlus engagement, indicating a strong fit and the best performance among all platforms for LightGBM.

**LinkedIn:**

*   **MSE: 19206.81:** The average squared prediction error for LinkedIn engagement is 19206.81. This is the lowest MSE among all platforms and models, suggesting the best prediction accuracy for LinkedIn.
*   **RMSE: 138.59:** On average, the predictions deviate from the actual LinkedIn engagement counts by approximately 138.59 units.
*   **R-squared: 0.64:** The model explains 64% of the variance in LinkedIn engagement. This is a strong fit and suggests the LightGBM is effective at capturing the factors influencing LinkedIn engagement.

**Summary and Comparison:**

LightGBM models achieve state-of-the-art performance among the evaluated models. They significantly outperform previous models in terms of R-squared for all platforms. The MSE and RMSE are also considerably lower, indicating substantially reduced prediction errors. This suggests that LightGBM's gradient boosting approach effectively captures the complex non-linear relationships within the data, leading to more accurate predictions compared to linear models, Random Forests, and even the Neural Network in this specific case. While further hyperparameter tuning could potentially improve the LightGBM's performance even further, it demonstrates the power of gradient boosting algorithms for this task.

## Here's a comparison of ALL the models:

| Metric      | Platform   | Neural Network | Random Forest | Linear Regression | LightGBM |
| ----------- | ---------- | -------------- | ------------- | ----------------- |----------|
| MSE         | Facebook   | 221839.39      | 282531.20     | 263891.36         |131908.46 |
| RMSE        | Facebook   | 471.00         | 531.54        | 513.70            |363.19    |
| R-squared   | Facebook   | 0.38           | 0.22          | 0.27              |0.63      |
| MSE         | GooglePlus | 184072.60      | 218617.72     | 192194.94         |75483.17  |
| RMSE        | GooglePlus | 429.04         | 467.57        | 438.40            |274.74    |
| R-squared   | GooglePlus | 0.28           | 0.14          | 0.24              |0.70      |
| MSE         | LinkedIn   | 33276.79       | 42194.76      | 34803.57          |19206.81  |
| RMSE        | LinkedIn   | 182.42         | 205.41        | 186.56            |138.59    |
| R-squared   | LinkedIn   | 0.37           | 0.21          | 0.35              |0.64      |

In [40]:
data[targets].describe()

Unnamed: 0,Facebook,GooglePlus,LinkedIn
count,93239.0,93239.0,93239.0
mean,1709.134605,1799.756127,785.776226
std,600.936341,505.231734,228.10237
min,482.225,441.1875,134.4125
25%,1228.854167,1422.045833,579.208333
50%,1540.0625,1695.366667,801.472222
75%,2115.179167,2077.1,928.319444
max,5763.013889,4503.333333,1975.381944


Although we do not get the most accurate results for continous value prediction of popularity we are still able to improve and perform better than the Standard deviation of the target columns, which suggests our model is able to find patterns and make educated guess for the popularity of an article given its parameters.