<a href="https://colab.research.google.com/github/AbhinayaReddyMalapati/calculator/blob/main/NER_and_Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Quick look at the column names of the raw dataset.
# This cell sits above the cell that loads `data`, so on a fresh kernel the
# name would not exist yet. Load the CSV here so that Restart & Run All works;
# the pipeline cell below reads the same file again, so this is idempotent.
import pandas as pd

data = pd.read_csv("/content/gossipcop_fake.csv")
print(data.columns)


Index(['id', 'news_url', 'title', 'tweet_ids'], dtype='object')


In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import re
import spacy
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load Dataset
file_path = "/content/gossipcop_fake.csv"
data = pd.read_csv(file_path)

# Display dataset info
print("Dataset Columns:", data.columns)
print("First 5 Rows of Dataset:\n", data.head())

# Identify the text column.
# The first column of this file is 'id' (values like 'gossipcop-2493749932'),
# so blindly taking data.columns[0] would feed identifiers into text cleaning,
# NER and sentiment scoring. Prefer a column that actually holds text.
preferred_text_columns = ('title', 'text', 'content')
text_column = next(
    (name for name in preferred_text_columns if name in data.columns),
    None,
)
if text_column is None:
    # Keep the original best-effort fallback, but make it visible so the
    # reader knows the downstream features may be meaningless.
    text_column = data.columns[0]
    print(
        f"WARNING: none of {preferred_text_columns} found in the dataset; "
        "falling back to the first column."
    )
print(f"Using '{text_column}' as the text column.")

# Text Preprocessing
def preprocess_text(text):
    """Normalize a raw text value for feature extraction.

    Steps, in order: strip HTML-like tags, drop every character that is
    neither a word character nor whitespace, collapse whitespace runs into a
    single space, trim both ends, and lowercase.

    Args:
        text: Any value. Non-string inputs are converted with ``str()``.
            ``None`` and float NaN are treated as missing.

    Returns:
        str: The cleaned, lowercased text; an empty string for missing input.
    """
    # Without this guard str(None) / str(nan) would inject the literal words
    # "none" / "nan" into the corpus and skew length and sentiment features.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = re.sub(r'<.*?>', '', str(text))  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text.lower()  # Normalize text to lowercase

# Clean every value of the selected text column and preview the result.
data['cleaned_text'] = data[text_column].map(preprocess_text)
print("Cleaned Text (First 5 Rows):\n", data.head()[['cleaned_text']])

# Named Entity Recognition (NER): load the spaCy `en_core_web_sm` pipeline
# once; `extract_entities` uses this module-level object.
nlp = spacy.load("en_core_web_sm")

def extract_entities(text):
    """Count ORG, PERSON and GPE entities found in ``text``.

    Args:
        text: String passed to the module-level ``nlp`` pipeline.

    Returns:
        dict: ``{'ORG': int, 'PERSON': int, 'GPE': int}``. Labels with no
        matches stay at 0; entities with any other label are ignored.
    """
    tracked_labels = ('ORG', 'PERSON', 'GPE')
    found_labels = [span.label_ for span in nlp(text).ents]
    return {label: found_labels.count(label) for label in tracked_labels}

# Run NER on the original-case text instead of `cleaned_text`.
# preprocess_text lowercases everything and strips punctuation, which removes
# capitalization and punctuation cues that NER models typically rely on.
# NOTE(review): confirm on a sample that entity counts improve for this data.
# Missing values become '' and HTML-like tags are still stripped, but casing
# and punctuation are preserved.
ner_input = (
    data[text_column]
    .fillna('')
    .astype(str)
    .str.replace(r'<.*?>', '', regex=True)
)
data['entities'] = ner_input.apply(extract_entities)
print("Extracted Entities (First 5 Rows):\n", data[['entities']].head())

# Expand the per-row count dict into one numeric column per entity label.
for entity_label, count_column in (
    ('ORG', 'org_count'),
    ('PERSON', 'person_count'),
    ('GPE', 'gpe_count'),
):
    # Bind the label as a default argument so each lambda keeps its own value.
    data[count_column] = data['entities'].map(
        lambda counts, entity_label=entity_label: counts[entity_label]
    )
print("Entity Counts (First 5 Rows):\n", data[['org_count', 'person_count', 'gpe_count']].head())

# Feature Engineering
# article_length: number of whitespace-separated tokens in the cleaned text.
data['article_length'] = [len(doc.split()) for doc in data['cleaned_text']]
# sentiment: TextBlob polarity score of the cleaned text.
data['sentiment'] = [TextBlob(doc).sentiment.polarity for doc in data['cleaned_text']]
print("Article Length and Sentiment (First 5 Rows):\n", data.head()[['article_length', 'sentiment']])

# Target: use the 'engagement_metric' column when the dataset has one
# (replace the name with the correct column if it differs); otherwise y is None.
y = data.get('engagement_metric')
if y is None:
    print("No engagement metric found. Add appropriate column or modify this step.")
else:
    print("Engagement Metric Summary:\n", y.describe())

# Define features
feature_columns = ['org_count', 'person_count', 'gpe_count', 'article_length', 'sentiment']
X = data.loc[:, feature_columns]
print("Feature Set (First 5 Rows):\n", X.head())

# Visualization: bar chart of entity frequencies.
# This chart uses only the NER count columns, so it is drawn regardless of
# whether an engagement metric exists. Nesting it under the `y is not None`
# check meant a dataset without that column produced no chart at all.
entity_totals = data[['org_count', 'person_count', 'gpe_count']].sum()
print("Entity Totals:\n", entity_totals)
fig, ax = plt.subplots()
entity_totals.plot(kind='bar', title='Entity Frequency in Articles', ax=ax)
ax.set_xlabel('Entity type')
ax.set_ylabel('Total count')
plt.show()

# Model Training and Evaluation (if engagement metric is available)
if y is not None:
    # Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
    print("R2 Score:", r2_score(y_test, y_pred))

    # Scatter plot: Sentiment vs Engagement
    fig, ax = plt.subplots()
    ax.scatter(data['sentiment'], y)
    ax.set_title('Sentiment vs Engagement')
    ax.set_xlabel('Sentiment Score')
    ax.set_ylabel('Engagement Metric')
    plt.show()

    # Heatmap of Feature Correlation (features plus the target column)
    correlation_matrix = data[feature_columns + ['engagement_metric']].corr()
    print("Feature Correlation Matrix:\n", correlation_matrix)
    fig, ax = plt.subplots()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
    ax.set_title('Feature Correlation')
    plt.show()
else:
    print("Engagement metric is not available. Model training and evaluation skipped.")


Dataset Columns: Index(['id', 'news_url', 'title', 'tweet_ids'], dtype='object')
First 5 Rows of Dataset:
                      id                                           news_url  \
0  gossipcop-2493749932  www.dailymail.co.uk/tvshowbiz/article-5874213/...   
1  gossipcop-4580247171  hollywoodlife.com/2018/05/05/paris-jackson-car...   
2   gossipcop-941805037  variety.com/2017/biz/news/tax-march-donald-tru...   
3  gossipcop-2547891536  www.dailymail.co.uk/femail/article-3499192/Do-...   
4  gossipcop-5476631226  variety.com/2018/film/news/list-2018-oscar-nom...   

                                               title  \
0  Did Miley Cyrus and Liam Hemsworth secretly ge...   
1  Paris Jackson & Cara Delevingne Enjoy Night Ou...   
2  Celebrities Join Tax March in Protest of Donal...   
3  Cindy Crawford's daughter Kaia Gerber wears a ...   
4      Full List of 2018 Oscar Nominations – Variety   

                                           tweet_ids  
0  284329075902926848\t284332744