<a href="https://colab.research.google.com/github/Aritra026/Python/blob/main/Task1_NER_Aritra.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
file_path = '/content/drive/MyDrive/Colab Notebooks/data.zip'
df = pd.read_csv(file_path)


In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,ID,Content,Summary,Dataset
0,0,f49ee725a0360aa6881ed1f7999cc531885dd06a,New York police are concerned drones could bec...,Police have investigated criminals who have ri...,CNN/Daily Mail
1,1,808fe317a53fbd3130c9b7563341a7eea6d15e94,By . Ryan Lipman . Perhaps Australian porn sta...,Porn star Angela White secretly filmed sex act...,CNN/Daily Mail
2,2,98fd67bd343e58bc4e275bbb5a4ea454ec827c0d,"This was, Sergio Garcia conceded, much like be...",American draws inspiration from fellow country...,CNN/Daily Mail
3,3,e12b5bd7056287049d9ec98e41dbb287bd19a981,An Ebola outbreak that began in Guinea four mo...,World Health Organisation: 635 infections and ...,CNN/Daily Mail
4,4,b83e8bcfcd51419849160e789b6658b21a9aedcd,By . Associated Press and Daily Mail Reporter ...,A sinkhole opened up at 5:15am this morning in...,CNN/Daily Mail


In [3]:
import pandas as pd
import numpy as np
import spacy
import re
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load SpaCy model
nlp = spacy.load('en_core_web_sm')

class NewsAnalyzer:
    def __init__(self):
        self.scaler = StandardScaler()
        self.raw_features = None

    def process_line(self, line):
        """Process a single line of text data"""
        parts = line.strip().split('...')
        if len(parts) >= 2:
            return {
                'text': parts[1].strip(),
                'id': parts[0].strip()
            }
        return None

    def load_data(self, text_data):
        """Load data from text input"""
        articles = []
        for line in text_data.split('**'):
            if line.strip():
                article = self.process_line(line)
                if article:
                    articles.append(article)
        return pd.DataFrame(articles)

    def extract_features(self, text):
        """Extract features from text"""
        doc = nlp(text)

        # Entity counts
        entities = {
            'person': 0, 'org': 0, 'gpe': 0,
            'date': 0, 'product': 0, 'event': 0
        }

        # Keep track of unique entities
        unique_entities = {k: set() for k in entities.keys()}

        for ent in doc.ents:
            ent_type = ent.label_.lower()
            if ent_type in entities:
                entities[ent_type] += 1
                unique_entities[ent_type].add(ent.text.lower())

        # Sentiment
        blob = TextBlob(text)

        # Create feature dictionary
        features = {
            'length': len(text.split()),
            'sentiment': blob.sentiment.polarity,
            'subjectivity': blob.sentiment.subjectivity
        }
        features.update({f'entity_{k}': v for k, v in entities.items()})
        features.update({f'unique_{k}': len(v) for k, v in unique_entities.items()})

        return features

    def prepare_data(self, df):
        """Prepare features for modeling"""
        features_list = []

        for text in df['text']:
            features = self.extract_features(text)
            features_list.append(features)

        # Store raw features
        self.raw_features = pd.DataFrame(features_list)

        # Scale features
        scaled_features = self.scaler.fit_transform(self.raw_features)
        return pd.DataFrame(scaled_features, columns=self.raw_features.columns)

    def create_visualizations(self, features_df):
        """Create and save all visualizations"""
        # Use raw features for more interpretable visualizations
        raw_df = self.raw_features

        # 1. Entity Distribution
        plt.figure(figsize=(12, 6))
        entity_cols = [col for col in raw_df.columns if col.startswith('entity_')]
        entity_means = raw_df[entity_cols].mean().sort_values(ascending=False)

        sns.barplot(x=entity_means.index, y=entity_means.values)
        plt.title('Average Named Entities per Article')
        plt.xlabel('Entity Type')
        plt.ylabel('Average Count')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig('entity_distribution.png')
        plt.close()

        # 2. Entity Boxplot
        plt.figure(figsize=(12, 6))
        sns.boxplot(data=raw_df[entity_cols])
        plt.title('Distribution of Named Entities')
        plt.xlabel('Entity Type')
        plt.ylabel('Count')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig('entity_boxplot.png')
        plt.close()

        # 3. Sentiment Distribution
        plt.figure(figsize=(12, 5))

        plt.subplot(1, 2, 1)
        sns.histplot(raw_df['sentiment'], bins=30)
        plt.title('Distribution of Sentiment')
        plt.xlabel('Sentiment Score')

        plt.subplot(1, 2, 2)
        sns.histplot(raw_df['subjectivity'], bins=30)
        plt.title('Distribution of Subjectivity')
        plt.xlabel('Subjectivity Score')

        plt.tight_layout()
        plt.savefig('sentiment_distribution.png')
        plt.close()

        # 4. Correlation Heatmap
        plt.figure(figsize=(12, 10))
        sns.heatmap(raw_df.corr(), annot=True, cmap='coolwarm', center=0, fmt='.2f')
        plt.title('Feature Correlations')
        plt.tight_layout()
        plt.savefig('correlation_heatmap.png')
        plt.close()

def main(text_data):
    """Main execution function"""
    try:
        # Initialize analyzer
        analyzer = NewsAnalyzer()

        # Load and process data
        print("Loading data...")
        df = analyzer.load_data(text_data)
        print(f"Loaded {len(df)} articles")

        # Prepare features
        print("Extracting features...")
        features_df = analyzer.prepare_data(df)
        print(f"Generated {features_df.shape[1]} features")

        # Create visualizations
        print("Creating visualizations...")
        analyzer.create_visualizations(features_df)

        # Print raw feature statistics
        print("\nRaw Feature Statistics:")
        print(analyzer.raw_features.describe())

        # Entity analysis
        entity_cols = [col for col in analyzer.raw_features.columns if col.startswith('entity_')]
        print("\nTotal Entity Counts:")
        total_entities = analyzer.raw_features[entity_cols].sum().sort_values(ascending=False)
        print(total_entities)

        print("\nMean Entities per Article:")
        mean_entities = analyzer.raw_features[entity_cols].mean().sort_values(ascending=False)
        print(mean_entities)

        print("\nAnalysis complete! Visualizations have been saved.")

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

    return features_df

if __name__ == "__main__":
    with open(file_path, 'r', encoding='latin1') as file:
        dataset = file.read()

    features_df = main(dataset)

Loading data...
Loaded 897 articles
Extracting features...
Generated 15 features
Creating visualizations...

Raw Feature Statistics:
             length   sentiment  subjectivity  entity_person  entity_org  \
count    897.000000  897.000000    897.000000     897.000000  897.000000   
mean    2763.373467    0.155610      0.667278     309.880713  117.094760   
std     2696.138381    0.526423      0.468447     302.660842  115.539085   
min        3.000000   -1.000000      0.000000       0.000000    0.000000   
25%      786.000000    0.000000      0.000000      88.000000   35.000000   
50%     1968.000000    0.000000      1.000000     218.000000   82.000000   
75%     3884.000000    0.500000      1.000000     441.000000  162.000000   
max    22592.000000    1.000000      1.000000    2514.000000  982.000000   

       entity_gpe  entity_date  entity_product  entity_event  unique_person  \
count  897.000000   897.000000      897.000000    897.000000     897.000000   
mean    87.008919    17.