In [3]:
import re
import nltk
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


path = r'C:\Users\DEVA NANTHAN\Documents\ano\Articles.csv'
df = pd.read_csv(path, encoding='latin1')
df = df.drop_duplicates(subset=['Article','Heading'])
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df = df.dropna(subset=['Article']).reset_index(drop=True)

[nltk_data] Downloading package punkt to C:\Users\DEVA
[nltk_data]     NANTHAN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\DEVA
[nltk_data]     NANTHAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\DEVA
[nltk_data]     NANTHAN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
df.isna().sum()

Article     0
Date        0
Heading     0
NewsType    0
dtype: int64

In [5]:
import os, re, warnings
warnings.filterwarnings('ignore')

In [6]:
path = r"C:\Users\DEVA NANTHAN\Documents\ano\Articles.csv"
df = pd.read_csv(path, encoding='latin1')


df = df.drop_duplicates(subset=['Article','Heading'])
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df = df.dropna(subset=['Article']).reset_index(drop=True)

In [7]:
import spacy
nlp = spacy.load("en_core_web_sm")

def extract_locations(text):
    doc = nlp(str(text))
    return list({ent.text for ent in doc.ents if ent.label_ == "GPE"})

df['location'] = df['Article'].apply(extract_locations)

In [8]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub('[^a-zA-Z]', ' ', str(text))
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and len(w) > 1]
    return ' '.join(tokens)

df['Clean_Article'] = df['Article'].apply(clean_text)
df['word_count'] = df['Clean_Article'].apply(lambda x: len(x.split()))
df['year']  = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['day']   = df['Date'].dt.day
df['dayofweek'] = df['Date'].dt.dayofweek

In [9]:
from transformers import pipeline

sentiment_model = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
def get_sentiment(text):
    text = str(text)[:512]   
    result = sentiment_model(text)[0]
    return pd.Series([result['label'], result['score']])


df[['sentiment', 'sentiment_score']] = df['Article'].apply(get_sentiment)
label_map = {'LABEL_0': 'Negative', 'LABEL_1': 'Neutral', 'LABEL_2': 'Positive'}
df['sentiment'] = df['sentiment'].map(label_map)




Device set to use cpu


In [10]:
# 6. Sentence Embeddings

from sentence_transformers import SentenceTransformer

embed_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embed_model.encode(df['Clean_Article'].tolist(), show_progress_bar=True, convert_to_numpy=True)

print("Embeddings shape:", embeddings.shape)

Batches:   0%|          | 0/81 [00:00<?, ?it/s]

Embeddings shape: (2585, 384)


In [11]:
from bertopic import BERTopic
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=False)
topics, probs = topic_model.fit_transform(df['Clean_Article'].tolist(), embeddings)
df['Topic'] = topics
df['topic_probabilities'] = [None if p is None else p.tolist() for p in probs]

In [12]:
topic_info = topic_model.get_topic_info()
top_topic_map = {}
for t in topic_info.Topic:
    if t == -1:
        top_topic_map[-1] = "Outlier"
    else:
        words = topic_model.get_topic(t)
        label = " ".join([w for w, _ in words[:3]]) if words else f"Topic_{t}"
        top_topic_map[t] = label
df['topic_label'] = df['Topic'].map(top_topic_map)

In [13]:
NEWS_KEYWORDS = {
    "crime": ["murder", "robbery", "police", "arrest", "crime", "stabbing", "killed", "shooting", "investigation"],
    "festival": ["festival", "celebration", "celebrate", "parade", "fete", "garland", "pandal", "ceremony"],
    "business": ["company", "business", "market", "shares", "stock", "acquisition", "revenue", "profit"],
    "sports": ["match", "score", "tournament", "league", "goal", "cricket", "football", "olympics", "player"],
    "politics": ["election", "minister", "government", "policy", "parliament", "mp", "politics", "campaign"],
    "technology": ["tech", "startup", "ai", "machine learning", "software", "app", "gadget", "device"],
}

def auto_classify_newstype(text):
    txt = str(text).lower()
    counts = {cat: sum(1 for kw in kws if kw in txt) for cat, kws in NEWS_KEYWORDS.items()}
    counts = {k:v for k,v in counts.items() if v>0}
    if counts:
        return sorted(counts.items(), key=lambda x: (-x[1], x[0]))[0][0]
    return "other"

df['NewsType_auto'] = df['Clean_Article'].apply(auto_classify_newstype)
df['NewsType_final'] = df.apply(lambda r: r['NewsType_auto'], axis=1)

In [14]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()
X_scaled = scaler.fit_transform(embeddings)


iso = IsolationForest(n_estimators=200, contamination=0.05, random_state=42)
iso.fit(X_scaled)
anomaly_pred = iso.predict(X_scaled)
anomaly_score = iso.decision_function(X_scaled)

df['anomaly_pred'] = anomaly_pred
df['anomaly_score'] = anomaly_score
df['is_anomaly'] = df['anomaly_pred'] == -1

In [None]:
import pandas as pd
import numpy as np
import joblib
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

df = pd.read_csv(r'C:\Users\DEVA NANTHAN\Documents\ano\out.csv', encoding='latin1')


loc_encoder = LabelEncoder()
df['loc_encoded'] = loc_encoder.fit_transform(df['main_location'])


embed_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embed_model.encode(df['Clean_Article'].astype(str).tolist())

# Combine embeddings + location
X = np.concatenate([embeddings, df['loc_encoded'].values.reshape(-1, 1)], axis=1)
y = df['is_anomaly']  


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
clf = XGBClassifier()
clf.fit(X_train, y_train)

# Save models
joblib.dump(scaler, "scaler1.pkl")
joblib.dump(clf, "XGBClassifier1.pkl")
joblib.dump(loc_encoder, "LabelEncoder1.pkl")

['LabelEncoder1.pkl']

In [16]:
df

Unnamed: 0,Article,Date,Heading,NewsType,location,Clean_Article,word_count,year,month,day,...,NewsType_auto,NewsType_final,anomaly_pred,anomaly_score,is_anomaly,main_location,loc_encoded,loc_pred,loc_confidence,loc_pred_name
0,KARACHI: The Sindh government has decided to b...,2015-01-01,sindh govt decides to cut public transport far...,business,"['Karachi', 'Sindh']",karachi sindh government decided bring public ...,72,2015,1,1,...,politics,politics,1,0.020557,False,Karachi,174,174,0.985730,Karachi
1,HONG KONG: Asian markets started 2015 on an up...,2015-01-02,asia stocks up in new year trad,business,"['Hong Kong', 'Taiwan', 'Jakarta', 'Beijing', ...",hong kong asian market started upswing limited...,475,2015,1,2,...,business,business,1,0.042603,False,Hong Kong,140,140,0.995371,Hong Kong
2,HONG KONG: Hong Kong shares opened 0.66 perce...,2015-01-05,hong kong stocks open 0.66 percent lower,business,"['Hong Kong', 'HONG KONG']",hong kong hong kong share opened percent lower...,26,2015,1,5,...,other,other,-1,-0.003981,True,Hong Kong,140,140,0.996120,Hong Kong
3,HONG KONG: Asian markets tumbled Tuesday follo...,2015-01-06,asian stocks sink euro near nine year,business,"['Hong Kong', 'Milan', 'China', 'Greece', 'Tok...",hong kong asian market tumbled tuesday followi...,328,2015,1,6,...,politics,politics,1,0.022472,False,Hong Kong,140,140,0.995535,Hong Kong
4,NEW YORK: US oil prices Monday slipped below $...,2015-01-06,us oil prices slip below 50 a barr,business,"['NEW YORK', 'Brazil', 'Iraq', 'China', 'Russi...",new york u oil price monday slipped barrel fir...,399,2015,1,6,...,business,business,1,0.037136,False,NEW YORK,241,241,0.987267,NEW YORK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2580,strong>DUBAI: Dubai International Airport and ...,2017-03-25,Laptop ban hits Dubai for 11m weekend traveller,business,"['Emirates', 'the United Arab Emirates', 'Leba...",strong dubai dubai international airport flag ...,281,2017,3,25,...,technology,technology,1,0.012401,False,Emirates,98,98,0.026791,Emirates
2581,"strong>BEIJING: Former Prime Minister, Shaukat...",2017-03-26,Pak China relations not against any third coun...,business,"['China', 'Pakistan', 'Hainan province']",strong beijing former prime minister shaukat a...,95,2017,3,26,...,politics,politics,1,0.032582,False,China,72,72,0.994463,China
2582,strong>WASHINGTON: Uber has grounded its fleet...,2017-03-26,Uber grounds self driving cars after accid,business,"['Pittsburg', 'Montenegro', 'San Francisco', '...",strong washington uber grounded fleet self dri...,186,2017,3,26,...,crime,crime,-1,-0.010938,True,Pittsburg,286,286,0.033251,Pittsburg
2583,strong>BEIJING: The New Development Bank plans...,2017-03-27,New Development Bank plans joint investments i...,business,"['Brazil', 'China', 'Russia', 'India', 'Xiamen...",strong beijing new development bank plan co fi...,181,2017,3,27,...,politics,politics,1,0.027085,False,Brazil,56,56,0.969252,Brazil


In [17]:
from sklearn.metrics import classification_report, roc_auc_score, precision_score, recall_score, f1_score

In [19]:
df['anomaly_prob'] = -df['anomaly_score']

In [25]:
y_true = df['is_anomaly'].astype(int)
y_pred = (df['anomaly_prob'] > np.percentile(df['anomaly_prob'], 95)).astype(int)

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
auc = roc_auc_score(y_true, df['anomaly_prob'])

# Recall@K (Top 5%)
k = int(0.05 * len(df))
top_k = df.nlargest(k, 'anomaly_prob')
recall_at_k = top_k['is_anomaly'].mean()

print("\n Evaluation Metrics (Unsupervised Simulation):")
print(f"AUC: {auc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Recall@5%: {recall_at_k:.4f}")


 Evaluation Metrics (Unsupervised Simulation):
AUC: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
Recall@5%: 1.0000
