In [None]:
!pip install datasets

In [None]:
import pandas as pd
from datasets import Dataset

def safe_label_parse(label_string):
    return label_string.strip("[]").replace("'", "").split()

df = pd.read_csv('/content/drive/MyDrive/clean_labeled_zeroOne/train_label_dataset.csv')

df['labels'] = df['labels'].apply(safe_label_parse)

train_dataset = Dataset.from_pandas(df)

In [None]:
import pandas as pd
from datasets import Dataset
import ast
import re

def parse_list(s):
    s = s.strip("[] ")
    if ',' in s:
        return list(map(int, s.split(',')))
    else:
        return list(map(int, s.split()))

def parse_offset_mapping(s):

    matches = re.findall(r'array\(\[(\d+),\s*(\d+)\]\)', s)
    return [[int(a), int(b)] for a, b in matches]


df_tokenized = pd.read_csv('/content/drive/MyDrive/clean_labeled_zeroOne/train_tokenized_label_dataset_clean.csv')

#Drop unnecessary columns
columns_to_drop = ['full_date', 'min_fatalities', 'max_fatalities', 'countries']
df_tokenized = df_tokenized.drop(columns=[col for col in columns_to_drop if col in df_tokenized.columns])

#Clean up
for col in ['input_ids', 'attention_mask', 'token_type_ids', 'labels']:
    if col in df_tokenized.columns:
        df_tokenized[col] = df_tokenized[col].astype(str).str.replace('\n', ' ', regex=False)
        df_tokenized[col] = df_tokenized[col].apply(parse_list)

if 'offset_mapping' in df_tokenized.columns:
    df_tokenized['offset_mapping'] = df_tokenized['offset_mapping'].astype(str).str.replace('\n', ' ', regex=False)
    df_tokenized['offset_mapping'] = df_tokenized['offset_mapping'].apply(parse_offset_mapping)

df_tokenized = df_tokenized.reset_index(drop=True)

train_tokenized = Dataset.from_pandas(df_tokenized)

In [None]:
train_tokenized[0]

#Eval

In [None]:
import pandas as pd
from datasets import Dataset

def safe_label_parse(label_string):
    return label_string.strip("[]").replace("'", "").split()

df = pd.read_csv('/content/drive/MyDrive/clean_labeled_zeroOne/eval_label_dataset.csv')

df['labels'] = df['labels'].apply(safe_label_parse)

eval_dataset = Dataset.from_pandas(df)

In [None]:
eval_dataset[0]

In [None]:
import pandas as pd
from datasets import Dataset
import ast
import re

def parse_list(s):
    s = s.strip("[] ")
    if ',' in s:
        return list(map(int, s.split(',')))
    else:
        return list(map(int, s.split()))

def parse_offset_mapping(s):

    matches = re.findall(r'array\(\[(\d+),\s*(\d+)\]\)', s)
    return [[int(a), int(b)] for a, b in matches]


df_tokenized = pd.read_csv('/content/drive/MyDrive/clean_labeled_zeroOne/eval_tokenized_label_dataset_clean.csv')

#Drop unnecessary columns
columns_to_drop = ['full_date', 'min_fatalities', 'max_fatalities', 'countries']
df_tokenized = df_tokenized.drop(columns=[col for col in columns_to_drop if col in df_tokenized.columns])

#Clean up
for col in ['input_ids', 'attention_mask', 'token_type_ids', 'labels']:
    if col in df_tokenized.columns:
        df_tokenized[col] = df_tokenized[col].astype(str).str.replace('\n', ' ', regex=False)
        df_tokenized[col] = df_tokenized[col].apply(parse_list)

if 'offset_mapping' in df_tokenized.columns:
    df_tokenized['offset_mapping'] = df_tokenized['offset_mapping'].astype(str).str.replace('\n', ' ', regex=False)
    df_tokenized['offset_mapping'] = df_tokenized['offset_mapping'].apply(parse_offset_mapping)

df_tokenized = df_tokenized.reset_index(drop=True)

eval_tokenized = Dataset.from_pandas(df_tokenized)

#Training

In [None]:
!pip install transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

In [None]:
!pip install seqeval

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import torch
from seqeval.metrics import classification_report
import numpy as np
import pandas as pd
import re
from datetime import datetime
import spacy
id2label

nlp = spacy.load("en_core_web_sm")

label_list = [
    "O",         # Outside of any entity
    "B-DATE",    # Beginning of date
    "I-DATE",    # Inside of date
    "B-MIN_FAT", # Beginning of min fatalities
    "I-MIN_FAT", # Inside of min fatalities
    "B-MAX_FAT", # Beginning of max fatalities
    "I-MAX_FAT", # Inside of max fatalities
    "B-COUNTRY", # Beginning of country
    "I-COUNTRY"  # Inside of country
]

label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

# Tokenizer- DistilBERT for efficiency
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

#tokenize and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["text"], truncation=True, padding='max_length', max_length=512, return_offsets_mapping=True)
    labels = []

    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(label2id[label[word_idx]] if label[word_idx].startswith("I") else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
id2label

{0: 'O',
 1: 'B-DATE',
 2: 'I-DATE',
 3: 'B-MIN_FAT',
 4: 'I-MIN_FAT',
 5: 'B-MAX_FAT',
 6: 'I-MAX_FAT',
 7: 'B-COUNTRY',
 8: 'I-COUNTRY'}

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Initialize model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# Trainer with both train and eval datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=eval_tokenized,
    tokenizer=tokenizer,
)

trainer.train()

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0215,0.016866
2,0.0133,0.012267
3,0.0137,0.011554


TrainOutput(global_step=8088, training_loss=0.018810446579543585, metrics={'train_runtime': 6658.3187, 'train_samples_per_second': 19.435, 'train_steps_per_second': 1.215, 'total_flos': 1.6908912316348416e+16, 'train_loss': 0.018810446579543585, 'epoch': 3.0})

In [None]:
import pandas as pd
test_df = pd.read_csv("/content/drive/MyDrive/test_df_for_new.csv")

In [None]:
!pip install word2number

In [None]:
import re
from dateutil import parser
from word2number import w2n

def convert_number_to_words(number):
    try:
        return w2n.word_to_num(str(number))
    except:
        return None

def find_all_spans(text, target):
    matches = [m for m in re.finditer(re.escape(target.lower()), text.lower())]
    return [(m.start(), m.end()) for m in matches]

def prepare_labels(text, date, min_fat, max_fat, countries):
    doc = nlp(text)
    labels = ["O"] * len(doc)

    def match_and_tag(value, tag):
        for span_start, span_end in find_all_spans(text, value):
            span = doc.char_span(span_start, span_end)
            if span:
                labels[span.start] = f"B-{tag}"
                for i in range(span.start + 1, span.end):
                    labels[i] = f"I-{tag}"

    # --- DATE tagging ---
    try:
        parsed = datetime.strptime(date, "%Y-%m-%d")
        date_variants = [
            parsed.strftime("%B %d, %Y"),
            parsed.strftime("%b %d, %Y"),
            parsed.strftime("%d %B %Y"),
            parsed.strftime("%d %b %Y"),
            parsed.strftime("%B %d %Y"),
            parsed.strftime("%b %d %Y"),
            parsed.strftime("%Y-%m-%d"),
            parsed.strftime("%Y%m%d"),
            parsed.strftime("%d%B%Y"),
            parsed.strftime("%d%b%Y"),
        ]
        for date_str in date_variants:
            match_and_tag(date_str, "DATE")
    except Exception as e:
        pass


    for number, tag in [(min_fat, "MIN_FAT"), (max_fat, "MAX_FAT")]:
        if pd.isna(number):
            continue
        str_num = str(int(number))
        word_num = None
        try:
            word_num = w2n.word_to_num(str_num)
        except:
            pass


        match_and_tag(str_num, tag)
        try:
            word = list(w2n.american_number_system.keys())[int(str_num)]
            match_and_tag(word, tag)
        except:
            pass

    if countries and isinstance(countries, str):
        for country in countries.split(","):
            country = country.strip()
            if country:
                match_and_tag(country, "COUNTRY")

    return labels

#Scores

In [None]:
from seqeval.metrics import classification_report, f1_score

def evaluate_model(test_df, model, tokenizer):
    """Comprehensive evaluation of the trained model"""

    test_data = []
    for _, row in test_df.iterrows():
        labels = prepare_labels(
            row['cleaned_text'],
            row['full_date'],
            row['min_fatalities'],
            row['max_fatalities'],
            row['countries']
        )
        test_data.append({
            "text": row['cleaned_text'],
            "labels": labels
        })

    test_dataset = Dataset.from_pandas(pd.DataFrame(test_data))
    test_tokenized = test_dataset.map(tokenize_and_align_labels, batched=True)

    predictions = trainer.predict(test_tokenized)
    preds = np.argmax(predictions.predictions, axis=2)

    true_labels = [[id2label[l] for l in label if l != -100] for label in test_tokenized["labels"]]
    pred_labels = [[id2label[p] for (p, l) in zip(prediction, label) if l != -100]
                  for prediction, label in zip(preds, test_tokenized["labels"])]


    print("Classification Report:")
    print(classification_report(true_labels, pred_labels))


    overall_f1 = f1_score(true_labels, pred_labels)
    print(f"\nOverall F1 Score: {overall_f1:.4f}")

    return true_labels, pred_labels


true_labels, pred_labels = evaluate_model(test_df, model, tokenizer)

Map:   0%|          | 0/10784 [00:00<?, ? examples/s]

Classification Report:
              precision    recall  f1-score   support

     COUNTRY       0.95      0.97      0.96     51159
        DATE       1.00      1.00      1.00     18587
     MAX_FAT       0.84      0.85      0.84      7401
     MIN_FAT       0.78      0.83      0.81      4740

   micro avg       0.94      0.96      0.95     81887
   macro avg       0.89      0.91      0.90     81887
weighted avg       0.94      0.96      0.95     81887


Overall F1 Score: 0.9498


In [None]:
model.save_pretrained("./conflict_info_extractor")
tokenizer.save_pretrained("./conflict_info_extractor")

('./conflict_info_extractor/tokenizer_config.json',
 './conflict_info_extractor/special_tokens_map.json',
 './conflict_info_extractor/vocab.txt',
 './conflict_info_extractor/added_tokens.json',
 './conflict_info_extractor/tokenizer.json')

#Manual Testing

In [None]:
def clean_text(text):
    """Basic text cleaning function"""
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
import spacy
from transformers import pipeline

class ConflictInfoExtractor:
    def __init__(self, model_path="./conflict_info_extractor"):
        self.nlp = spacy.load("en_core_web_sm")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForTokenClassification.from_pretrained(model_path)
        self.classifier = pipeline(
            "token-classification",
            model=self.model,
            tokenizer=self.tokenizer,
            aggregation_strategy="simple"
        )

    def extract_info(self, text):
        """Clean and process text for extraction"""

        text = clean_text(text)

        # Get model predictions
        entities = self.classifier(text)

        # Process entities into structured format
        result = {
            "date": "",
            "min_fatalities": "",
            "max_fatalities": "",
            "countries": []
        }

        for entity in entities:
            if entity["entity_group"] == "DATE":
                result["date"] = self._format_date(entity["word"])
            elif entity["entity_group"] == "MIN_FAT":
                result["min_fatalities"] = entity["word"]
            elif entity["entity_group"] == "MAX_FAT":
                result["max_fatalities"] = entity["word"]
            elif entity["entity_group"] == "COUNTRY":
                result["countries"].append(entity["word"])

        return result

    def _format_date(self, date_str):
        """Standardize date formats"""
        try:
            for fmt in ["%B %d, %Y", "%Y-%m-%d", "%m/%d/%Y", "%d-%m-%Y"]:
                try:
                    dt = datetime.strptime(date_str, fmt)
                    return dt.strftime("%Y-%m-%d")
                except ValueError:
                    continue
            return date_str
        except:
            return date_str


extractor = ConflictInfoExtractor()

sample_text = "20051005 0013-05october 2005 ln np1.txt-files.list suicide blast hits canadian convoy afghanistan , child killed afg-can october 5 , 2005 wednesday 1123 gmt news source c agence france presse english svm score 2.764 suicide bomber pick-up truck blew near canadian military convoy volatile southern afghanistan wednesday , killing afghan child , provincial governor said . second suicide attack week insurgency-hit afghanistan , raising fears rebels influenced iraq-style tactics foreign targets . canadian force said three soldiers lightly wounded attack three-jeep convoy outskirts southern city kandahar , birthplace ousted taliban regime . kandahar province governor asadullah khalid told reporters attack targeted canadians . suicide attack one afghan boy killed , man wounded course suicide bomber also killed , told reporters . canadians part military civilian reconstruction team based kandahar since early august , teams publicity officer captain francois giroux told afp . morning convoy hit vehicle-borne explosive improvised device detonated suicide bomber . three canadian soldiers superficial injuries , mostly minor burns muscle-ache , said . told civilian fatalities details , said . giroux said one jeeps damaged attack , first reconstruction team since deployment kandahar . knew came environment hostile soldiers well-trained know job , giroux said . although suicide bombings rare afghanistan , last week ago man motorcycle blew crowd soldiers knocked duty base kabul . eight soldiers civilian killed . kabul september , two canadian soldiers suffered minor injuries blast hit jeep patrol . kandahar powerbase taliban regime toppled late 2001 waging insurgency us-backed government president hamid karzai . province one several southern eastern afghanistan seen worst increasingly deadly insurgency 1,300 people killed year , 850 last year . 250 canadians kandahar reconstruction team , giroux said . soldiers , also surgeons police officers present . teams , officially known provincial reconstruction teams , set across afghanistan build ties local officials help reconstruction war-torn region . canadian soldiers also part 10,500-strong nato force helping maintain security afghanistans capital kabul northern western parts country . str-brdkth"
print(extractor.extract_info(sample_text))

Device set to use cuda:0


{'date': '##100', 'min_fatalities': 'one', 'max_fatalities': '1300', 'countries': ['afghanistan', 'afghanistan', 'afghanistan', 'afghanistan', 'afghanistan', 'afghanistan']}


In [None]:
sample_text = "afg-irn afg-irn afg-irn afg-irn afg-irn afg-irn afg-irn afg-irn 20050505 0021-may05 2005 ln np1.txt-files.list taleban , afghan government casualty figures differ clash south afg-irn may 5 , 2005 , thursday news source c bbc monitoring south asia - political svm score 3.967 supplied bbc worldwide monitoring source voice islamic republic iran , mashhad , dari 1330 gmt , 5 may 05 excerpt report iranian radio mashhad 5 may spokesman international peacekeeping forces afghanistan says 44 people killed clash kandahar province last night . spokesman told reporter people killed affiliated taleban . coalition forces afghanistan said yesterday 20 insurgents one private killed military operation daichupan area zabol province previous day . added six us soldiers five afghan privates sustained injuries . meantime , spokesman taleban , mofti latifollah hakimi , said five taleban fighters killed clash , addition 16 civilians , including number women children . hakimi claimed eight foreign soldiers 15 afghan privates killed operation . spokesman governor zabol province said found bodies three foreigners , including two chechens one pakistani . correspondent kandahar also reported gunmen killed nine soldiers national army injured three others . passage omitted repetition details correspondent kandahar"
print(extractor.extract_info(sample_text))

{'date': '##50', 'min_fatalities': '5', 'max_fatalities': '44', 'countries': ['iran', 'afghanistan', 'afghanistan']}


In [None]:
sample_text = "20090214 0017-feb14 2009 ln np1.txt-files.list australia 5 children killed afghan battle afg-aul february 14 , 2009 saturday 138 gmt news source c associated press worldstream svm score 2.836 gunfight australian forces taliban fighters southern afghanistan killed five children caught crossfire , australian defense ministry said . afghan officials gave lower death tolls . asadullah hamdan , provincial governor , said friday three children 7 10 years old killed . fighting southern uruzgan province started raid international afghan troops compounds village insurgent leaders believed holed , nato said statement . australian defense ministry said reports five children killed four people wounded two children . provincial police chief gen. juma gul himat said reports four children killed . conflicting death tolls could resolved . one insurgent also killed , australian ministry statement said . australian troops wounded . total 1,162 civilians killed insurgency-related incidents 2008 , according ap casualty count 368 foreign afghan troops 768 taliban . another 26 caught crossfire . deaths thursday came newly appointed u.s. envoy region toured afghanistan . envoy richard holbrooke first visit country since appointed president barack obama define new strategy combat taliban afghanistan pakistan . friday , discussed counterterrorism strategy countrys two vice presidents pledged continued u.s. support , state television reported . holbrooke also expected meet president hamid karzai . karzai repeatedly warned western forces need prevent civilian deaths lose support afghan people ."
print(extractor.extract_info(sample_text))

{'date': 'february 14 2009', 'min_fatalities': 'one', 'max_fatalities': '##2', 'countries': ['australia', 'afghanistan', 'afghanistan', 'afghanistan', 'pakistan']}


In [None]:
sample_text = "key cf27d73d 4b0e 46ea 85ca e2d8b96c6c60 collection users vjdorazio desktop mid mid5 mid 5 0 lexis nexis reports mid5_2014 bbc_2014 march 2014 17 20march_2014_ln_np1 txt headline karabakh denies involvement death azeri soldier contact line date 20140317 source bbc monitoring trans caucasus unit dateline dateline found googleddateline dateline found americandateline false byline byline found language english subject subject found organization organization found geographic geographic found loaddate march 18 2014 pubtype transcript countries united kingdom 1 azerbaijan 2 armenia 4 supplied bbc worldwide monitoring text report armenian internet news agency news march 17 nagornyy karabakh defence army denied involvement death azerbaijani soldier reported azeri media nagornyy karabakh defence army remains committed cease fire regime frontline units violate truce reports azerbaijan involvement nagornyy karabakh republic armed forces death azeri soldier disinformation nagornyy karabakh defence army press secretary senor hasratyan said azerbaijani media claim soldier elvin hasanov aged 20 killed near fizuli soldier posthumously awarded third degree medal valor outstanding military service exhibited performance combat bbcm note armenia azerbaijan locked conflict armenian populated region nagornyy karabakh came armenian control war ended ceasefire 1994 peace agreement countries signed cease fire violations common occasionally bringing casualties sides accuse source news yerevan armenian 1257gmt 17 mar 14 copyright 2014 british broadcasting corporation rights reserved"
print(extractor.extract_info(sample_text))

{'date': 'march 18 2014', 'min_fatalities': '', 'max_fatalities': '20', 'countries': ['azerbaijan', 'armenia', 'united kingdom']}


# Testing Using CSV

In [None]:
import spacy
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from datetime import datetime

class ConflictInfoExtractor:
    def __init__(self, model_path="./conflict_info_extractor"):
        self.nlp = spacy.load("en_core_web_sm")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForTokenClassification.from_pretrained(model_path)
        self.classifier = pipeline(
            "token-classification",
            model=self.model,
            tokenizer=self.tokenizer,
            aggregation_strategy="simple"
        )

    def clean_text(self, text):
        return text.replace("\n", " ").strip()

    def extract_info(self, text):
        text = self.clean_text(text)
        entities = self.classifier(text)

        result = {
            "date": "",
            "min_fatalities": "",
            "max_fatalities": "",
            "countries": []
        }

        for entity in entities:
            word = entity["word"].replace("#", "")
            if entity["entity_group"] == "DATE":
                result["date"] = self._format_date(word)
            elif entity["entity_group"] == "MIN_FAT":
                result["min_fatalities"] = word
            elif entity["entity_group"] == "MAX_FAT":
                result["max_fatalities"] = word
            elif entity["entity_group"] == "COUNTRY":
                result["countries"].append(word.lower())

        result["countries"] = list(set(result["countries"]))

        return result

    def _format_date(self, date_str):
        for fmt in ["%B %d, %Y", "%Y-%m-%d", "%m/%d/%Y", "%d-%m-%Y"]:
            try:
                dt = datetime.strptime(date_str, fmt)
                return dt.strftime("%Y-%m-%d")
            except ValueError:
                continue
        return date_str

    def process_csv(self, csv_path, text_column="text"):
        df = pd.read_csv(csv_path)
        df[text_column] = df[text_column].fillna("").astype(str)

        results = []
        for text in df[text_column]:
            info = self.extract_info(text)
            results.append(info)

        return pd.DataFrame(results)

In [None]:
extractor = ConflictInfoExtractor()
df_results = extractor.process_csv("/content/drive/MyDrive/2005.csv", text_column="text")
print(df_results.head())

Device set to use cuda:0


   date min_fatalities max_fatalities  \
0  2005                                 
1    70                                 
2  2005                                 
3    70                          dead   
4  2005            two            600   

                                           countries  
0  [play, solomon islands, australia, hill, recen...  
1  [would, australia, afghanistan, ., quite, ,, s...  
2  [australia, afghanistan, quite, howard, taken,...  
3  [s, australia, howard, 900, recent, law, troop...  
4                     [australia, afghanistan, iraq]  


In [None]:
df_results.shape

(8180, 4)

In [None]:
df_results.to_csv("/content/drive/MyDrive/test_results", index=False)

#More Cleaning

In [None]:
df2 = pd.read_csv("/content/drive/MyDrive/states2016.csv")

In [None]:
statename_column = df2['statenme']
statename_column

Unnamed: 0,statenme
0,United States of America
1,Canada
2,Bahamas
3,Cuba
4,Cuba
...,...
238,Nauru
239,Marshall Islands
240,Palau
241,Federated States of Micronesia


In [None]:
from word2number import w2n

class ConflictInfoExtractor:
    def __init__(self, model_path="./conflict_info_extractor", valid_states=None):
        self.nlp = spacy.load("en_core_web_sm")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForTokenClassification.from_pretrained(model_path)
        self.classifier = pipeline(
            "token-classification",
            model=self.model,
            tokenizer=self.tokenizer,
            aggregation_strategy="simple"
        )
        self.valid_states = set(state.lower() for state in valid_states) if valid_states else set()

    def clean_text(self, text):
        return str(text).replace("\n", " ").strip()

    def _format_date(self, date_str):
        for fmt in ["%B %d, %Y", "%Y-%m-%d", "%m/%d/%Y", "%d-%m-%Y"]:
            try:
                dt = datetime.strptime(date_str, fmt)
                return dt.strftime("%Y-%m-%d")
            except ValueError:
                continue
        return date_str

    def _convert_to_number(self, word):
        try:

            return str(w2n.word_to_num(word))
        except:

            if word.isdigit():
                return word
            return ""


    def process_csv(self, csv_path, text_column="text"):
        df = pd.read_csv(csv_path)
        df[text_column] = df[text_column].fillna("").astype(str)

        results = []
        for text in df[text_column]:
            info = self.extract_info(text)
            results.append(info)

        return pd.DataFrame(results)

    def extract_info(self, text):
        text = self.clean_text(text)
        entities = self.classifier(text)

        result = {
            "date": "",
            "min_fatalities": "",
            "max_fatalities": "",
            "countries": []
        }
        date_parts = []

        for entity in entities:
            word = entity["word"].replace("#", "").strip(" ,.-").lower()
            if not word or word in [',', '-', '.', '']:
                continue

            if entity["entity_group"] == "DATE":
                date_parts.append(word)
            elif entity["entity_group"] == "MIN_FAT":
                result["min_fatalities"] = self._convert_to_number(word)
            elif entity["entity_group"] == "MAX_FAT":
                result["max_fatalities"] = self._convert_to_number(word)
            elif entity["entity_group"] == "COUNTRY" and word in self.valid_states:
                result["countries"].append(word)


        full_date = " ".join(date_parts).strip()
        result["date"] = self._format_date(full_date)


        result["countries"] = sorted(set(result["countries"]))
        result["min_fatalities"] = result["min_fatalities"] if result["min_fatalities"] else "0"
        result["max_fatalities"] = result["max_fatalities"] if result["max_fatalities"] else "0"
        return result


In [None]:
valid_states = df2['statenme'].dropna().str.lower().tolist()

extractor = ConflictInfoExtractor(valid_states=valid_states)

df_results = extractor.process_csv("/content/drive/MyDrive/2005.csv", text_column="text")

#cleaned version
df_results["countries"] = df_results["countries"].apply(lambda x: "; ".join(x))
df_results.to_csv("cleaned_conflict_info3.csv", index=False)

Device set to use cuda:0
