In [1]:
import pandas as pd
import numpy as np
import geocoder
import re
from datetime import datetime

In [2]:
# Read the raw events table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="World Important Dates.csv")

In [3]:
# Discard the 'Date' and 'Month' columns, which (per the original note)
# contain unknown data.
df = df.drop(['Date', 'Month'], axis="columns")

In [4]:
# Keep only rows whose 'Year' consists entirely of digits; any value with
# non-numeric characters is filtered out.
# The explicit .copy() makes the filtered result an independent DataFrame, so
# the column assignments performed on `df` in later cells modify it directly
# instead of raising SettingWithCopyWarning on a slice of the original frame.
df = df[df["Year"].astype(str).str.match(r"^\d+$")].copy()

In [5]:
# Print the filtered DataFrame as plain text to inspect the remaining rows.
print(df)

      Sl. No                      Name of Incident  Year Country  \
2          6  Establishment of the Delhi Sultanate  1206   India   
3          7                     Battle of Panipat  1526   India   
4          8          Establishment of British Raj  1858   India   
5          9                    Partition of India  1947   India   
6         10      IndoUnknownPakistani War of 1971  1971   India   
...      ...                                   ...   ...     ...   
1091    1147         First Mexican Empire Declared  1821  Mexico   
1092    1148                U.S.UnknownMexican War  1846  Mexico   
1093    1149                           Reform Wars  1857  Mexico   
1094    1150         French Intervention in Mexico  1862  Mexico   
1095    1151                    Mexican Revolution  1910  Mexico   

              Type of Event       Place Name  \
2                 Political            Delhi   
3                    Battle          Panipat   
4                  Colonial      Whole 

In [6]:
# 3. Geocode Locations based on Country
def get_coordinates(place_name):
    """Geocode ``place_name`` with ``geocoder.arcgis``.

    Parameters
    ----------
    place_name
        The value handed to ``geocoder.arcgis`` for lookup.

    Returns
    -------
    The ``latlng`` attribute of the geocoder result when it is truthy,
    otherwise the placeholder ``[None, None]``.
    """
    result = geocoder.arcgis(place_name)
    if result.latlng:
        return result.latlng
    # No usable coordinates came back; return a two-element placeholder so
    # callers can still unpack a latitude and a longitude.
    return [None, None]

# Geocode each distinct country once (one network request per unique name
# instead of one per row), then broadcast the result back onto every row.
coords_by_country = {
    country: get_coordinates(country)
    for country in df["Country"].dropna().unique()
}
# Rows with a missing country get the same [None, None] placeholder that
# get_coordinates returns for a failed lookup.
df[["Latitude", "Longitude"]] = pd.DataFrame(
    [coords_by_country.get(country, [None, None]) for country in df["Country"]],
    index=df.index,
    columns=["Latitude", "Longitude"],
)

In [7]:
# 4. Clean text data for the outcome, impact, incident name, event type, and responsible person/group columns
def clean_text(text):
    """Normalize a free-text cell to lowercase alphanumerics.

    Parameters
    ----------
    text
        The cell value to clean. Normally a string; missing values
        (``None``, ``NaN``, ``pd.NA``) and other scalars are also accepted.

    Returns
    -------
    str
        The lowercased text with every character other than letters, digits,
        ``.``, ``,`` and space removed, and with leading/trailing whitespace
        stripped. Missing values yield an empty string.
    """
    if not isinstance(text, str):
        # Missing cells reach .apply() as NaN/None; calling .lower() on them
        # would raise AttributeError, so map them to an empty string.
        if pd.isna(text):
            return ""
        text = str(text)
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9., ]", "", text)  # Remove special characters
    # Strip last: removing a leading/trailing special character can expose
    # whitespace that an earlier strip would have missed.
    return text.strip()

# Run clean_text over each free-text column, storing the result in a new
# "Cleaned ..." column next to the original one.
cleaned_from_raw = {
    "Cleaned Outcome": "Outcome",
    "Cleaned Impact": "Impact",
    "Cleaned Incident": "Name of Incident",
    "Cleaned Event": "Type of Event",
    "Cleaned Group": "Important Person/Group Responsible",
}
for cleaned_col, raw_col in cleaned_from_raw.items():
    df[cleaned_col] = df[raw_col].apply(clean_text)

In [8]:
# 5. Structure Data for ArangoDB
# Rename the source columns to the names used in the exported dataset.
column_aliases = {
    "Sl. No": "_key",  # Unique identifier
    "Country": "Location",
}
df.rename(columns=column_aliases, inplace=True)

# Columns to export, in output order.
export_columns = [
    "_key",
    "Cleaned Incident",
    "Year",
    "Cleaned Event",
    "Location",
    "Latitude",
    "Longitude",
    "Cleaned Impact",
    "Cleaned Group",
    "Cleaned Outcome",
]
data_for_arangodb = df[export_columns]

In [9]:
# Write the selected columns to CSV without the DataFrame index, then
# report completion.
data_for_arangodb.to_csv(path_or_buf="processed_dataset.csv", index=False)

print("Preprocessing completed. Data saved as 'processed_dataset.csv'")


Preprocessing completed. Data saved as 'processed_dataset.csv'
