<a href="https://colab.research.google.com/github/AnkitKumarIISERB/Hachathon-Ethos-/blob/main/production.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ===========================================
# CAMPUS ENTITY RESOLUTION & SECURITY SYSTEM
# Google Colab End-to-End Notebook
# ===========================================

# 1️⃣ SETUP & IMPORTS
# -------------------
!pip install fuzzywuzzy python-Levenshtein markovify scikit-learn networkx tqdm

import pandas as pd
import numpy as np
import os
import re
import networkx as nx
from fuzzywuzzy import fuzz, process
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm
import markovify
import joblib
import warnings
warnings.filterwarnings('ignore')

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting markovify
  Downloading markovify-0.9.4-py3-none-any.whl.metadata (23 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Collecting unidecode (from markovify)
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# 2️⃣ MOUNT DRIVE (IF USING DRIVE)
# --------------------------------
from google.colab import drive
drive.mount('/content/drive')

DATA_DIR = "/content/drive/MyDrive/campus_data"   # <-- place your 8 CSVs here


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# 3️⃣ COLUMN MAPPING (handles inconsistent naming)
# ------------------------------------------------
COLUMN_MAPPING = {
    'student_id': 'entity_id',
    'user_id': 'entity_id',
    'person_id': 'entity_id',
    'card_id': 'card_id',
    'face_id': 'face_id',
    'device_hash': 'device_id',
    'location_id': 'location_id',
    'loc_id': 'location_id',
    'timestamp': 'timestamp',
    'time': 'timestamp',
    'datetime': 'timestamp',
    'email_id': 'email',
    'mail': 'email',
    'full_name': 'name',
    'user_name': 'name'
}

def normalize_columns(df):
    df = df.rename(columns={c: COLUMN_MAPPING.get(c.lower(), c.lower()) for c in df.columns})
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
    return df

In [4]:

# 4️⃣ LOAD ALL CSVs
# -----------------
def load_csv(name):
    path = os.path.join(DATA_DIR, name)
    df = pd.read_csv(path)
    df = normalize_columns(df)
    df['source'] = name
    return df

files = [
    "card_swipes.csv",
    "cctv_frames.csv",
    "face_embeddings.csv",
    "free_text_notes.csv",
    "lab_bookings.csv",
    "library_checkouts.csv",
    "profiles.csv",
    "wifi_associations_logs.csv"
]

dataframes = {f: load_csv(f) for f in files}

In [5]:
# 5️⃣ BASIC CLEANING
# ------------------
for name, df in dataframes.items():
    df.drop_duplicates(inplace=True)
    if 'timestamp' in df.columns:
        df = df.sort_values('timestamp')
    dataframes[name] = df

In [6]:
print("Building entity graph for resolution...")

G = nx.Graph()

for name, df in tqdm(dataframes.items()):
    # Ensure no duplicate column names
    df.columns = df.columns.map(str)
    df = df.loc[:, ~df.columns.duplicated()]

    id_cols = ['entity_id', 'card_id', 'face_id', 'device_id', 'email']
    id_cols = [c for c in id_cols if c in df.columns]

    for _, row in df.iterrows():
        identifiers = []
        for col in id_cols:
            try:
                # Safely extract scalar value
                val = row[col]
                if isinstance(val, (list, dict, pd.Series)):
                    continue
                if pd.notna(val) and str(val).strip():
                    identifiers.append(str(val).strip())
            except Exception:
                continue  # skip invalid entries gracefully

        # Connect all IDs found in this record
        if len(identifiers) > 1:
            for i in range(len(identifiers)):
                for j in range(i + 1, len(identifiers)):
                    G.add_edge(identifiers[i], identifiers[j], source=name)

print(f"✅ Graph built with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")


Building entity graph for resolution...


100%|██████████| 8/8 [00:04<00:00,  1.64it/s]

✅ Graph built with 30860 nodes and 62000 edges





In [7]:
# 7️⃣ CREATE ENTITY GROUPS
# ===========================================

entity_groups = list(nx.connected_components(G))
print(f"Total connected components (entities): {len(entity_groups)}")

entity_map = {}
for i, group in enumerate(entity_groups):
    for node in group:
        entity_map[node] = f"E{i+1}"

Total connected components (entities): 4860


In [8]:
# 8️⃣ APPLY ENTITY MAPPING TO ALL DATAFRAMES
# ===========================================
import numpy as np

for name, df in dataframes.items():
    if 'resolved_entity' not in df.columns:
        df['resolved_entity'] = np.nan

    for key in ['entity_id', 'card_id', 'face_id', 'device_id', 'email']:
        if key in df.columns:
            # force Series (handle duplicate columns or bad read)
            col_data = df[key]
            if isinstance(col_data, pd.DataFrame):
                # if there are duplicate columns, take the first one
                col_data = col_data.iloc[:, 0]

            # ensure type conversion
            mapped_vals = col_data.astype(str).map(entity_map)

            # update resolved_entity only where it’s still NaN
            df.loc[df['resolved_entity'].isna(), 'resolved_entity'] = mapped_vals

    dataframes[name] = df

print("✅ Successfully mapped resolved_entity for all datasets.")


✅ Successfully mapped resolved_entity for all datasets.


In [9]:
# 9️⃣ MERGE ALL SOURCES INTO A MASTER DATAFRAME
# ===========================================

# ===========================================
# 🧩 Step: Clean column names and reset indexes before merging
# ===========================================
cleaned_dfs = []
for name, df in dataframes.items():
    # 1️⃣ Reset index to avoid duplicate index labels
    df = df.reset_index(drop=True)

    # 2️⃣ Remove duplicate column names (keep first)
    df = df.loc[:, ~df.columns.duplicated()].copy()

    # 3️⃣ Add source column to track origin
    df["source"] = name

    # 4️⃣ Standardize timestamp column if missing
    if "timestamp" not in df.columns:
        for alt in ["time", "date_time", "datetime"]:
            if alt in df.columns:
                df.rename(columns={alt: "timestamp"}, inplace=True)
                break

    cleaned_dfs.append(df)

# ===========================================
# 🧩 Step: Merge all cleaned DataFrames safely
# ===========================================
merged_df = pd.concat(cleaned_dfs, ignore_index=True, sort=False)

# ===========================================
# 🕒 Convert and sort timestamps
# ===========================================
merged_df["timestamp"] = pd.to_datetime(merged_df["timestamp"], errors="coerce")
merged_df = merged_df.sort_values(["resolved_entity", "timestamp"], ignore_index=True)

print("✅ All datasets successfully merged into merged_df")
print(f"Final merged shape: {merged_df.shape}")



merged_df['timestamp'] = pd.to_datetime(merged_df['timestamp'], errors='coerce')
merged_df = merged_df.sort_values(['resolved_entity', 'timestamp'])
merged_df.reset_index(drop=True, inplace=True)

print(f"✅ Unified dataset shape: {merged_df.shape}")
print(f"Unique resolved entities: {merged_df['resolved_entity'].nunique()}")


✅ All datasets successfully merged into merged_df
Final merged shape: (57973, 37)
✅ Unified dataset shape: (57973, 37)
Unique resolved entities: 4860


In [10]:
# 🔟 FUZZY NAME & EMAIL MATCHING
# ===========================================

from fuzzywuzzy import process

def fuzzy_clean_column(df, column):
    """Normalize a text column using fuzzy matching for near-duplicates."""
    if column not in df.columns:
        return df
    df[column] = df[column].fillna('').astype(str)
    unique_vals = [v for v in df[column].unique() if v and len(v) > 2]
    canonical = {}
    for val in unique_vals:
        if val not in canonical:
            matches = process.extract(val, unique_vals, limit=3)
            for m, score in matches:
                if score > 90:
                    canonical[m] = val
    df[column] = df[column].map(lambda x: canonical.get(x, x))
    return df

merged_df = fuzzy_clean_column(merged_df, 'name')
merged_df = fuzzy_clean_column(merged_df, 'email')

print("✅ Fuzzy normalization of names/emails completed.")

✅ Fuzzy normalization of names/emails completed.


In [11]:
# 11️⃣ TIMELINE GENERATION
# ===========================================

timeline_df = merged_df.groupby('resolved_entity').apply(
    lambda x: x.sort_values('timestamp')[['timestamp', 'location_id', 'source']].to_dict('records')
).reset_index().rename(columns={0: 'timeline'})

print(f"✅ Generated timelines for {len(timeline_df)} entities")


✅ Generated timelines for 4860 entities


In [12]:
# 12️⃣ FEATURE ENGINEERING FOR ML
# ===========================================

df_ml = merged_df.copy()
df_ml = df_ml.dropna(subset=['resolved_entity', 'timestamp'])
df_ml['hour'] = df_ml['timestamp'].dt.hour
df_ml['dayofweek'] = df_ml['timestamp'].dt.dayofweek
df_ml['month'] = df_ml['timestamp'].dt.month
df_ml['location_id'] = df_ml['location_id'].astype(str)

from sklearn.preprocessing import LabelEncoder
le_loc = LabelEncoder()
df_ml['loc_encoded'] = le_loc.fit_transform(df_ml['location_id'])

# Prepare dataset for supervised learning
X = df_ml[['hour', 'dayofweek', 'month', 'loc_encoded']]
y = df_ml['loc_encoded'].shift(-1).fillna(df_ml['loc_encoded'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("✅ Training / Testing datasets prepared.")

✅ Training / Testing datasets prepared.


In [13]:
# 13️⃣ RANDOM FOREST MODEL FOR NEXT LOCATION
# ===========================================

rf = RandomForestClassifier(n_estimators=120, random_state=42)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
print("\n🎯 Random Forest Performance:\n")
print(classification_report(y_test, preds))

importances = pd.Series(rf.feature_importances_, index=X.columns)
print("\nFeature Importances:\n", importances)



🎯 Random Forest Performance:

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       302
         1.0       0.09      0.01      0.02       312
         2.0       0.05      0.01      0.01       312
         3.0       0.07      0.01      0.01       324
         4.0       0.00      0.00      0.00       304
         5.0       0.04      0.00      0.01       311
         6.0       0.09      0.01      0.01       324
         7.0       0.12      0.01      0.02       297
         8.0       0.64      0.96      0.77      4357

    accuracy                           0.62      6843
   macro avg       0.12      0.11      0.09      6843
weighted avg       0.43      0.62      0.49      6843


Feature Importances:
 hour           0.492002
dayofweek      0.187470
month          0.044763
loc_encoded    0.275766
dtype: float64


In [15]:
print("\n✅ Building Markov chain transition probabilities...")

# Ensure we have a clean column for locations or states
if 'loc_encoded' not in merged_df.columns:
    # fallback: use location_encoded or location_id if available
    for alt in ['location_encoded', 'location_id', 'location']:
        if alt in merged_df.columns:
            merged_df['loc_encoded'] = merged_df[alt].astype(str)
            break

# Sort data by entity & timestamp to ensure correct transition sequence
merged_df = merged_df.sort_values(['resolved_entity', 'timestamp']).reset_index(drop=True)

# Build transition pairs
merged_df['next_loc'] = merged_df.groupby('resolved_entity')['loc_encoded'].shift(-1)

# Drop invalid rows
transitions_df = merged_df.dropna(subset=['loc_encoded', 'next_loc'])

# Compute transition counts
transitions = transitions_df.groupby(['loc_encoded', 'next_loc']).size().unstack(fill_value=0)

# Normalize to get probabilities
markov_matrix = transitions.div(transitions.sum(axis=1), axis=0)

print("✅ Markov transition matrix built successfully.")
print(f"Matrix shape: {markov_matrix.shape}")
markov_matrix.head()



✅ Building Markov chain transition probabilities...
✅ Markov transition matrix built successfully.
Matrix shape: (9, 9)


next_loc,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,nan
loc_encoded,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.0,0.017013,0.018904,0.013233,0.017013,0.013233,0.005671,0.022684,0.015123,0.877127
1.0,0.01306,0.018657,0.022388,0.005597,0.024254,0.01306,0.018657,0.016791,0.867537
2.0,0.017857,0.013889,0.015873,0.019841,0.021825,0.009921,0.011905,0.011905,0.876984
3.0,0.015267,0.013359,0.01145,0.022901,0.019084,0.015267,0.003817,0.020992,0.877863
4.0,0.018797,0.026316,0.018797,0.011278,0.020677,0.016917,0.011278,0.020677,0.855263


In [16]:
def predict_next_location(current_loc):
    if current_loc not in markov_matrix.index:
        return None
    probs = markov_matrix.loc[current_loc]
    return probs.idxmax()  # most likely next location

example_loc = merged_df['loc_encoded'].dropna().sample(1).iloc[0]
pred = predict_next_location(example_loc)
print(f"From location '{example_loc}', likely next location: {pred}")


From location 'nan', likely next location: nan


In [17]:
# 15️⃣ SAVE OUTPUTS
# ===========================================

output_path = os.path.join(DATA_DIR, "merged_entity_timeline.csv")
merged_df.to_csv(output_path, index=False)

joblib.dump(rf, os.path.join(DATA_DIR, "rf_model.pkl"))
joblib.dump(markov_matrix, os.path.join(DATA_DIR, "markov_model.pkl"))

print(f"\n✅ All done! Outputs saved to {DATA_DIR}")



✅ All done! Outputs saved to /content/drive/MyDrive/campus_data
