<a href="https://colab.research.google.com/github/Ecolly/Reunited/blob/main/ai_ml_training_lost%26found.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import re

# Read the raw lost-and-found records into the shared `df` frame.
df = pd.read_csv('delhimetrorail.csv')

RULE = "=" * 60


def show_banner(title, leading_blank=True):
    """Print `title` framed by two 60-character rules.

    When `leading_blank` is true the first rule is preceded by an empty
    line, which separates the section from the output printed before it.
    """
    prefix = "\n" if leading_blank else ""
    print(prefix + RULE)
    print(title)
    print(RULE)


# --- Overview: size, schema and a peek at the first rows -------------------
show_banner("DATASET OVERVIEW", leading_blank=False)
print(f"Total items: {len(df)}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nData types:\n{df.dtypes}")
print("\nFirst 5 rows:")
print(df.head())

# --- Null count per column -------------------------------------------------
show_banner("MISSING DATA CHECK")
print(df.isnull().sum())

# --- Frequency tables ------------------------------------------------------
# Counted once here and reused for the per-category samples further down.
item_counts = df['item_name'].value_counts()

show_banner("TOP 10 MOST COMMON ITEM TYPES")
print(item_counts.head(10))

show_banner("TOP 10 STATIONS WITH MOST ITEMS")
print(df['station_name'].value_counts().head(10))

# --- Date range ------------------------------------------------------------
# The column is parsed as day/month/year and stored back on `df`, so cells
# that run after this one see datetime values rather than strings.
df['receiving_date'] = pd.to_datetime(df['receiving_date'], format='%d/%m/%Y')
earliest = df['receiving_date'].min()
latest = df['receiving_date'].max()

show_banner("DATE RANGE")
print(f"From: {earliest}")
print(f"To: {latest}")
print(f"Time span: {(latest - earliest).days} days")

# --- Up to three example descriptions for each of the five commonest items -
show_banner("SAMPLE ITEMS BY CATEGORY")
for item_type in item_counts.head(5).index:
    print(f"\n{item_type}:")
    is_this_item = df['item_name'] == item_type
    samples = df.loc[is_this_item, 'description'].head(3).tolist()
    for i, sample in enumerate(samples, start=1):
        print(f"  {i}. {sample}")

DATASET OVERVIEW
Total items: 13713

Columns: ['item_name', 'description', 'item_quantity', 'station_name', 'receiving_date', 'receiving_time']

Data types:
item_name         object
description       object
item_quantity     object
station_name      object
receiving_date    object
receiving_time    object
dtype: object

First 5 rows:
      item_name                          description item_quantity  \
0         CHAIN                          METAL CHAIN             1   
1  EARBUDS CASE                        BLACK EARBUDS             1   
2     CARRY BAG                   OLD LADIES CLOTHES             1   
3           BAG  UTENSILS , PEN , HANGER & LUNCH BOX             1   
4        JACKET                          GREY JACKET             2   

                      station_name receiving_date receiving_time  
0                         VAISHALI     21/03/2024       11:35:00  
1              BRIG. HOSHIAR SINGH     20/03/2024       12:15:00  
2                          RITHALA     20/

# Data Cleaning

In [37]:
# Data cleaning cell.
#
# Builds `df_clean` from the `df` loaded by the first cell, leaving `df`
# itself untouched. Steps, in order:
#   1. fill missing descriptions with an empty string,
#   2. lowercase and strip the text columns,
#   3. make sure `receiving_date` is a datetime column,
#   4. combine item name and description into `full_description`.
# Requires `df` and `pd` to exist already (run the loading cell first).
print("*"*70)
print(" "* 30 + "DATA CLEANING")
print("*"*70)


missing_descriptions = df['description'].isna().sum()
print(f"Total items: {len(df)}")
print(f"Missing descriptions: {missing_descriptions}")
print(f"Missing quantities: {df['item_quantity'].isna().sum()}")

print("\n\n     Fixing missing data")
print("*" * 30)

# Work on a copy so the raw frame stays available for comparison.
df_clean = df.copy()
# Fill the existing lowercase 'description' column. Writing to a differently
# cased name would create a new column and leave the NaNs in place, which
# would then turn `full_description` into NaN for every such row.
df_clean['description'] = df_clean['description'].fillna('')
print(f"✓ Filled {missing_descriptions} missing descriptions with empty string")


print("\n\n     Standardizing text")
print("*" * 30)
print("Before:")
print(df_clean['item_name'].head(3).tolist())

df_clean['item_name'] = df_clean['item_name'].str.lower().str.strip()
df_clean['description'] = df_clean['description'].str.lower().str.strip()
df_clean['station_name'] = df_clean['station_name'].str.lower().str.strip()

print("\nAfter:")
print(df_clean['item_name'].head(3).tolist())
print("✓ Converted all text to lowercase and removed extra spaces")


print("\n\n     converting dates to proper format")
print("*" * 50)

print(f"Before conversion:")
print(f"  Type: {df_clean['receiving_date'].dtype}")
print(f"  Example: {df_clean['receiving_date'].iloc[0]}")

# The loading cell already parses this column on `df`, so it normally arrives
# here as datetime and the call below leaves the values as they are. It is
# kept so the cell also works when the column is still day/month/year text.
missing_dates_before = df_clean['receiving_date'].isna().sum()
df_clean['receiving_date'] = pd.to_datetime(
    df_clean['receiving_date'],
    format='%d/%m/%Y',
    errors='coerce')
# errors='coerce' turns unparseable values into NaT without raising, so
# report how many values were lost instead of dropping them silently.
unparsed_dates = df_clean['receiving_date'].isna().sum() - missing_dates_before

print(f"\nAfter conversion:")
print(f"  Type: {df_clean['receiving_date'].dtype}")
print(f"  Example: {df_clean['receiving_date'].iloc[0]}")
if unparsed_dates > 0:
    print(f"  ! {unparsed_dates} dates could not be parsed and were set to NaT")

print(f"\n✓ Date range:")
print(f"  From: {df_clean['receiving_date'].min()}")
print(f"  To: {df_clean['receiving_date'].max()}")
print(f"  Span: {(df_clean['receiving_date'].max() - df_clean['receiving_date'].min()).days} days")

print("\n\n Combining item name + description")
print("*"*70)

# A missing item name is treated as empty so one NaN cannot blank the whole
# combined text; the final strip removes the separator space that is left
# over when either side is empty.
df_clean['full_description'] = (
    df_clean['item_name'].fillna('') + ' ' + df_clean['description']
).str.strip()

print("Examples:")
for i in range(min(3, len(df_clean))):
    print(f"\n  Item {i+1}:")
    print(f"    Name: {df_clean.iloc[i]['item_name']}")
    print(f"    Description: {df_clean.iloc[i]['description']}")
    print(f"    Combined: {df_clean.iloc[i]['full_description']}")

print("\n✓ Created 'full_description' column")

print("\n\nData cleaning done!")
print("Changes made: ")
print("\n  ✓ Filled missing descriptions with empty string")
print("  ✓ Standardized all text to lowercase")
print("  ✓ Removed extra spaces")
print("  ✓ Converted dates to proper format")
print("  ✓ Combined item name + description")


print("\n" + "*"*70)
print(" "* 10 + "Cleaned data of the first 10 as a sample")
print("*"*70)
print(df_clean[['full_description','item_quantity','station_name','receiving_date','receiving_time']].head(10))

# Null counts of the cleaned frame (not the raw `df`), so this output shows
# what is still missing after cleaning; `item_quantity` is not filled above.
print(df_clean.isnull().sum())













**********************************************************************
                              DATA CLEANING
**********************************************************************
Total items: 13713
Missing descriptions: 8208
Missing quantities: 10511


     Fixing missing data
******************************
✓ Filled 8208 missing descriptions with empty string


     Standardizing text
******************************
Before:
['CHAIN', 'EARBUDS CASE', 'CARRY BAG']

After:
['chain', 'earbuds case', 'carry bag']
✓ Converted all text to lowercase and removed extra spaces


     converting dates to proper format
**************************************************
Before conversion:
  Type: datetime64[ns]
  Example: 2024-03-21 00:00:00

After conversion:
  Type: datetime64[ns]
  Example: 2024-03-21 00:00:00

✓ Date range:
  From: 2021-12-15 00:00:00
  To: 2024-03-21 00:00:00
  Span: 827 days


 Combining item name + description
************************************************************