In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the raw crime dataset; the path is relative to the notebook's working directory.
crime_csv_path = '../Csvs/crime.csv'
df = pd.read_csv(crime_csv_path)

In [None]:
# 2.1 Initial Exploration
# Jupyter renders only the LAST bare expression of a cell, so intermediate
# summaries are printed explicitly; a bare `df.head()` / `df.describe()` in the
# middle of a cell produces no output at all.
print("Dataset Shape:", df.shape)
print(df.head())
df.info()  # Prints dtypes and non-null counts on its own
print(df.describe())  # Summary statistics of the numeric columns

# 2.2 Data Cleaning
# Fill missing values with 0 in NUMERIC columns only. Text columns keep their
# NaN so they never end up holding a mix of strings and the integer 0.
numeric_columns = df.select_dtypes(include='number').columns
df = df.fillna(dict.fromkeys(numeric_columns, 0))

# Remove exact duplicate rows, if any
df.drop_duplicates(inplace=True)

# Normalize column names: trim, upper-case and replace separators, so that a
# name such as 'A/B & C' becomes 'A_B_AND_C'. The replacements are literal
# (regex=False), not regular expressions.
df.columns = (
    df.columns.str.strip()
    .str.upper()
    .str.replace(' ', '_', regex=False)
    .str.replace('/', '_', regex=False)
    .str.replace('&', 'AND', regex=False)
)
# print(df.columns)

# Standardize the state names: trim surrounding whitespace, then title-case
df['STATE_UT'] = df['STATE_UT'].str.strip().str.title()

# Now, handle specific cases that title() doesn't fix correctly.
# For example, "A & N Islands" stays "A & N Islands", but we want "Andaman & Nicobar Islands".
# This is a common manual mapping process.

# Map known variants (as they look AFTER title-casing) to one standard name
state_correction_map = {
    'A & N Islands': 'Andaman & Nicobar Islands',
    'A&N Islands': 'Andaman & Nicobar Islands', # if it exists
    'D & N Haveli': 'Dadra & Nagar Haveli',
    'D&N Haveli': 'Dadra & Nagar Haveli',
    'Delhi Ut': 'Delhi',
    'Nct Of Delhi': 'Delhi',
}

# Record the count before mapping so the "after" number has something to be compared with
print("Unique STATE_UT values before mapping:", df['STATE_UT'].nunique())

# Apply the mapping to fix specific cases
df['STATE_UT'] = df['STATE_UT'].replace(state_correction_map)

# Print the results to verify
print("\nAfter cleaning:")
print(df['STATE_UT'].nunique()) # Lower than the count above if any variant was merged

# Optional extra check
# print("Missing values after cleaning:\n", df.isnull().sum())

# Show a small preview rather than dumping the whole frame
df.head()

In [None]:
# Build the composite 'CRIMES_AGAINST_WOMEN' feature from the individual
# women-related crime columns listed here.
women_crime_columns = [
    'RAPE',
    'KIDNAPPING_AND_ABDUCTION_OF_WOMEN_AND_GIRLS',
    'DOWRY_DEATHS',
    'ASSAULT_ON_WOMEN_WITH_INTENT_TO_OUTRAGE_HER_MODESTY',
    'INSULT_TO_MODESTY_OF_WOMEN',
    'CRUELTY_BY_HUSBAND_OR_HIS_RELATIVES',
]

# Row-wise total over those columns; missing entries count as 0.
women_crime_counts = df[women_crime_columns].fillna(0)
df['CRIMES_AGAINST_WOMEN'] = women_crime_counts.sum(axis=1)

# Make sure the key analysis columns have no gaps; YEAR is stored as an integer.
df['TOTAL_IPC_CRIMES'] = df['TOTAL_IPC_CRIMES'].fillna(0)
df['YEAR'] = df['YEAR'].fillna(0).astype(int)

# Preview the columns used by the questions below
preview_columns = ['STATE_UT', 'YEAR', 'TOTAL_IPC_CRIMES', 'CRIMES_AGAINST_WOMEN']
print(df[preview_columns].head())

In [None]:
# Question 1
# Total IPC crimes per State/UT over all years, largest first
statewise_total = (
    df.groupby('STATE_UT')['TOTAL_IPC_CRIMES']
    .sum()
    .sort_values(ascending=False)
)

# Bar chart of the ten largest totals
fig, ax = plt.subplots(figsize=(14, 6))
statewise_total.head(10).plot(kind='bar', color='green', ax=ax)
ax.set_title('Top 10 States/UTs with Highest Total IPC Crimes (All Years)')
ax.set_ylabel('Total Crimes Reported')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Question 2: How have crimes changed from 2001 to 2012?
# National total of IPC crimes per year. groupby sorts its keys by default,
# so the first/last entries are the earliest/latest year present in the data.
national_trend = df.groupby('YEAR')['TOTAL_IPC_CRIMES'].sum()

# Plot the trend line
plt.figure(figsize=(12, 6))
plt.plot(national_trend.index, national_trend.values, marker='o', linestyle='-', linewidth=2)
plt.title('National Trend of Total IPC Crimes (2001-2012)')
plt.xlabel('Year')
plt.ylabel('Number of Crimes')
plt.grid(True)
plt.show()

# Overall percentage change between the FIRST and the LAST year in the data.
# Using min()/max() of the series would instead compare the lowest and highest
# yearly totals, which is not the change over the period named in the message.
start = national_trend.iloc[0]   # First year might not be 2001 if data is missing
end = national_trend.iloc[-1]
# A zero baseline makes the percentage undefined; report NaN instead of inf.
pct_change = ((end - start) / start) * 100 if start != 0 else float('nan')
print(f"Overall % Change from {national_trend.index[0]} to {national_trend.index[-1]}: {pct_change:.2f}%")

In [None]:
# Question 3: Crimes against women, where are they rising?

# Part 1: yearly national total of crimes against women
women_national_trend = df.groupby('YEAR')['CRIMES_AGAINST_WOMEN'].sum()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(women_national_trend.index, women_national_trend.values, marker='s', color='purple')
ax.set_title('National Trend of Crimes Against Women (2001-2012)')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Crimes')
ax.grid(True)
plt.show()

# Part 2: states with the highest percentage increase in crimes against women.
# One row per state, one column per year; missing combinations become 0.
pivot_df_women = df.pivot_table(
    index='STATE_UT',
    columns='YEAR',
    values='CRIMES_AGAINST_WOMEN',
    aggfunc='sum',
    fill_value=0,
)
start_year = 2001 # Assumes the data starts here
end_year = 2012   # Assumes the data ends here
baseline_women = pivot_df_women[start_year]
pivot_df_women['PCT_CHANGE'] = ((pivot_df_women[end_year] - baseline_women) / baseline_women) * 100

# A zero baseline yields inf or NaN; drop those rows, then keep the ten largest changes.
finite_women_changes = pivot_df_women['PCT_CHANGE'].replace([np.inf, -np.inf], np.nan).dropna()
rising_states_women = finite_women_changes.sort_values(ascending=False).head(10)

fig, ax = plt.subplots(figsize=(14, 6))
rising_states_women.plot(kind='bar', color='orange', ax=ax)
ax.set_title(f'Top 10 States/UTs with Highest % Increase in Crimes Against Women ({start_year}-{end_year})')
ax.set_ylabel('Percentage Increase %')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Question 4: Reuse the national trend from Q2
# NOTE: this cell relies on `pct_change`, `start_year` and `end_year` defined by earlier cells.
print(f"Total IPC Crime trend change: {pct_change:.2f}%") # From Question 2

# Per-state totals by year, then the percentage change between the two reference years
state_pivot = df.pivot_table(
    index='STATE_UT',
    columns='YEAR',
    values='TOTAL_IPC_CRIMES',
    aggfunc='sum',
    fill_value=0,
)
baseline_total = state_pivot[start_year]
state_pivot['PCT_CHANGE'] = ((state_pivot[end_year] - baseline_total) / baseline_total) * 100

# Drop undefined changes (zero baseline), then take the five smallest values,
# i.e. the most negative percentages when any state actually decreased.
finite_total_changes = state_pivot['PCT_CHANGE'].replace([np.inf, -np.inf], np.nan).dropna()
decreasing_states = finite_total_changes.sort_values().head(5)

fig, ax = plt.subplots(figsize=(12, 6))
decreasing_states.plot(kind='bar', color='green', ax=ax)
ax.set_title(f'Top 5 States/UTs with Largest Decrease in Total Crime ({start_year}-{end_year})')
ax.set_ylabel('Percentage Decrease %')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Vertical bar chart of the ten States/UTs with the highest total IPC crimes
fig, ax = plt.subplots(figsize=(12, 6))

# Per-state total over all years, largest first, limited to ten entries
top_states = (
    df.groupby('STATE_UT')['TOTAL_IPC_CRIMES']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

# Draw the bars on the axes created above
top_states.plot(kind='bar', color='maroon', edgecolor='black', ax=ax)

ax.set_title('Top 10 States/UTs by Total IPC Crimes (All Years)', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('State / Union Territory', fontsize=12)
ax.set_ylabel('Total Crimes Reported', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right') # Rotate labels for readability

# Write each total just above its bar (fixed offset of 1000 in data units)
for bar_position, state_total in enumerate(top_states.values):
    ax.text(bar_position, state_total + 1000, f'{state_total:,.0f}', ha='center', va='bottom', fontsize=9)

fig.tight_layout()
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Line plot of the national IPC crime total per year, with the first and last points labelled
fig, ax = plt.subplots(figsize=(14, 7))

# Country-wide total for each year
national_trend = df.groupby('YEAR')['TOTAL_IPC_CRIMES'].sum()
print(national_trend)

ax.plot(
    national_trend.index,
    national_trend.values,
    marker='o',
    linestyle='-',
    linewidth=3,
    markersize=8,
    color='royalblue',
)

ax.set_title('National Trend of Total IPC Crimes (2001-2012)', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Number of Crimes', fontsize=12)
ax.grid(True, linestyle='--', alpha=0.7)

# Label the first and last data points with their values.
# Note: this rebinds the notebook-level start_year / end_year to the first and last year in the data.
start_year = national_trend.index[0]
end_year = national_trend.index[-1]
endpoints = (
    (start_year, national_trend.iloc[0]),
    (end_year, national_trend.iloc[-1]),
)
for point_year, point_total in endpoints:
    ax.annotate(f'{point_total:,}', xy=(point_year, point_total), xytext=(5, 10),
                textcoords='offset points', fontsize=10, color='darkblue')

fig.tight_layout()
plt.show()

In [None]:
# Heatmap of total IPC crimes for every State/UT (rows) and year (columns)
fig, ax = plt.subplots(figsize=(16, 20)) # Tall figure so every state row fits

# States as rows, years as columns, summed crimes as values (0 where a pair is absent)
pivot_table = df.pivot_table(
    index='STATE_UT',
    columns='YEAR',
    values='TOTAL_IPC_CRIMES',
    aggfunc='sum',
    fill_value=0,
)

heatmap_options = dict(
    cmap='YlOrRd',   # Yellow-Orange-Red color scale
    annot=False,     # Set to True to see numbers, but it gets crowded
    fmt='d',
    linewidths=0.5,
    cbar_kws={'label': 'Total IPC Crimes', 'shrink': 0.8},
)
sns.heatmap(pivot_table, ax=ax, **heatmap_options)

ax.set_title('Heatmap of IPC Crimes by State/UT and Year', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('State / Union Territory', fontsize=12)
fig.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart of the 15 crime types with the largest overall counts.
# Every column except the identifiers and the aggregate columns listed here is
# treated as a specific crime type.
non_crime_columns = {'STATE_UT', 'DISTRICT', 'YEAR', 'TOTAL_IPC_CRIMES', 'CRIMES_AGAINST_WOMEN'}
crime_columns = [col for col in df.columns if col not in non_crime_columns]

# Total of each crime type over all states and years, largest first
crime_totals = df[crime_columns].sum().sort_values(ascending=False)

# Top 15, re-sorted ascending so the largest bar ends up at the top of the barh chart
top_crimes = crime_totals.head(15).sort_values()

fig, ax = plt.subplots(figsize=(14, 10))
top_crimes.plot(kind='barh', color='steelblue', edgecolor='black', ax=ax)

ax.set_title('Top 15 Most Prevalent Types of Crime (All States, All Years)', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Total Count', fontsize=12)
ax.set_ylabel('Type of Crime', fontsize=12)
ax.grid(axis='x', linestyle='--', alpha=0.7)

# Write each total just past the end of its bar (fixed offset of 1000 in data units)
for row_position, crime_count in enumerate(top_crimes.values):
    ax.text(crime_count + 1000, row_position, f'{crime_count:,.0f}', va='center', fontsize=10)

fig.tight_layout()
plt.show()