In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read AviationData.csv (path relative to the notebook's working directory)
# into `df`, the frame every later cell operates on, then preview the first
# rows to sanity-check the parse.
df = pd.read_csv('AviationData.csv')
df.head()

In [None]:
# Print a structural summary of `df`: columns, non-null counts and dtypes.
df.info()

In [None]:
# Show descriptive statistics (count, mean, std, min, quartiles, max);
# by default pandas restricts this to numeric columns when any are present.
df.describe()

In [None]:
# Tally the missing (NaN/None) entries in each column and report them.
missing_values = df.isna().sum()
print("Missing values:\n", missing_values)

In [None]:
# Impute missing values one column at a time:
#   * int64 / float64 columns -> the column's mean
#   * every other column      -> the literal placeholder 'Unknown'
# NOTE(review): mean-filling also applies to any numeric column that is later
# summed, so those totals will include imputed values -- confirm this is intended.
for col_name in df.columns:
    values = df[col_name]
    is_numeric = values.dtype in ('int64', 'float64')
    replacement = values.mean() if is_numeric else 'Unknown'
    df[col_name] = values.fillna(replacement)

In [None]:
# Normalise dtypes: int64 / float64 columns are cast to float,
# every other column is cast to str.
for col_name in df.columns:
    target_type = float if df[col_name].dtype in ('int64', 'float64') else str
    df[col_name] = df[col_name].astype(target_type)


In [None]:
# Show the dtype pandas assigned to each column.
print(df.dtypes)

# Report any column whose cells hold more than one Python type.
for col_name in df.columns:
    seen_types = df[col_name].map(type).unique()
    if len(seen_types) <= 1:
        continue  # homogeneous column, nothing to report
    print(f"Column '{col_name}' has mixed types: {seen_types}")

In [None]:
# Re-assert column types: int64 / float64 columns go through pd.to_numeric
# (entries that cannot be parsed become NaN); all other columns are cast to str.
for col_name in df.columns:
    series = df[col_name]
    if series.dtype not in ['int64', 'float64']:
        df[col_name] = series.astype(str)
        continue
    df[col_name] = pd.to_numeric(series, errors='coerce')


In [None]:
# Remove duplicates: drop rows that repeat an earlier row across all columns
# (pandas keeps the first occurrence by default) and rebind `df` to the result.
df = df.drop_duplicates()

In [None]:
# Parse Event.Date into datetimes (values that cannot be parsed become NaT)
# and derive a Year column from the parsed dates.
event_dates = pd.to_datetime(df['Event.Date'], errors='coerce')
df['Event.Date'] = event_dates
df['Year'] = event_dates.dt.year

# Accidents per year: count Event.Id within each Year group.
accident_trends = (
    df.groupby('Year')['Event.Id']
    .count()
    .reset_index()
    .rename(columns={'Event.Id': 'Accident_Count'})
)
accident_trends.head()

In [None]:
# Line chart of the yearly accident counts held in `accident_trends`.
# pyplot is already imported as `plt` in the notebook's first cell, so it is
# not re-imported here; the explicit Axes interface keeps the figure self-contained.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(accident_trends['Year'], accident_trends['Accident_Count'], marker='o')
ax.set_title('Accidents Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Accidents')
ax.grid(True)
plt.show()

In [None]:
# Tally accidents per location (count of Event.Id within each Location group).
location_accidents = (
    df.groupby('Location')['Event.Id']
    .count()
    .reset_index()
    .rename(columns={'Event.Id': 'Accident_Count'})
)

# Keep the ten locations with the highest counts and print them.
ranked_locations = location_accidents.sort_values(by='Accident_Count', ascending=False)
top_locations = ranked_locations.head(10)
print(top_locations)

In [None]:
# Bar chart of the ten locations in `top_locations`, one bar per location.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(top_locations['Location'], top_locations['Accident_Count'], color='skyblue')
ax.set_title('Top 10 Accident Locations')
ax.set_xlabel('Location')
ax.set_ylabel('Number of Accidents')
plt.show()

In [None]:
# Total fatal injuries per weather condition; as_index=False keeps
# Weather.Condition as an ordinary column of the result.
weather_groups = df.groupby('Weather.Condition', as_index=False)
fatalities_by_weather = weather_groups['Total.Fatal.Injuries'].sum()

print(fatalities_by_weather)

In [None]:
# Bar chart of the summed fatalities in `fatalities_by_weather`,
# one bar per weather condition.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    fatalities_by_weather['Weather.Condition'],
    fatalities_by_weather['Total.Fatal.Injuries'],
    color='orange',
)
ax.set_title('Fatalities by Weather Condition')
ax.set_xlabel('Weather Condition')
ax.set_ylabel('Total Fatalities')
plt.show()

In [None]:
# Scatter plot of accident coordinates.
#
# The cleaning cells fill every non-numeric column with 'Unknown' and cast it
# to str, so if the coordinate columns were not read as numbers they hold
# strings here and matplotlib would plot them on categorical axes. Coercing
# to numeric turns anything unparseable into NaN, which scatter() skips; for
# columns that are already numeric the coercion is a no-op.
longitude = pd.to_numeric(df['Longitude'], errors='coerce')
latitude = pd.to_numeric(df['Latitude'], errors='coerce')

# pyplot is already imported as `plt` in the notebook's first cell.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(longitude, latitude, alpha=0.5, c='red', label='Accidents')
ax.set_title('Accident Locations')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.legend()
plt.show()

In [None]:
# Count accidents per broad phase of flight (count of Event.Id in each group).
flight_phase = (
    df.groupby('Broad.phase.of.flight')['Event.Id']
    .count()
    .reset_index()
    .rename(columns={'Event.Id': 'Accident_Count'})
)

# Display the table as the cell's output. A DataFrame is not callable, so the
# previous `flight_phase()` raised TypeError instead of showing the result.
flight_phase

In [None]:
# Pie chart of each flight phase's share of accidents, labelled with one-decimal percentages.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    flight_phase['Accident_Count'],
    labels=flight_phase['Broad.phase.of.flight'],
    autopct='%1.1f%%',
)
ax.set_title('Accidents by Flight Phase')
plt.show()