In [1]:
import pandas as pd

from google.colab import files  #used to import dataset from system
f = files.upload()

#load the dataset in variable df
df = pd.read_csv('Airplane_Crashes_and_Fatalities_Since_1908.csv')
#gives the information about data set
print(df.info())
# remove duplicate columns
df = df.loc[:, ~df.columns.duplicated()]
# Check for duplicate rows and droping those
df = df.drop_duplicates()
# Checking for column for NaN values and droping those rows
nan_columns = df.columns[df.isna().any()].tolist()
df.dropna(inplace=True)
# Renaming columns
df.columns = df.columns.str.replace('Route', 'PathWay').str.lower()
# Conversion of values to lowercase letters
df = df.apply(lambda Location: Location.astype(str).str.lower())
# Removal of extra spaces
df = df.apply(lambda Location: Location.str.strip() if Location.dtype == "object" else Location)
# Conversion of binary values to 1 and 0
binary_columns = ['aboard']  # Specify columns with binary values
df[binary_columns] = df[binary_columns].replace({r'^-?\d+$': lambda x: int(x) if pd.notna(x) else x}, regex=True)

# Transfer cleaned data to a new table
cleaned_df = df.copy()

# Exploratory Data Analysis (EDA)
# Example: Display basic statistics
print("|")
print("|")

print(cleaned_df.info())

from google.colab import files
cleaned_df.to_csv('cleaned_Airplane_crashes.csv', index=True)
"""files.download('cleaned_Airplane_crashes.csv')
"""

Saving Airplane_Crashes_and_Fatalities_Since_1908.csv to Airplane_Crashes_and_Fatalities_Since_1908.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5268 entries, 0 to 5267
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Date          5268 non-null   object 
 1   Time          3049 non-null   object 
 2   Location      5248 non-null   object 
 3   Operator      5250 non-null   object 
 4   Flight #      1069 non-null   object 
 5   Route         3562 non-null   object 
 6   Type          5241 non-null   object 
 7   Registration  4933 non-null   object 
 8   cn/In         4040 non-null   object 
 9   Aboard        5246 non-null   float64
 10  Fatalities    5256 non-null   float64
 11  Ground        5246 non-null   float64
 12  Summary       4878 non-null   object 
dtypes: float64(3), object(10)
memory usage: 535.2+ KB
None
|
|
<class 'pandas.core.frame.DataFrame'>
Int64Index: 944 entries, 208 to 5265
Data 

"files.download('cleaned_Airplane_crashes.csv')\n"

PHASE 3

Some more Data mutation


In [5]:

cleaned_df['date'] = pd.to_datetime(cleaned_df['date'])
cleaned_df['year'] = cleaned_df['date'].dt.year
cleaned_df['month'] = cleaned_df['date'].dt.month
cleaned_df['day'] = cleaned_df['date'].dt.day


cleaned_df['aboard'] = pd.to_numeric(cleaned_df['aboard'], errors='coerce')
cleaned_df['fatalities'] = pd.to_numeric(cleaned_df['fatalities'], errors='coerce')
cleaned_df['ground'] = pd.to_numeric(cleaned_df['ground'], errors='coerce')


cleaned_df = cleaned_df.drop(['date', 'time', 'location', 'operator', 'flight #', 'pathway', 'type', 'registration', 'cn/in', 'summary'], axis=1)

print(cleaned_df.head())

     aboard  fatalities  ground  year  month  day
208    16.0        16.0     0.0  1930      1   19
236     8.0         8.0     0.0  1931      3   31
334     5.0         5.0     0.0  1934      8   31
354    14.0         5.0     0.0  1935      5    6
365     4.0         4.0     0.0  1935      8   14


model training


In [6]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

# Load your dataset (replace 'your_dataset.csv' with your actual dataset)
# df = pd.read_csv('your_dataset.csv')

# Assuming 'label' is your target column and other columns are features
# Adjust this based on your actual column names
X = cleaned_df.drop('fatalities', axis=1)
y = cleaned_df['fatalities']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)

# Fit the model
dt_model.fit(X_train, y_train)

# Predictions
y_pred = dt_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, zero_division=1)

# Print the results
print(f"Decision Tree Accuracy: {accuracy}")
print(f"Decision Tree Precision: {precision}")
print(f"Decision Tree Recall: {recall}")
print("Decision Tree Confusion Matrix:")
print(conf_matrix)
print("Classification Report for Decision Tree:")
print(class_report)


Decision Tree Accuracy: 0.30158730158730157
Decision Tree Precision: 0.5890526581002771
Decision Tree Recall: 0.30158730158730157
Decision Tree Confusion Matrix:
[[0 0 0 ... 0 0 1]
 [0 3 2 ... 0 0 0]
 [0 1 2 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Classification Report for Decision Tree:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         8
         1.0       0.25      0.27      0.26        11
         2.0       0.20      0.33      0.25         6
         3.0       0.50      0.67      0.57         6
         4.0       0.40      0.40      0.40         5
         5.0       0.50      0.60      0.55         5
         6.0       0.50      0.50      0.50         2
         7.0       0.00      0.00      0.00         1
         8.0       0.40      0.67      0.50         3
         9.0       0.50      0.25      0.33         4
        10.0       0.50      0.25      0.33         4
        11.0       0.67     

visualization


In [7]:
from sklearn.tree import export_graphviz
import graphviz
from google.colab import files

# Export the decision tree to DOT format
dot_data = export_graphviz(dt_model, out_file=None,
                           feature_names=X.columns,
                           class_names=[str(i) for i in dt_model.classes_],
                           filled=True, rounded=True, special_characters=True)

# Visualize the decision tree
graph = graphviz.Source(dot_data)

# Render the decision tree to a PDF file
pdf_filename = "decision_tree"
graph.render(pdf_filename, format="pdf", cleanup=True)

# Download the PDF file
files.download(pdf_filename + ".pdf")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>