Set Up Environment & Import Dependencies

Libraries

In [14]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

: 

 Load and Explore Data

In [None]:
# Path to the processed launch dataset, relative to the notebook's working directory
file = 'Processed_SpaceX_Launch_Dataset.csv'

# Fail fast when the dataset is missing. Previously the missing-file branch only
# printed a message and execution continued, so the first use of `df` below either
# crashed with an unrelated NameError or silently reused a stale `df` still living
# in the kernel from an earlier run.
if not os.path.exists(file):
    raise FileNotFoundError(f"❌ File not found: {file}")

# Read the CSV file into a Pandas DataFrame
df = pd.read_csv(file, encoding='utf-8')

# Display the first few rows of the DataFrame
display(df.head())

# Check for missing values (count of NaNs per column)
print("\nMissing Values in Dataset:")
print(df.isnull().sum())

# Display summary statistics
print("\nSummary Statistics:")
print(df.describe())


Data Cleaning & Preprocessing

In [None]:
# Drop irrelevant columns explicitly mentioned
columns_to_drop = ['fairings']

# Identify columns with more than 50% missing data
missing_threshold = 0.5  # 50% threshold
cols_to_drop = df.columns[df.isnull().mean() > missing_threshold].tolist()

# Combine both drop lists, keeping only columns that actually exist in the DataFrame.
# dict.fromkeys de-duplicates while preserving order, so the list printed at the end
# of this cell is identical on every run (iteration order of a set of strings is not).
all_columns_to_drop = [
    col for col in dict.fromkeys(columns_to_drop + cols_to_drop) if col in df.columns
]

# `columns=` already names the axis, so no separate `axis` argument is needed
df_cleaned = df.drop(columns=all_columns_to_drop)

# Handling missing values (replace NaNs in numeric columns with the column median)
numeric_cols = df_cleaned.select_dtypes(include=[np.number]).columns
df_cleaned[numeric_cols] = df_cleaned[numeric_cols].fillna(df_cleaned[numeric_cols].median())

# Convert categorical variables into numerical values using one-hot encoding
categorical_cols = ['Launch Site', 'Weather Condition']
existing_categorical_cols = [col for col in categorical_cols if col in df_cleaned.columns]
if existing_categorical_cols:
    # dtype=int emits 0/1 integers. Recent pandas versions default to bool dummies,
    # which the later `select_dtypes(include=[np.number])` calls would exclude from
    # both the correlation heatmap and the feature matrix.
    df_cleaned = pd.get_dummies(
        df_cleaned,
        columns=existing_categorical_cols,
        drop_first=True,
        dtype=int,
    )

# Display cleaned data
display(df_cleaned.head())

# Print dropped columns for verification
print(f"\n✅ Dropped Columns: {all_columns_to_drop}")
print(f"✅ Remaining Columns: {df_cleaned.columns.tolist()}")


Exploratory Data Analysis (EDA)

In [None]:
# Visualizing Landing Success (row count per value of the 'success' column)
plt.figure(figsize=(8, 5))
sns.countplot(x=df['success'])
plt.title("Success Rate of Falcon 9 Landings")
plt.show()

# Correlation heatmap over the numeric columns of the cleaned data
plt.figure(figsize=(10, 6))
numeric_df = df_cleaned.select_dtypes(include=[np.number])
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Feature Correlation Heatmap")
plt.show()

# Histograms of individual features.
# The titles and x-labels are derived from the plotted column; the previous
# copy-pasted stanzas labelled both plots "Payload Mass (kg)" even though they show
# 'static_fire_date_unix' and 'flight_number'.
# Each column is checked first because the cleaning cell drops any column with more
# than 50% missing values, which would otherwise surface here as a KeyError.
for column in ['static_fire_date_unix', 'flight_number']:
    if column not in df_cleaned.columns:
        print(f"Skipping histogram: column '{column}' is not in df_cleaned")
        continue
    plt.figure(figsize=(8, 5))
    sns.histplot(df_cleaned[column], bins=20, kde=True)
    plt.title(f"Distribution of {column}")
    plt.xlabel(column)
    plt.ylabel("Frequency")
    plt.show()

# Scatter Plot: static_fire_date_unix vs Landing Success
# Assuming 'success' column represents the landing outcome
if {'static_fire_date_unix', 'success'}.issubset(df_cleaned.columns):
    plt.figure(figsize=(8, 5))
    sns.scatterplot(x='static_fire_date_unix', y='success', data=df_cleaned)
    plt.title("Static_fire_date_unix vs Landing Success")
    plt.xlabel("Static_fire_date_unix")
    plt.ylabel("Landing Success (1=Success, 0=Failure)")
    plt.show()
else:
    print("Skipping scatter plot: 'static_fire_date_unix' or 'success' is not in df_cleaned")

Feature Selection & Splitting Dataset

In [None]:
# Define features (X) and target variable (y)
# Assuming 'success' column represents the landing outcome
y = df_cleaned['success']

# Features: every remaining column except the target, restricted to numeric dtypes
X = df_cleaned.drop(columns=['success']).select_dtypes(include=[np.number])

# Stratify on the target so the train and test splits keep the same class ratio;
# without it a small, imbalanced dataset can end up with a test split that barely
# contains the minority class. Stratification needs at least two rows per class,
# so fall back to a plain random split when that does not hold.
class_counts = y.value_counts()
stratify_labels = y if len(class_counts) > 0 and class_counts.min() >= 2 else None

# Split dataset into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

# Normalize numerical features. The scaler is fitted on the training split only and
# then applied to the test split, so test statistics do not leak into the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Check the columns in df_cleaned (first five only)
print(list(df_cleaned.columns)[:5])

Train Multiple Machine Learning Models

In [None]:
# Candidate classifiers, keyed by the label used in the printed report
models = dict([
    ("Logistic Regression", LogisticRegression()),
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5, random_state=42)),
    ("SVM", SVC(kernel='linear')),
])

# Fit each candidate on the scaled training split, then score it on the scaled test split
for name in models:
    model = models[name]
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"\n{name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))

Hyperparameter Tuning (Grid Search CV)

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: every combination of these two decision-tree settings is tried
param_grid = dict(
    max_depth=[3, 5, 10],
    min_samples_split=[2, 5, 10],
)

# Cross-validated grid search (cv=5) over a seeded decision tree, ranked by accuracy
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train_scaled, y_train)

# Keep the winning estimator; the evaluation cells below predict with it
best_model = grid_search.best_estimator_
print("\nBest Decision Tree Model:", best_model)


Model Evaluation and Confusion Matrix

In [None]:
# Score the scaled test split with the tuned estimator
y_pred_best = best_model.predict(X_test_scaled)

# Confusion matrix as an annotated heatmap: actual classes on the y-axis,
# predicted classes on the x-axis
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    confusion_matrix(y_test, y_pred_best),
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

# Text summary of the tuned model's predictions on the test split
report = classification_report(y_true=y_test, y_pred=y_pred_best)
print("\n📌 Classification Report:\n", report)

# Overall fraction of correct predictions
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_best)
print(f"\n✅ Model Accuracy: {accuracy:.4f}")

# Single-number precision / recall / F1, using each scorer's default settings
precision = precision_score(y_true=y_test, y_pred=y_pred_best)
recall = recall_score(y_true=y_test, y_pred=y_pred_best)
f1 = f1_score(y_true=y_test, y_pred=y_pred_best)

print(f"🔹 Precision: {precision:.4f}")
print(f"🔹 Recall: {recall:.4f}")
print(f"🔹 F1 Score: {f1:.4f}")


: 