In [None]:

!pip install pandas scikit-learn numpy matplotlib seaborn -q

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns


# Lift pandas' display limits so wide frames and long cell values are
# printed in full instead of being elided.
for _option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(_option, None)

# Load the launch dataset from the working directory. If the CSV is missing,
# report it and substitute a 10-row synthetic frame so the rest of the cell
# can still run end to end.
data_file_path = 'spacex.csv'

try:
    df = pd.read_csv(data_file_path)
except FileNotFoundError:
    print(f"CRITICAL ERROR: '{data_file_path}' not found in Colab session storage.")
    print("Please ensure you have manually uploaded 'spacex.csv' to the Colab files tab (folder icon on left sidebar).")
    print("--- Creating a small DUMMY DataFrame for demonstration as a LAST RESORT FALLBACK ---")

    # Synthetic stand-in: ten launches, already carrying a 'Class' label.
    _fallback_columns = {
        'FlightNumber': range(1, 11),
        'PayloadMass': [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000],
        'Orbit': ['LEO', 'GTO', 'LEO', 'ISS', 'GTO', 'LEO', 'ISS', 'GTO', 'LEO', 'ISS'],
        'LaunchSite': ['CCSFS SLC 40', 'KSC LC 39A', 'VAFB SLC 4E'] * 3 + ['CCSFS SLC 40'],
        'Flights': [1] * 10,
        'GridFins': [True] * 5 + [False] * 5,
        'Reused': [False, True] * 5,
        'Legs': [True] * 8 + [False] * 2,
        'LandingPad': ['5e9e3032383ecb267a34e7c7', '5e9e3032383ecb6bb234e7ca'] * 5,
        'Block': [1.0, 2.0, 3.0, 4.0, 5.0] * 2,
        'ReusedCount': [0, 1] * 5,
        'Class': [1, 0, 1, 1, 0, 1, 1, 0, 1, 1],
    }
    df = pd.DataFrame(_fallback_columns)
else:
    print(f"Dataset '{data_file_path}' loaded successfully from Colab session storage.")

# Make sure a binary 'Class' target exists. An existing column is kept as is;
# otherwise it is derived from 'Outcome', labelling a row 1 when the outcome
# text contains 'True' and 0 otherwise.
if 'Class' in df.columns:
    print("'Class' column already exists.")
elif 'Outcome' in df.columns:
    df['Class'] = df['Outcome'].apply(lambda outcome: int('True' in str(outcome)))
    print("Created 'Class' column from 'Outcome' column.")
else:
    print("Neither 'Outcome' nor 'Class' column found. Falling back to dummy data assumptions.")



# Build the modelling table: pick the columns of interest, impute missing
# payload masses, one-hot encode the categorical columns, then split into the
# feature matrix X and the target vector y.

# .copy() gives an independent frame, so the imputation below never writes
# to a slice of `df` (the cause of SettingWithCopyWarning).
features = df[['PayloadMass', 'Orbit', 'LaunchSite', 'GridFins', 'Legs', 'Reused', 'Class']].copy()

# Assign the filled column back instead of calling fillna(inplace=True) on
# features['PayloadMass']: that chained in-place form is deprecated and
# becomes a silent no-op under pandas 3.0 copy-on-write.
# NOTE: the mean is computed over every row, i.e. before the train/test split.
features['PayloadMass'] = features['PayloadMass'].fillna(features['PayloadMass'].mean())

# drop_first=True omits one indicator per categorical column.
features_one_hot = pd.get_dummies(features, columns=['Orbit', 'LaunchSite'], drop_first=True)

X = features_one_hot.drop('Class', axis=1)
y = features_one_hot['Class']

# Summarise the prepared data: shapes of X and y, then a preview of X.
print(
    "\nData preparation complete.",
    f"Final prepared features shape (X): {X.shape}",
    f"Target variable shape (y): {y.shape}",
    "\nFirst 5 rows of the prepared features (X) for modeling:",
    X.head(),
    sep="\n",
)


# Hold out 20% of the rows for testing; stratify on y so both partitions keep
# the same class proportions, and fix the seed for repeatable splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit a 50-tree random forest with a fixed seed (fit() returns the estimator).
model = RandomForestClassifier(n_estimators=50, random_state=42).fit(X_train, y_train)
print("Random Forest Model training complete.")

# Score the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Preliminary Test Accuracy: {accuracy:.4f}")

# Closing banner printed once the pipeline has run end to end.
print(
    "\n--- Miniature Model Execution Complete ---",
    "This output confirms successful data loading, essential preprocessing, and initial model training.",
    "This serves as a solid foundation. Further steps in the full MSc project will include extensive "
    "feature engineering, exploring various model optimizations, and deeper analytical insights.",
    sep="\n",
)

Dataset 'spacex.csv' loaded successfully from Colab session storage.
Created 'Class' column from 'Outcome' column.

Data preparation complete.
Final prepared features shape (X): (90, 16)
Target variable shape (y): (90,)

First 5 rows of the prepared features (X) for modeling:
   PayloadMass  GridFins   Legs  Reused  Orbit_GEO  Orbit_GTO  Orbit_HEO  \
0  6123.547647     False  False   False      False      False      False   
1   525.000000     False  False   False      False      False      False   
2   677.000000     False  False   False      False      False      False   
3   500.000000     False  False   False      False      False      False   
4  3170.000000     False  False   False      False       True      False   

   Orbit_ISS  Orbit_LEO  Orbit_MEO  Orbit_PO  Orbit_SO  Orbit_SSO  Orbit_VLEO  \
0      False       True      False     False     False      False       False   
1      False       True      False     False     False      False       False   
2       True      False

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  features['PayloadMass'].fillna(features['PayloadMass'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['PayloadMass'].fillna(features['PayloadMass'].mean(), inplace=True)
