# **Predictive Modeling for Rocket Landing Success**  
### *A Machine Learning Approach Using SpaceX Falcon 9 Data* 

## **Data Wrangling**

### Import Libraries

In [1]:
import pandas as pd 
import numpy as np

### Load the SpaceX dataset data_falcon9.csv

In [2]:
# Define the file path
file_path = r'C:\Users\cjchavarria\Desktop\Rocket-Landing-Success-ML\01-data-collection-using-SpaceX-API\data_falcon9.csv'

In [3]:
try:
    # Load the dataset into a DataFrame with optimized memory usage
    df = pd.read_csv(file_path)
    print("Dataset successfully loaded!")
except FileNotFoundError:
    print(f"Error: The file at {file_path} was not found.")
except pd.errors.EmptyDataError:
    print("Error: The file is empty.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


Dataset successfully loaded!


In [4]:
df.head(5)

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial,Longitude,Latitude
0,1,2010-06-04,Falcon 9,6123.547647,LEO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0003,-80.577366,28.561857
1,2,2012-05-22,Falcon 9,525.0,LEO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0005,-80.577366,28.561857
2,3,2013-03-01,Falcon 9,677.0,ISS,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0007,-80.577366,28.561857
3,4,2013-09-29,Falcon 9,500.0,PO,VAFB SLC 4E,False Ocean,1,False,False,False,,1.0,0,B1003,-120.610829,34.632093
4,5,2013-12-03,Falcon 9,3170.0,GTO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B1004,-80.577366,28.561857


### Calculate and display the percentage of missing values per attribute

In [5]:
def calculate_missing_percentage(df):
    #calculate the percentage of missing values
    missing_percentage = (df.isnull().sum() / len(df)) * 100

    # create a DataFrame for better readability
    missing_summary = missing_percentage.reset_index()
    missing_summary.columns = ['Attribute', 'MissingPercentage']

    # filter attributes with missing values and sort them in descending order
    missing_summary = missing_summary[missing_summary["MissingPercentage"] >0]
    missing_summary = missing_summary.sort_values(by = 'MissingPercentage', ascending = False).reset_index(drop = True)

    return missing_summary

# call the function and display the result
missing_data = calculate_missing_percentage(df)
if not missing_data.empty:
    print(missing_data)
else:
    print("No missing values found in the DataFrame")

    Attribute  MissingPercentage
0  LandingPad          28.888889


### Identify numerical and categorical columns

In [11]:
def classify_columns(df):
    # Separate columns based on their data types
    numerical_columns = df.select_dtypes(include=['number']).columns.tolist()
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # Create a DataFrame for numerical columns with their data types
    numerical_df = pd.DataFrame({
        'Column Name': numerical_columns,
        'Data Type': [df[col].dtype for col in numerical_columns]
    })

    # Create a DataFrame for categorical columns with their data types
    categorical_df = pd.DataFrame({
        'Column Name': categorical_columns,
        'Data Type': [df[col].dtype for col in categorical_columns]
    })

    return numerical_df, categorical_df

# Call the function and display the results
numerical_df, categorical_df = classify_columns(df)

print("Numerical DataFrame: ")
print(numerical_df)
print("\nCategorical DataFrame: ")
print(categorical_df)


Numerical DataFrame: 
    Column Name Data Type
0  FlightNumber     int64
1   PayloadMass   float64
2       Flights     int64
3         Block   float64
4   ReusedCount     int64
5     Longitude   float64
6      Latitude   float64

Categorical DataFrame: 
      Column Name Data Type
0            Date    object
1  BoosterVersion    object
2           Orbit    object
3      LaunchSite    object
4         Outcome    object
5      LandingPad    object
6          Serial    object


### Calculate the number of launches on each site
The dataset includes multiple SpaceX launch sites, specifically:

- **`CCAFS LC-40`: Cape Canaveral Space Launch Complex 40**
- **`VAFB SLC 4E`: Vandenberg Air Force Base Space Launch Complex 4E**
- **`KSC LC-39A`: Kennedy Space Center Launch Complex 39A**

Each launch's site is recorded in the `LaunchSite` column.

In [16]:
df['LaunchSite'].value_counts().to_frame().reset_index()

Unnamed: 0,LaunchSite,count
0,CCSFS SLC 40,55
1,KSC LC 39A,22
2,VAFB SLC 4E,13
