In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsClassifier,NearestNeighbors



In [2]:
# Load the data
companies=pd.read_csv('Companies.csv')
config=pd.read_csv('Configs.csv')
families=pd.read_csv('Families.csv')
Launch_SFR=pd.read_csv('Launch SFR.csv')
Launches=pd.read_csv('Launches.csv')
Missons=pd.read_csv('Missions.csv')
Locations=pd.read_csv('Locations.csv')

### Understanding the Data

In [3]:
companies.isna().sum()
config.isna().sum()
# families.isna().sum()
# Launch_SFR.isna().sum()
# Launches.isna().sum()
# Missons.isna().sum()

Family Id             0
No                    0
Config                0
Status                0
Price               353
Liftoff Thrust      127
Payload to LEO      120
Payload to GTO      193
Stages                4
Strap-ons            17
Rocket Height        92
Fairing Diameter    176
Fairing Height      227
dtype: int64

In [4]:
config['Family Id'] = config['Family Id'].astype(int)
rockets = pd.merge(config, families, how = 'inner', on = 'Family Id')

all_files = pd.merge(Launches, Locations, how = 'inner', left_on = 'Location', right_on = 'Orig_Addr')
all_files = pd.merge(all_files, rockets, how = 'inner', right_on = 'Config', left_on = 'Rocket Name')
all_files = pd.merge(all_files, companies, how = 'inner', left_on = 'Rocket Organisation', right_on = 'Company Name')
#all_files = pd.merge(all_files, missions, how = 'inner', on = 'Launch Id')
all_files


Unnamed: 0,Launch Id,Launch Time,Launch Status,Launch Suborbital,Rocket Name,Rocket Organisation,Rocket Price,Rocket Payload to LEO,Location,Launch Year,...,Family,Missions,Successes,Partial Failures,Failures,Success Streak,Success Rate,Company Name,Company Country,Ownership
0,3,2018-05-22 19:47:00+00:00,Success,Orbital,Falcon 9 Block 4,SpaceX,62.0,22800.0,"SLC-4E, Vandenberg SFB, California, USA",2018,...,Falcon 9,138.0,135.0,1.0,2.0,109.0,98.2%,SpaceX,USA,Private
1,51,2017-10-09 12:37:00+00:00,Success,Orbital,Falcon 9 Block 4,SpaceX,62.0,22800.0,"SLC-4E, Vandenberg SFB, California, USA",2017,...,Falcon 9,138.0,135.0,1.0,2.0,109.0,98.2%,SpaceX,USA,Private
2,60,2018-03-30 14:13:00+00:00,Success,Orbital,Falcon 9 Block 4,SpaceX,62.0,22800.0,"SLC-4E, Vandenberg SFB, California, USA",2018,...,Falcon 9,138.0,135.0,1.0,2.0,109.0,98.2%,SpaceX,USA,Private
3,4,2018-06-04 04:45:00+00:00,Success,Orbital,Falcon 9 Block 4,SpaceX,62.0,22800.0,"SLC-40, Cape Canaveral SFS, Florida, USA",2018,...,Falcon 9,138.0,135.0,1.0,2.0,109.0,98.2%,SpaceX,USA,Private
4,6,2018-06-29 09:42:00+00:00,Success,Orbital,Falcon 9 Block 4,SpaceX,62.0,22800.0,"SLC-40, Cape Canaveral SFS, Florida, USA",2018,...,Falcon 9,138.0,135.0,1.0,2.0,109.0,98.2%,SpaceX,USA,Private
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6163,4167,1974-02-18 10:05:00+00:00,Success,Orbital,Scout D1,ASI,,185.0,"SM Launch Tab, San Marco Launch Platform, Kenya",1974,...,Scout,99.0,86.0,3.0,10.0,48.0,88.4%,ASI,Italy,State
6164,4168,1974-10-15 07:47:00+00:00,Success,Orbital,Scout B1,ASI,,129.0,"SM Launch Tab, San Marco Launch Platform, Kenya",1974,...,Scout,99.0,86.0,3.0,10.0,48.0,88.4%,ASI,Italy,State
6165,4169,1975-05-07 22:45:00+00:00,Success,Orbital,Scout F1,ASI,,193.0,"SM Launch Tab, San Marco Launch Platform, Kenya",1975,...,Scout,99.0,86.0,3.0,10.0,48.0,88.4%,ASI,Italy,State
6166,4170,1988-03-25 19:50:00+00:00,Success,Orbital,Scout G1,ASI,,210.0,"SM Launch Tab, San Marco Launch Platform, Kenya",1988,...,Scout,99.0,86.0,3.0,10.0,48.0,88.4%,ASI,Italy,State


In [5]:
all_files['Launch Status'] = all_files['Launch Status'].replace('Success', 1)
all_files['Launch Status'] = all_files['Launch Status'].replace('Failure', 0)
all_files['Launch Status'] = all_files['Launch Status'].replace('Partial Failure', 0)
all_files['Launch Status'] = all_files['Launch Status'].replace('Prelaunch Failure', 0)

d1=all_files["Launch Time"]

time=pd.to_datetime(d1).dt.year

In [6]:
# Step 1: Handle Date/Time Columns
all_files['Launch Time'] = pd.to_datetime(d1).dt.year
all_files['Launch Year'] = pd.to_datetime(d1).dt.year
all_files['Launch Month'] = pd.to_datetime(d1).dt.month
all_files['Launch Day'] = pd.to_datetime(d1).dt.day

# Fill missing date/time information using forward fill
all_files['Launch Year'].fillna(method='ffill', inplace=True)
all_files['Launch Month'].fillna(method='ffill', inplace=True)
all_files['Launch Day'].fillna(method='ffill', inplace=True)

# Drop original 'Launch Time' if it's no longer needed
all_files.drop(columns=['Launch Time'], inplace=True)

# Step 2: Handle Categorical Data
# Handle categorical data
categorical_columns = ['Launch Status', 'Rocket Name', 'Rocket Organisation', 'Location', 
                       'Orig_Addr', 'Country', 'Country_Code', 'Operator', 'Launch Site', 
                       'Comb Launch Site', 'Family', 'Company Name', 'Company Country', 'Ownership', 
                       'Launch Year Mon', 'Config', 'Status', 'Price', 'Liftoff Thrust', 
                       'Payload to LEO', 'Payload to GTO', 'Rocket Height', 'Fairing Diameter', 
                       'Fairing Height', 'Success Rate','Launch']

# label_encoders = {}
# for column in categorical_columns:
#     if column in all_files.columns:
#         all_files[column] = all_files[column].astype(str)  # Convert to string for LabelEncoder
#         label_encoders[column] = LabelEncoder()
#         all_files[column] = label_encoders[column].fit_transform(all_files[column])

# Separate numerical and categorical data
numerical_data = all_files.select_dtypes(include=[np.number])
categorical_data = all_files.select_dtypes(include=[object])

# Impute categorical data with SimpleImputer
categorical_imputer = SimpleImputer(strategy='most_frequent')
categorical_data_imputed = pd.DataFrame(categorical_imputer.fit_transform(categorical_data), columns=categorical_data.columns)

# Impute numerical data with SimpleImputer
numerical_imputer = SimpleImputer(strategy='mean')
numerical_data_imputed = pd.DataFrame(numerical_imputer.fit_transform(numerical_data), columns=numerical_data.columns)

# Scale the numerical data
scaler = MinMaxScaler()  # Choose your scaler here
numerical_data_scaled = pd.DataFrame(scaler.fit_transform(numerical_data_imputed), columns=numerical_data_imputed.columns)

# Combine the imputed and scaled numerical data with categorical data
df_imputed = pd.concat([numerical_data_scaled, categorical_data_imputed], axis=1)

print(df_imputed.isna().sum())
df_imputed.dtypes


Launch Id                     0
Launch Status                 0
Rocket Price                  0
Rocket Payload to LEO         0
Launch Year                   0
USD/kg to LEO                 0
2021 Mult                     0
USD/kg to LEO CPI Adjusted    0
Rocket Price CPI Adjusted     0
Dum                           0
Lat                           0
Lon                           0
Launch Site Lat               0
Launch Site Lon               0
Comb Launch Site Lat          0
Comb Launch Site Lon          0
Operator Lat                  0
Operator Lon                  0
Family Id                     0
No                            0
Stages                        0
Strap-ons                     0
Missions                      0
Successes                     0
Partial Failures              0
Failures                      0
Success Streak                0
Launch Month                  0
Launch Day                    0
Launch Suborbital             0
Rocket Name                   0
Rocket O

Launch Id                     float64
Launch Status                 float64
Rocket Price                  float64
Rocket Payload to LEO         float64
Launch Year                   float64
USD/kg to LEO                 float64
2021 Mult                     float64
USD/kg to LEO CPI Adjusted    float64
Rocket Price CPI Adjusted     float64
Dum                           float64
Lat                           float64
Lon                           float64
Launch Site Lat               float64
Launch Site Lon               float64
Comb Launch Site Lat          float64
Comb Launch Site Lon          float64
Operator Lat                  float64
Operator Lon                  float64
Family Id                     float64
No                            float64
Stages                        float64
Strap-ons                     float64
Missions                      float64
Successes                     float64
Partial Failures              float64
Failures                      float64
Success Stre

In [7]:
# df_imputed.drop(columns=['Launch Id'], inplace=True)
# df_imputed.drop(columns=['Launch Year Mon','Launch Year','Launch Month','Launch Day','Launch Status'], inplace=True)
# for i in range(0,df_imputed.shape[0]):
#     df_imputed["Launch Id"][i]=i
# df_imputed['Launch Id'].astype(int)

df_imputed.dtypes
df_imputed.to_csv('final_data.csv',index=False)

In [8]:
# Step 1: Create the User-Item Matrix
user_item_matrix = all_files.pivot_table(
    index=['Rocket Price', 'Payload to LEO', 'Launch Year', 'Operator', 'Location'],
    columns=['Launch Id', 'Rocket Name', 'Country'],
    values='Launch Status',
    aggfunc='mean',  # Aggregating launch status (e.g., mean success rate)
    fill_value=0
)

# Step 2: Train-Test Split
train_data=user_item_matrix


# Flatten the multi-index for fitting the KNN model
train_data_flat = train_data.reset_index().set_index(['Rocket Price', 'Payload to LEO', 'Launch Year', 'Operator', 'Location'])

# Fit the KNN model
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
knn.fit(train_data_flat)


# Find the nearest neighbors for each item in the dataset
distances, indices = knn.kneighbors(train_data_flat, n_neighbors=20)

# Transform distances into similarity scores (1 - distance)
item_similarity = 1 - distances
# print("item_similarity")
# print(item_similarity)


def recommend_for_user(Rocket_Price, Payload_to_LEO, Launch_Year, Operator, Location, train_data, indices, top_n=5):
    # Flatten the multi-index for locating the user
    train_data_flat = train_data.reset_index().set_index(['Rocket Price', 'Payload to LEO', 'Launch Year', 'Operator', 'Location'])
    
    # Get the user's index
    user_idx = train_data_flat.index.get_loc((Rocket_Price, Payload_to_LEO, Launch_Year, Operator, Location))
    
    # Get the indices of the most similar items
    recommended_indices = indices[user_idx][:top_n]
    
    # Get the names of the recommended items
    recommended_items = train_data_flat.index[recommended_indices]
    return recommended_items

Rocket_Price = train_data.index.get_level_values('Rocket Price')[1]
Payload_to_LEO = train_data.index.get_level_values('Payload to LEO')[1]
Launch_Year = train_data.index.get_level_values('Launch Year')[1]
Operator = train_data.index.get_level_values('Operator')[1]
Location = train_data.index.get_level_values('Location')[1]

recommendations = recommend_for_user(Rocket_Price, Payload_to_LEO, Launch_Year, Operator, Location, train_data, indices, top_n=5)

# Display recommendations
# print(f"Top 5 recommendations for user {launch_id}, rocket {rocket_name}, country {country}: {recommendations}")
print(f"Top 5 recommendations for rocket price {Rocket_Price}, Payload to LEO {Payload_to_LEO}, Launch Year {Launch_Year}, Operator {Operator}, Location {Location} : {recommendations}")


Top 5 recommendations for rocket price 2.5, Payload to LEO 204 kg, Launch Year 2021, Operator United States Space Force, Location LP-3B, Pacific Spaceport Complex, Kodiak, Alaska, USA : MultiIndex([(  2.5,   '204 kg', 2021, ...),
            (64.68, '4,200 kg', 2018, ...),
            (64.68, '4,200 kg', 2002, ...),
            (64.68, '4,200 kg', 2003, ...),
            (64.68, '4,200 kg', 2008, ...)],
           names=['Rocket Price', 'Payload to LEO', 'Launch Year', 'Operator', 'Location'])


In [9]:
for i in recommendations:
    for j in i:
        print(j)
    

2.5
204 kg
2021
United States Space Force
LP-3B, Pacific Spaceport Complex, Kodiak, Alaska, USA
64.68
4,200 kg
2018
China National Space Administration
LC-9, Taiyuan Satellite Launch Center, China
64.68
4,200 kg
2002
China National Space Administration
LC-7, Taiyuan Satellite Launch Center, China
64.68
4,200 kg
2003
China National Space Administration
LC-7, Taiyuan Satellite Launch Center, China
64.68
4,200 kg
2008
China National Space Administration
LC-9, Taiyuan Satellite Launch Center, China
