In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
dataset = pd.read_excel('RocketLaunchDataCompleted.xlsx')

## Data Cleaning

In [3]:
dataset.drop(['Name', 'Date', 'Time (East Coast)',
             'Location', 'Hist Ave Max Wind Speed', 'Hist Ave Visibility',
             'Hist Ave Sea Level Pressure', 'Day Length',
             'Notes', 'Sea Level Pressure'], axis=1, inplace=True)

In [4]:
dataset

Unnamed: 0,Crewed or Uncrewed,Launched?,High Temp,Low Temp,Ave Temp,Temp at Launch Time,Hist High Temp,Hist Low Temp,Hist Ave Temp,Percipitation at Launch Time,Hist Ave Percipitation,Wind Direction,Max Wind Speed,Visibility,Wind Speed at Launch Time,Condition
0,,,75.0,68.0,71.00,,75.0,55.0,65.0,0.00,0.08,E,16.0,15.0,,Cloudy
1,,,78.0,70.0,73.39,,75.0,55.0,65.0,0.00,0.09,E,14.0,10.0,,Cloudy
2,Uncrewed,Y,73.0,0.0,60.21,62.0,75.0,55.0,65.0,0.00,0.09,NE,15.0,10.0,11.0,Cloudy
3,,,76.0,57.0,66.04,,75.0,55.0,65.0,0.00,0.08,N,10.0,10.0,,Partly Cloudy
4,,,79.0,60.0,70.52,,75.0,55.0,65.0,0.00,0.09,E,12.0,10.0,,Partly Cloudy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,,,87.0,71.0,79.36,,87.0,70.0,79.0,0.00,0.15,S,20.0,10.0,,Thunder
296,,,86.0,72.0,79.50,,88.0,70.0,79.0,0.01,0.16,SE,18.0,10.0,,Fair
297,Crewed,Y,87.0,75.0,79.69,80.0,88.0,70.0,79.0,0.00,0.16,SE,16.0,10.0,7.0,Fair
298,,,87.0,72.0,79.70,,88.0,70.0,79.0,0.46,0.16,E,13.0,10.0,,Cloudy


In [5]:
# filling NAN values with Uncrewed
set(dataset['Crewed or Uncrewed'])

{'Crewed', 'Uncrewed', nan}

In [6]:
# replacing NAN with uncrewed
dataset['Crewed or Uncrewed'].fillna('Uncrewed', inplace=True)

In [7]:
# replacing NAN with N i.e. unsuccessfull
dataset['Launched?'].fillna('N', inplace=True)

In [8]:
dataset['Temp at Launch Time'].isnull().sum()

241

In [9]:
dataset['Temp at Launch Time'].describe()

count    59.000000
mean     75.101695
std      10.471134
min      50.000000
25%      70.000000
50%      77.000000
75%      81.500000
max      98.000000
Name: Temp at Launch Time, dtype: float64

In [10]:
dataset['Temp at Launch Time'].fillna(dataset['Temp at Launch Time'].mean(), inplace=True)

In [11]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Crewed or Uncrewed            300 non-null    object 
 1   Launched?                     300 non-null    object 
 2   High Temp                     299 non-null    float64
 3   Low Temp                      299 non-null    float64
 4   Ave Temp                      299 non-null    float64
 5   Temp at Launch Time           300 non-null    float64
 6   Hist High Temp                299 non-null    float64
 7   Hist Low Temp                 299 non-null    float64
 8   Hist Ave Temp                 299 non-null    float64
 9   Percipitation at Launch Time  299 non-null    float64
 10  Hist Ave Percipitation        299 non-null    float64
 11  Wind Direction                299 non-null    object 
 12  Max Wind Speed                299 non-null    float64
 13  Visib

In [12]:
# we notice that one entry is completely useless for us
dataset[dataset['High Temp'].isnull()]

Unnamed: 0,Crewed or Uncrewed,Launched?,High Temp,Low Temp,Ave Temp,Temp at Launch Time,Hist High Temp,Hist Low Temp,Hist Ave Temp,Percipitation at Launch Time,Hist Ave Percipitation,Wind Direction,Max Wind Speed,Visibility,Wind Speed at Launch Time,Condition
142,Uncrewed,Y,,,,75.101695,,,,,,,,,,


In [13]:
dataset.drop(142, inplace=True)

In [14]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 299 entries, 0 to 299
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Crewed or Uncrewed            299 non-null    object 
 1   Launched?                     299 non-null    object 
 2   High Temp                     299 non-null    float64
 3   Low Temp                      299 non-null    float64
 4   Ave Temp                      299 non-null    float64
 5   Temp at Launch Time           299 non-null    float64
 6   Hist High Temp                299 non-null    float64
 7   Hist Low Temp                 299 non-null    float64
 8   Hist Ave Temp                 299 non-null    float64
 9   Percipitation at Launch Time  299 non-null    float64
 10  Hist Ave Percipitation        299 non-null    float64
 11  Wind Direction                299 non-null    object 
 12  Max Wind Speed                299 non-null    float64
 13  Visib

In [15]:
dataset['Wind Speed at Launch Time'].isna().sum()

240

In [16]:
# deciding the approx vlue to fill the NULL entries
set(dataset['Wind Speed at Launch Time'])

{nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 4.0,
 5.0,
 9.0,
 11.0,
 12.0,
 nan,
 nan,
 nan,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 19.0,
 25.0,
 26.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 2.0,
 6.0,
 7.0,
 8.0,
 10.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,

In [17]:
dataset['Wind Speed at Launch Time'].mean()

10.59322033898305

In [18]:
dataset['Wind Speed at Launch Time'].fillna(dataset['Wind Speed at Launch Time'].mean(), inplace=True)

In [19]:
dataset[dataset['Condition'].isna()]

Unnamed: 0,Crewed or Uncrewed,Launched?,High Temp,Low Temp,Ave Temp,Temp at Launch Time,Hist High Temp,Hist Low Temp,Hist Ave Temp,Percipitation at Launch Time,Hist Ave Percipitation,Wind Direction,Max Wind Speed,Visibility,Wind Speed at Launch Time,Condition
78,Uncrewed,N,89.0,0.0,75.21,75.101695,89.0,73.0,80.0,0.0,0.24,E,17.0,10.0,10.59322,


In [20]:
dataset['Condition'].fillna('Fair', inplace=True)

### Completely Clean data

In [21]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 299 entries, 0 to 299
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Crewed or Uncrewed            299 non-null    object 
 1   Launched?                     299 non-null    object 
 2   High Temp                     299 non-null    float64
 3   Low Temp                      299 non-null    float64
 4   Ave Temp                      299 non-null    float64
 5   Temp at Launch Time           299 non-null    float64
 6   Hist High Temp                299 non-null    float64
 7   Hist Low Temp                 299 non-null    float64
 8   Hist Ave Temp                 299 non-null    float64
 9   Percipitation at Launch Time  299 non-null    float64
 10  Hist Ave Percipitation        299 non-null    float64
 11  Wind Direction                299 non-null    object 
 12  Max Wind Speed                299 non-null    float64
 13  Visib

In [22]:
dataset.head()

Unnamed: 0,Crewed or Uncrewed,Launched?,High Temp,Low Temp,Ave Temp,Temp at Launch Time,Hist High Temp,Hist Low Temp,Hist Ave Temp,Percipitation at Launch Time,Hist Ave Percipitation,Wind Direction,Max Wind Speed,Visibility,Wind Speed at Launch Time,Condition
0,Uncrewed,N,75.0,68.0,71.0,75.101695,75.0,55.0,65.0,0.0,0.08,E,16.0,15.0,10.59322,Cloudy
1,Uncrewed,N,78.0,70.0,73.39,75.101695,75.0,55.0,65.0,0.0,0.09,E,14.0,10.0,10.59322,Cloudy
2,Uncrewed,Y,73.0,0.0,60.21,62.0,75.0,55.0,65.0,0.0,0.09,NE,15.0,10.0,11.0,Cloudy
3,Uncrewed,N,76.0,57.0,66.04,75.101695,75.0,55.0,65.0,0.0,0.08,N,10.0,10.0,10.59322,Partly Cloudy
4,Uncrewed,N,79.0,60.0,70.52,75.101695,75.0,55.0,65.0,0.0,0.09,E,12.0,10.0,10.59322,Partly Cloudy


### Converting String into Numerical Values

In [23]:
# Crewed or Uncrewed,
# Wind Direction,
# Condition.

# need to change all this into numerical data

label_encoder_crew_status = LabelEncoder()
label_encoder_wind_dir = LabelEncoder()
label_encoder_condition = LabelEncoder()
label_encoder_launched = LabelEncoder()

dataset['Crewed or Uncrewed'] = label_encoder_crew_status.fit_transform(dataset['Crewed or Uncrewed'])
dataset['Wind Direction'] = label_encoder_wind_dir.fit_transform(dataset['Wind Direction'])
dataset['Condition'] = label_encoder_condition.fit_transform(dataset['Condition'])
dataset['Launched?'] = label_encoder_launched.fit_transform(dataset['Launched?'])

In [24]:
label_encoder_crew_status.classes_

array(['Crewed', 'Uncrewed'], dtype=object)

In [25]:
label_encoder_wind_dir.classes_

array(['E', 'N', 'NE', 'NW', 'S', 'SE', 'SW', 'W'], dtype=object)

In [26]:
label_encoder_condition.classes_

array(['Cloudy', 'Fair', 'Heavy T-Storm', 'Light Rain', 'Mostly Cloudy',
       'Partly Cloudly', 'Partly Cloudy', 'Rain', 'T-Storm', 'Thunder',
       'Windy'], dtype=object)

In [27]:
label_encoder_launched.classes_

array(['N', 'Y'], dtype=object)

In [28]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 299 entries, 0 to 299
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Crewed or Uncrewed            299 non-null    int64  
 1   Launched?                     299 non-null    int64  
 2   High Temp                     299 non-null    float64
 3   Low Temp                      299 non-null    float64
 4   Ave Temp                      299 non-null    float64
 5   Temp at Launch Time           299 non-null    float64
 6   Hist High Temp                299 non-null    float64
 7   Hist Low Temp                 299 non-null    float64
 8   Hist Ave Temp                 299 non-null    float64
 9   Percipitation at Launch Time  299 non-null    float64
 10  Hist Ave Percipitation        299 non-null    float64
 11  Wind Direction                299 non-null    int64  
 12  Max Wind Speed                299 non-null    float64
 13  Visib

In [29]:
dataset.head()

Unnamed: 0,Crewed or Uncrewed,Launched?,High Temp,Low Temp,Ave Temp,Temp at Launch Time,Hist High Temp,Hist Low Temp,Hist Ave Temp,Percipitation at Launch Time,Hist Ave Percipitation,Wind Direction,Max Wind Speed,Visibility,Wind Speed at Launch Time,Condition
0,1,0,75.0,68.0,71.0,75.101695,75.0,55.0,65.0,0.0,0.08,0,16.0,15.0,10.59322,0
1,1,0,78.0,70.0,73.39,75.101695,75.0,55.0,65.0,0.0,0.09,0,14.0,10.0,10.59322,0
2,1,1,73.0,0.0,60.21,62.0,75.0,55.0,65.0,0.0,0.09,2,15.0,10.0,11.0,0
3,1,0,76.0,57.0,66.04,75.101695,75.0,55.0,65.0,0.0,0.08,1,10.0,10.0,10.59322,6
4,1,0,79.0,60.0,70.52,75.101695,75.0,55.0,65.0,0.0,0.09,0,12.0,10.0,10.59322,6


### Train and Test split for our data

In [30]:
Y = dataset.iloc[:, 1]
Y.shape

(299,)

In [31]:
dataset.drop('Launched?', axis=1, inplace=True)

In [32]:
X = dataset
X.shape

(299, 15)

In [33]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=0)

In [34]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((239, 15), (60, 15), (239,), (60,))

## Training the Model

### Random Forest

In [35]:
clf = RandomForestClassifier()
parameters = {'n_estimators':[200, 400, 600],
             'max_depth': [30, 40, 50, 60, 70, 80]}

RandomForest = GridSearchCV(clf, parameters)
RandomForest.fit(x_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [30, 40, 50, 60, 70, 80],
                         'n_estimators': [200, 400, 600]})

In [36]:
RandomForest.best_score_

0.9957446808510639

In [37]:
RandomForest.best_params_

{'max_depth': 30, 'n_estimators': 200}

In [38]:
clf = RandomForestClassifier(max_depth= 30, n_estimators= 200)

In [39]:
clf.fit(x_train, y_train)

RandomForestClassifier(max_depth=30, n_estimators=200)

In [40]:
y_pred = clf.predict(x_test)

In [41]:
accuracy_score(y_test, y_pred)

0.9833333333333333

In [42]:
confusion_matrix(y_test, y_pred)

array([[50,  1],
       [ 0,  9]])

In [43]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        51
           1       0.90      1.00      0.95         9

    accuracy                           0.98        60
   macro avg       0.95      0.99      0.97        60
weighted avg       0.98      0.98      0.98        60



### KNN

In [44]:
from sklearn.neighbors import KNeighborsClassifier

In [45]:
clf1 = KNeighborsClassifier()
parameters = {'n_neighbors':[5, 7, 9]}

knn = GridSearchCV(clf1, parameters)
knn.fit(x_train, y_train)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [5, 7, 9]})

In [46]:
knn.best_params_

{'n_neighbors': 5}

In [47]:
clf_knn = KNeighborsClassifier(n_neighbors=5)
clf_knn.fit(x_train, y_train)

KNeighborsClassifier()

In [48]:
y_pred_knn = clf_knn.predict(x_test)

In [49]:
accuracy_score(y_test, y_pred_knn)

0.9

In [50]:
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94        51
           1       1.00      0.33      0.50         9

    accuracy                           0.90        60
   macro avg       0.95      0.67      0.72        60
weighted avg       0.91      0.90      0.88        60

