# Create an ML algorithm to classify the planets as Candidate / False Positive / Confirmed, etc., based on the column “koi_disposition”.


In [93]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

# Load the Kepler objects-of-interest table from CSV
kepler_data = pd.read_csv("kepler_data-2.csv")

print("Columns present in the dataframe:")
print(kepler_data.columns)

# Drop the columns that are not used as features. Reassigning instead of
# using inplace=True keeps the data lineage explicit.
kepler_data = kepler_data.drop(columns=['kepoi_name', 'koi_teq_err1', 'koi_teq_err2'])

# Separate the features from the target label
X = kepler_data.drop(columns=['koi_disposition'])
y = kepler_data['koi_disposition']
# NOTE(review): 'kepler_name', 'koi_pdisposition' and 'koi_score' remain in X;
# they look closely tied to the label - confirm they are not leaking the target.

# Split into train/test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: only float64 and object columns are selected, so any
# integer-typed columns are left out of the feature matrix.
numeric_columns = X_train.select_dtypes(include=['float64']).columns
categorical_columns = X_train.select_dtypes(include=['object']).columns

# Numeric features: fill gaps with the column mean, then standardise
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
# Fit on the training data only, then apply the fitted transform to the test data
X_train_numeric = numeric_transformer.fit_transform(X_train[numeric_columns])
X_test_numeric = numeric_transformer.transform(X_test[numeric_columns])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are encoded as all zeros.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

X_train_categorical = categorical_transformer.fit_transform(X_train[categorical_columns])
X_test_categorical = categorical_transformer.transform(X_test[categorical_columns])

# The encoder returns sparse matrices; densify them so they can be stacked
# column-wise next to the numeric block.
X_train_processed = np.hstack((X_train_numeric, X_train_categorical.toarray()))
X_test_processed = np.hstack((X_test_numeric, X_test_categorical.toarray()))

# Define the model
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train_processed, y_train)

# Predict on the held-out test set
y_pred = model.predict(X_test_processed)

# Accuracy evaluation
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Columns present in the dataframe:
Index(['kepid', 'kepoi_name', 'kepler_name', 'koi_disposition',
       'koi_pdisposition', 'koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss',
       'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_period_err1',
       'koi_period_err2', 'koi_time0bk', 'koi_time0bk_err1',
       'koi_time0bk_err2', 'koi_impact', 'koi_impact_err1', 'koi_impact_err2',
       'koi_duration', 'koi_duration_err1', 'koi_duration_err2', 'koi_depth',
       'koi_depth_err1', 'koi_depth_err2', 'koi_prad', 'koi_prad_err1',
       'koi_prad_err2', 'koi_teq', 'koi_teq_err1', 'koi_teq_err2', 'koi_insol',
       'koi_insol_err1', 'koi_insol_err2', 'koi_model_snr', 'koi_tce_plnt_num',
       'koi_tce_delivname', 'koi_steff', 'koi_steff_err1', 'koi_steff_err2',
       'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2', 'koi_srad',
       'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec', 'koi_kepmag'],
      dtype='object')
Accuracy: 0.9843178254051228


In [95]:
# Check unique values in 'koi_pdisposition'.
# The dataframe loaded in this notebook is named `kepler_data`; `kepler_df`
# is never defined and raises NameError on a fresh kernel.
print(kepler_data['koi_pdisposition'].unique())


['CANDIDATE' 'FALSE POSITIVE']


In [40]:
# Show the distinct values found in every column of the dataframe
for col_name, col_series in kepler_data.items():
    distinct = col_series.unique()
    print(f"Unique values for column '{col_name}': {distinct}")


Unique values for column 'kepid': [10797460 10811496 10848459 ... 10147276 10155286 10156110]
Unique values for column 'kepoi_name': ['K00752.01' 'K00752.02' 'K00753.01' ... 'K07987.01' 'K07988.01'
 'K07989.01']
Unique values for column 'koi_disposition': [ 1  0 -1]
Unique values for column 'koi_pdisposition': ['CANDIDATE' 'FALSE POSITIVE']
Unique values for column 'koi_score': [1.        0.969     0.        0.992     0.811     0.998     0.98
 0.971     0.4808294 0.978     0.014     0.999     0.993     0.871
 0.773     0.989     0.952     0.994     0.053     0.95      0.745
 0.99      0.995     0.037     0.006     0.996     0.997     0.878
 0.876     0.985     0.942     0.912     0.974     0.959     0.987
 0.983     0.573     0.986     0.635     0.966     0.415     0.228
 0.957     0.92      0.973     0.975     0.953     0.704     0.949
 0.965     0.632     0.752     0.945     0.765     0.815     0.931
 0.711     0.881     0.758     0.001     0.848     0.884     0.557
 0.008     0.976 

In [66]:
# List the dataframe's column labels under a heading line
print("Columns present in the dataframe:", kepler_data.columns, sep="\n")


Columns present in the dataframe:
Index(['kepid', 'kepoi_name', 'koi_disposition', 'koi_pdisposition',
       'koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_tce_delivname', 'koi_steff',
       'koi_steff_err1', 'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1',
       'koi_slogg_err2', 'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra',
       'dec', 'koi_kepmag'],
      dtype='object')


# 1. Why did you choose this particular algorithm?


# 2. What are the different tuning methods used for the algorithm?


# 3. Did you consider any other choice of algorithm? Why or why not?


In [91]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Step 1: Load the data
csv_file_path = 'kepler_data-2.csv'
data = pd.read_csv(csv_file_path)

# Stop early on an empty table. Raising (instead of calling exit()) keeps the
# notebook kernel alive and surfaces the problem in the cell output.
if data.empty:
    raise ValueError("Error: No samples left after preprocessing.")

# 'kepoi_name' is dropped as an unnecessary column in the Random Forest cell
# as well; one-hot encoding it would add one dummy column per distinct name.
data = data.drop(columns=['kepoi_name'])

# Missing values: numeric columns get the sentinel -1 and text columns get the
# placeholder category 'None'. Filling each kind directly avoids pushing
# numeric columns through object dtype and converting them back afterwards.
numeric_cols = data.select_dtypes(include='number').columns
data[numeric_cols] = data[numeric_cols].fillna(-1)
data = data.fillna('None')

# Encode the target as integer class ids (LabelEncoder numbers the classes in
# sorted order of the original labels; see label_encoder.classes_).
label_encoder = LabelEncoder()
data['koi_disposition'] = label_encoder.fit_transform(data['koi_disposition'])

# One-hot encode the categorical features in a single call; each dummy column
# is prefixed with its source column name.
# NOTE(review): 'kepler_name' and 'koi_pdisposition' look closely tied to the
# label, so the near-perfect scores may reflect leakage - confirm.
categorical_cols = ['kepler_name', 'koi_pdisposition', 'koi_tce_delivname']
data = pd.get_dummies(data, columns=categorical_cols)

# Step 2: Separate features and target
X = data.drop(columns=['koi_disposition'])
y = data['koi_disposition']

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Model selection and training.
# A fixed random_state makes the fitted tree (and the reported metrics)
# reproducible, matching the seed used for the split.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 5: Model evaluation
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Calculate and print accuracy percentage
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Accuracy percentage: {:.2f}%".format(accuracy * 100))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       484
           1       1.00      1.00      1.00       490
           2       1.00      1.00      1.00       939

    accuracy                           1.00      1913
   macro avg       1.00      1.00      1.00      1913
weighted avg       1.00      1.00      1.00      1913

Accuracy: 0.9989545216936748
Accuracy percentage: 99.90%


# 4. What is the accuracy?

# 5. What are the different types of metrics that can be used to evaluate the model?