In [30]:
# Pre-processing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.impute import SimpleImputer




In [20]:
# Read the table stored in koi_dataset.csv, then echo its first five rows
# as a quick sanity check of the parse.
df = pd.read_csv(filepath_or_buffer='koi_dataset.csv')
print(df.head(n=5))


   rowid     kepid kepoi_name   kepler_name koi_disposition koi_vet_stat  \
0      1  10797460  K00752.01  Kepler-227 b       CONFIRMED         Done   
1      2  10797460  K00752.02  Kepler-227 c       CONFIRMED         Done   
2      3  10811496  K00753.01           NaN       CANDIDATE         Done   
3      4  10848459  K00754.01           NaN  FALSE POSITIVE         Done   
4      5  10854555  K00755.01  Kepler-664 b       CONFIRMED         Done   

  koi_vet_date koi_pdisposition  koi_score  koi_fpflag_nt  ...  koi_fwm_srao  \
0   2018-08-16        CANDIDATE      1.000              0  ...         0.430   
1   2018-08-16        CANDIDATE      0.969              0  ...        -0.630   
2   2018-08-16        CANDIDATE      0.000              0  ...        -0.021   
3   2018-08-16   FALSE POSITIVE      0.000              0  ...        -0.111   
4   2018-08-16        CANDIDATE      1.000              0  ...        -0.010   

   koi_fwm_sdeco  koi_fwm_prao koi_fwm_pdeco koi_dicco_mra  ko

In [21]:
# Number of missing entries in each column of the loaded frame.
null_counts = df.isna().sum()

# Persist the counts as a tab-separated report: a header row followed by
# one "<column>\t<count>" line per column.
with open('null_values.txt', 'w') as report:
    report.write("Column Name\tNull Count\n")
    report.writelines(
        f"{name}\t{n_missing}\n" for name, n_missing in null_counts.items()
    )


In [22]:
# Columns removed from the frame before any further processing.
columns_to_drop = [
    "kepler_name",
    "koi_comment",
    "koi_longp",
    "koi_model_dof",
    "koi_model_chisq",
    "koi_sage",
    "koi_ingress",
    "kepoi_name",
    "koi_vet_date",
    "koi_limbdark_mod",
    "koi_parm_prov",
    "koi_tce_delivname",
    "koi_sparprov",
    "koi_datalink_dvr",
    "koi_datalink_dvs",
    "koi_quarters",
    "koi_trans_mod",
]

# Rebind instead of mutating in place, and skip columns that are already gone,
# so re-running this cell is a no-op rather than a KeyError.
df = df.drop(columns=columns_to_drop, errors="ignore")


In [23]:
# Labels: the disposition column on its own.
y = df['koi_disposition']
# Features: every other column of the frame.
X = df.drop('koi_disposition', axis=1)

In [24]:
# dtype of every column in the frame.
types_data = df.dtypes

# Persist the dtypes as a tab-separated report: a header row followed by
# one "<column>\t<dtype>" line per column.
with open('data_types.txt', 'w') as report:
    report.write("Column Name\tData Type\n")
    report.writelines(f"{name}\t{dtype}\n" for name, dtype in types_data.items())

In [25]:
# Summary statistics produced by DataFrame.describe() for the frame.
types_desc = df.describe()

# Write the statistics themselves, transposed so each described column is one
# row and each statistic is one tab-separated field. The previous version
# looped over `types_data` here, so the file was a second copy of the dtype
# report and `types_desc` was never written.
types_desc.T.to_csv('data_description.txt', sep='\t', index_label='Column Name')

In [26]:
# Names of the feature columns stored with object dtype.
object_columns = X.select_dtypes(include='object').columns

# Replace each object column with indicator columns; the remaining feature
# columns are carried over unchanged.
df_encoded = pd.get_dummies(X, columns=object_columns)
types_data2 = df_encoded.dtypes

# Persist the post-encoding dtypes as a tab-separated report: a header row
# followed by one "<column>\t<dtype>" line per column.
with open('data_types2.txt', 'w') as report:
    report.write("Column Name\tData Type\n")
    report.writelines(f"{name}\t{dtype}\n" for name, dtype in types_data2.items())

In [29]:
# Make sure the output directory exists; exist_ok avoids the separate
# check-then-create step.
os.makedirs('box_plots', exist_ok=True)

# Save one box plot per numeric column of the encoded frame.
for column in df_encoded.select_dtypes(include='number').columns:
    # Explicit figure/axes handles so the save and close below act on exactly
    # this figure rather than on pyplot's implicit "current" figure.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(x=df_encoded[column], ax=ax)
    ax.set_title(f'Box Plot of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Values')

    fig.savefig(f'box_plots/{column}_boxplot.png')

    # Release the figure; the loop creates one per column.
    plt.close(fig)
    

In [39]:
# Mean-impute every column of df_encoded that contains missing values, then
# attach the target column and save the result to imputed_data.csv.
# SimpleImputer is imported in the notebook's import cell.
imputer = SimpleImputer(strategy='mean')

# Columns of the encoded frame with at least one missing value.
columns_with_missing_values = df_encoded.columns[df_encoded.isnull().any()].tolist()

if not columns_with_missing_values:
    print("No columns with missing values found.")
else:
    # NOTE(review): with its default settings SimpleImputer is documented to
    # drop columns that have no observed values at all, which would make the
    # DataFrame construction below fail on a column-count mismatch. Confirm no
    # column in df_encoded is entirely empty.
    data_to_impute = df_encoded[columns_with_missing_values].values

    # Replace each missing entry with its column mean.
    imputed_data = imputer.fit_transform(data_to_impute)

    # Wrap the result with the original labels and index so the assignment
    # below lines up row-for-row with df_encoded.
    df_imputed = pd.DataFrame(
        imputed_data,
        columns=columns_with_missing_values,
        index=df_encoded.index,
    )
    df_encoded[columns_with_missing_values] = df_imputed

# df_encoded is built from X, which has the target dropped, so the target must
# come from df. The previous else-branch read df_encoded['koi_disposition'],
# which raised KeyError before the file was written, and attached the target
# to df_imputed while saving df_encoded.
df_encoded['koi_disposition'] = df['koi_disposition']

# Save once, after both branches, so the file always contains features + target.
df_encoded.to_csv('imputed_data.csv', index=False)


No columns with missing values found.


In [38]:
# Number of missing entries in each column of the encoded frame.
null_counts_en = df_encoded.isna().sum()

# Persist the counts as a tab-separated report: a header row followed by
# one "<column>\t<count>" line per column.
with open('null_values_encoded.txt', 'w') as report:
    report.write("Column Name\tNull Count\n")
    report.writelines(
        f"{name}\t{n_missing}\n" for name, n_missing in null_counts_en.items()
    )