In [3]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [4]:
# Read in data from NASA exoplenet archive 
df = pd.read_csv(r"C:\Users\julia\Downloads\csv_main.csv", skiprows=78)

In [5]:
def multiple_linear_regression_analysis(df, predictor_vars, response_var, x_labels, y_label, log=False):
    """
    Performs multiple linear regression analysis between multiple predictor variables and a response variable 
    in a DataFrame with custom axis labels.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    predictor_vars: List of column names to be used as predictor variables.
    response_var: The name of the column to be used as the response variable.
    x_labels: Labels for the x-axes (for each predictor variable).
    y_label: Label for the y-axis (response variable).
    log : Whether to use logarithmic scale for the plot.
    """
    if len(predictor_vars) != len(x_labels):
        raise ValueError("The number of predictor variables and x-labels must be the same.")

    # Filtering out rows with NaN values in the specified columns
    df_filtered = df.dropna(subset=predictor_vars + [response_var])
    X = df_filtered[predictor_vars]
    y = df_filtered[response_var]  
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Creating a multiple linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Displaying the results
    print("Multiple Linear Regression Analysis")
    print("Mean Squared Error:", mse)
    print("R-squared:", r2)
  
    
def polynomial_regression_analysis(df, predictor_vars, response_var, degree, x_labels, y_label, log=False):
    if len(predictor_vars) != len(x_labels):
        raise ValueError("Number of predictors and x-labels must be the same.")

    # Filtering out rows with NaN values
    df_filtered = df.dropna(subset=predictor_vars + [response_var])
    X = df_filtered[predictor_vars] 
    y = df_filtered[response_var] 
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    poly = PolynomialFeatures(degree)
    X_poly = poly.fit_transform(X_train)
    model_poly = LinearRegression()
    model_poly.fit(X_poly, y_train)
    X_test_poly = poly.transform(X_test)
    y_pred_poly = model_poly.predict(X_test_poly)
    mse_poly = mean_squared_error(y_test, y_pred_poly)
    r2_poly = r2_score(y_test, y_pred_poly)
    
    # Print results
    print(f"Polynomial Regression (Degree = {degree}):")
    print("Mean Squared Error:", mse_poly)
    print("R-squared:", r2_poly)
             
        
def random_forest_regression_analysis(df, predictor_vars, response_var, x_labels, y_label, n_estimators=100, show_feature_importance=False, log=False):
    if len(predictor_vars) != len(x_labels):
        raise ValueError("Number of predictors and x-labels must be the same.")

    # Filtering out rows with NaN values
    df_filtered = df.dropna(subset=predictor_vars + [response_var])
    X = df_filtered[predictor_vars]  
    y = df_filtered[response_var]  
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    rf_model = RandomForestRegressor(n_estimators=n_estimators, random_state=0)
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)
    mse_rf = mean_squared_error(y_test, y_pred_rf)
    r2_rf = r2_score(y_test, y_pred_rf)

    # Displaying the results
    print("Random Forest Regression:")
    print("Mean Squared Error:", mse_rf)
    print("R-squared:", r2_rf)

    # Feature importance can be shown
    if show_feature_importance:
        feature_importances = rf_model.feature_importances_
        plt.bar(x_labels, feature_importances)
        plt.title('Feature Importances')
        plt.show()
        
def gradient_boosting_regression_analysis(df, predictor_vars, response_var, x_labels, y_label, n_estimators=100, learning_rate=0.1, log=False):
    if len(predictor_vars) != len(x_labels):
        raise ValueError("Number of predictors and x-labels must be the same.")

    # Filtering out rows with NaN values
    df_filtered = df.dropna(subset=predictor_vars + [response_var])
    X = df_filtered[predictor_vars]  
    y = df_filtered[response_var]  
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Creating a gradient boosting model
    gb_model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate, random_state=0)
    gb_model.fit(X_train, y_train)
    y_pred_gb = gb_model.predict(X_test)
    mse_gb = mean_squared_error(y_test, y_pred_gb)
    r2_gb = r2_score(y_test, y_pred_gb)

    # Displaying the results
    print("Gradient Boosting Regression:")
    print("Mean Squared Error:", mse_gb)
    print("R-squared:", r2_gb)
    
def neural_network_regression(df, predictor_vars, response_var, x_labels, y_label, log=False):
    if len(predictor_vars) != len(x_labels):
        raise ValueError("Number of predictors and x-labels must be the same.")

    # Filtering out rows with NaN values
    df_filtered = df.dropna(subset=predictor_vars + [response_var])
    X = df_filtered[predictor_vars]
    y = df_filtered[response_var]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Creating a neural network model
    nn_model = MLPRegressor(hidden_layer_sizes=(100,), activation='relu', solver='adam', max_iter=10000, random_state=0)
    nn_model.fit(X_train, y_train)
    y_pred_nn = nn_model.predict(X_test)
    mse_nn = mean_squared_error(y_test, y_pred_nn)
    r2_nn = r2_score(y_test, y_pred_nn)

    # Displaying the results
    print("Neural Network Regression:")
    print("Mean Squared Error:", mse_nn)
    print("R-squared:", r2_nn)

In [30]:
# Predictor Variables: Radius, Semi-Major Axis
# Response Variable: Mass

multiple_linear_regression_analysis(df, ['pl_rade', 'pl_orbsmax'], 'pl_bmasse', ['Planet Radius', 'Semi-Major Axis'], 'Planet Mass')
polynomial_regression_analysis(df, ['pl_rade', 'pl_orbsmax'], 'pl_bmasse', 2, ['Planet Radius', 'Semi-Major Axis'], 'Planet Mass')
random_forest_regression_analysis(df, ['pl_rade', 'pl_orbsmax'], 'pl_bmasse', ['Planet Radius', 'Semi-Major Axis'], 'Planet Mass', n_estimators=100, show_feature_importance=False, log=False)  
neural_network_regression(df, ['pl_rade', 'pl_orbsmax'], 'pl_bmasse', ['Planet Radius', 'Semi-Major Axis'], 'Planet Mass', log=False)
gradient_boosting_regression_analysis(df, ['pl_rade', 'pl_orbsmax'], 'pl_bmasse', ['Planet Radius', 'Semi-Major Axis'], 'Planet Mass', n_estimators=100, learning_rate=0.1, log=False)

Multiple Linear Regression Analysis
Mean Squared Error: 1761567.5182650192
R-squared: -1.2780374619588435
Polynomial Regression (Degree = 2):
Mean Squared Error: 93728155.88291742
R-squared: -120.20809910930791
Random Forest Regression:
Mean Squared Error: 712905.2909538792
R-squared: 0.07807964055722372
Neural Network Regression:
Mean Squared Error: 785858.5270230457
R-squared: -0.016262587608047108
Gradient Boosting Regression:
Mean Squared Error: 697842.8492984007
R-squared: 0.09755820496305723


In [6]:
# Predictor Variables: Radius, Mass, Temperature, 
# Response Variable: Eccentricity

multiple_linear_regression_analysis(df, ['pl_rade', 'pl_bmasse', 'pl_eqt'], 'pl_orbeccen', ['Planet Radius', 'Planet Mass', 'Planet Temperature'], 'Eccentricity')
polynomial_regression_analysis(df, ['pl_rade', 'pl_bmasse', 'pl_eqt'], 'pl_orbeccen', 2, ['Planet Radius', 'Planet Mass', 'Planet Temperature'], 'Eccentricity')
random_forest_regression_analysis(df, ['pl_rade', 'pl_bmasse', 'pl_eqt'], 'pl_orbeccen', ['Planet Radius', 'Planet Mass', 'Planet Temperature'], 'Eccentricity', n_estimators=100, show_feature_importance=False, log=False)  
neural_network_regression(df, ['pl_rade', 'pl_bmasse', 'pl_eqt'], 'pl_orbeccen', ['Planet Radius', 'Planet Mass', 'Planet Temperature'], 'Eccentricity', log=False)
gradient_boosting_regression_analysis(df, ['pl_rade', 'pl_bmasse', 'pl_eqt'], 'pl_orbeccen', ['Planet Radius', 'Planet Mass', 'Planet Temperature'], 'Eccentricity', n_estimators=100, learning_rate=0.1, log=False)

Multiple Linear Regression Analysis
Mean Squared Error: 0.018362028864829374
R-squared: 0.1403462386933113
Polynomial Regression (Degree = 2):
Mean Squared Error: 0.018517262826907042
R-squared: 0.1330786616534878
Random Forest Regression:
Mean Squared Error: 0.018118664151192124
R-squared: 0.1517398266779324
Neural Network Regression:
Mean Squared Error: 0.4162216476799651
R-squared: -18.486218412971343
Gradient Boosting Regression:
Mean Squared Error: 0.018591379710852386
R-squared: 0.12960873692300323


In [11]:
# Predictor Variables: Planet Radius, Semi-Major Axis, Planet Mass,Stellar Metallicity, Stellar Mass
# Response Variable: Planet Temperature
'''
# High Correlation
'''

multiple_linear_regression_analysis(df, ['pl_rade', 'pl_orbsmax', 'pl_bmasse', 'st_met', 'st_mass'], 'pl_eqt', ['Planet Radius', 'Semi-Major Axis', 'Planet Mass', 'Stellar Metallicity', 'Stellar Mass'], 'Planet Temperature')
polynomial_regression_analysis(df, ['pl_rade', 'pl_bmasse', 'pl_eqt'], 'pl_orbeccen', 2, ['Planet Radius', 'Planet Mass', 'Planet Temperature'], 'Eccentricity')
random_forest_regression_analysis(df, ['pl_rade', 'pl_orbsmax', 'pl_bmasse', 'st_met', 'st_mass'], 'pl_eqt', ['Planet Radius', 'Semi-Major Axis', 'Planet Mass', 'Stellar Metallicity', 'Stellar Mass'], 'Planet Temperature', n_estimators=100, show_feature_importance=False, log=False)  
neural_network_regression(df, ['pl_rade', 'pl_orbsmax', 'pl_bmasse', 'st_met', 'st_mass'], 'pl_eqt', ['Planet Radius', 'Semi-Major Axis', 'Planet Mass', 'Stellar Metallicity', 'Stellar Mass'], 'Planet Temperature', log=False)
gradient_boosting_regression_analysis(df, ['pl_rade', 'pl_orbsmax', 'pl_bmasse', 'st_met', 'st_mass'], 'pl_eqt', ['Planet Radius', 'Semi-Major Axis', 'Planet Mass', 'Stellar Metallicity', 'Stellar Mass'], 'Planet Temperature', n_estimators=100, learning_rate=0.1, log=False)

Multiple Linear Regression Analysis
Mean Squared Error: 124797.79400127879
R-squared: 0.5354804103411963
Polynomial Regression (Degree = 2):
Mean Squared Error: 0.018517262826907042
R-squared: 0.1330786616534878
Random Forest Regression:
Mean Squared Error: 17273.117504977377
R-squared: 0.935706383916865
Neural Network Regression:
Mean Squared Error: 451726.2074242133
R-squared: -0.6814053019933435
Gradient Boosting Regression:
Mean Squared Error: 14605.996448430697
R-squared: 0.9456338829457719


In [8]:
# Predictor Variables: Radius, Semi-Major Axis, Planet Temperature, Eccentricity  
# Response Variable: Mass

multiple_linear_regression_analysis(df, ['pl_rade', 'pl_orbsmax', 'pl_eqt', 'pl_orbeccen'], 'pl_bmasse', ['Planet Radius', 'Semi-Major Axis', 'Planet Temperature', 'Eccentricity'], 'Planet Mass')
polynomial_regression_analysis(df, ['pl_rade', 'pl_orbsmax', 'pl_eqt', 'pl_orbeccen'], 'pl_bmasse', 2, ['Planet Radius', 'Semi-Major Axis', 'Planet Temperature', 'Eccentricity'], 'Planet Mass')
random_forest_regression_analysis(df, ['pl_rade', 'pl_orbsmax', 'pl_eqt', 'pl_orbeccen'], 'pl_bmasse', ['Planet Radius', 'Semi-Major Axis', 'Planet Temperature', 'Eccentricity'], 'Planet Mass', n_estimators=100, show_feature_importance=False, log=False)  
neural_network_regression(df, ['pl_rade', 'pl_orbsmax', 'pl_eqt', 'pl_orbeccen'], 'pl_bmasse', ['Planet Radius', 'Semi-Major Axis', 'Planet Temperature', 'Eccentricity'], 'Planet Mass', log=False)
gradient_boosting_regression_analysis(df, ['pl_rade', 'pl_orbsmax', 'pl_eqt', 'pl_orbeccen'], 'pl_bmasse', ['Planet Radius', 'Semi-Major Axis', 'Planet Temperature', 'Eccentricity'], 'Planet Mass', n_estimators=100, learning_rate=0.1, log=False)

Multiple Linear Regression Analysis
Mean Squared Error: 865395.3113869954
R-squared: 0.1128396086327208
Polynomial Regression (Degree = 2):
Mean Squared Error: 761523.7120068332
R-squared: 0.21932362529599247
Random Forest Regression:
Mean Squared Error: 614088.0958580723
R-squared: 0.3704673132764311
Neural Network Regression:
Mean Squared Error: 882741.6089391984
R-squared: 0.09505704392189351
Gradient Boosting Regression:
Mean Squared Error: 855716.59451248
R-squared: 0.12276174957495256
